In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder



In [8]:
df = pd.read_csv('datasets/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [None]:
target = df['y'] == 'yes'
target = target.astype(int)
target 

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [16]:
#df = df.drop('y', axis=1)
df.head()
df2 = df.drop('duration', axis=1)
df2

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,3,-1,0,unknown
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,2,-1,0,unknown
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,4,-1,0,unknown


job            288
marital          0
education     1857
default          0
housing          0
loan             0
contact      13020
month            0
poutcome     36959
dtype: int64

In [None]:
df2.isnull().sum() #nao temos dados nulos

AttributeError: 'DataFrame' object has no attribute 'isnan'

In [23]:
Xtr, Xte, ytr, yte = train_test_split(df2, target, test_size=0.2, random_state=42, stratify=target)

In [26]:
yte.value_counts(normalize=True) #treino e teste proporcional

y
0    0.883003
1    0.116997
Name: proportion, dtype: float64

# Baseline 
## Pré processamento com OHE e baseline de Dummy e Reg Log

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.dummy import DummyClassifier

num_cols = df2.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df2.select_dtypes(include=['object']).columns


## Preprocessamento
preproc = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_cols), 
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

## Método de cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {"roc_auc": "roc_auc", "avg_prec": "average_precision"}

def cv_report(model, X, y):
    res = cross_validate(model, X, y, cv=cv, scoring = scores)
    return {k: (res[f"test_{k}"].mean(), res[f"test_{k}"].std()) for k in scores}

In [76]:
dic = { 'nome' : 'Brendon'}
dic['nome']

'Brendon'

In [86]:
cross_validate(dummy, Xtr, ytr, cv=cv, scoring = scores)


{'fit_time': array([0.13733339, 0.12679672, 0.11978531, 0.11606526, 0.11452055]),
 'score_time': array([0.03243876, 0.03049827, 0.03032207, 0.02943683, 0.02889514]),
 'test_roc_auc': array([0.5, 0.5, 0.5, 0.5, 0.5]),
 'test_avg_prec': array([0.11694775, 0.11694775, 0.11708598, 0.11696392, 0.11696392])}

In [48]:
dummy = Pipeline([
    ('preproc', preproc),
    ('model', DummyClassifier(strategy='most_frequent'))
])

logreg = Pipeline([
    ('preproc', preproc),
    ('model', LogisticRegression(random_state=42, max_iter = 2000, solver = 'liblinear', class_weight='balanced'))
])

In [94]:
print("Dummy:", cv_report(dummy, Xtr, ytr))
print("LogReg:", cv_report(logreg, Xtr, ytr))

Dummy: {'roc_auc': (np.float64(0.5), np.float64(0.0)), 'avg_prec': (np.float64(0.11698186142744749), np.float64(5.256047063156758e-05))}
LogReg: {'roc_auc': (np.float64(0.7637744430267743), np.float64(0.0031713824009845723)), 'avg_prec': (np.float64(0.39841655794592323), np.float64(0.007598354164028438))}
