In [62]:
import re
import pandas as pd
import numpy as np
from tableone import TableOne
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, roc_auc_score

In [63]:
# Location of the bank marketing campaign CSV in the 4GeeksAcademy tutorial repository.
url = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"

In [64]:
# The separator is passed explicitly because the fields are split on ';' rather than ','.
df = pd.read_csv(url, sep = ";")
# Show the loaded frame as a quick sanity check of the parse.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [65]:
def is_binary(df_, nums):
    """
    Return the columns of ``df_`` whose values are all 0/1 (missing values allowed).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to inspect; it is only read, never modified.
    nums : iterable
        Column labels of ``df_`` to check.

    Returns
    -------
    list
        The labels from ``nums``, in their original order, whose non-missing
        values are all equal to 0 or 1. A column with no non-missing values
        is reported as binary.
    """
    variables = []
    for var in nums:
        # Missing values are dropped up front: NaN never compares equal to
        # anything, so testing membership in a list that contains np.nan is
        # not a reliable way to allow them in numeric columns.
        observed = df_[var].dropna().unique()
        if all(value in (0, 1) for value in observed):
            variables.append(var)
    return variables


def breakdown_vars(df):
    """
    Sort the columns of ``df`` into categorical, binary, non-normal and normal groups.

    Columns reported by ``is_binary`` form the binary group. Among the
    remaining columns, ``object``/``category`` ones are categorical, and
    ``int64``/``float64`` ones are split with a Shapiro-Wilk test: a p-value
    below 0.05 puts the column in the non-normal group, anything else in the
    normal group. Columns of any other dtype end up in no group.

    Returns
    -------
    tuple of list
        ``(categorial, binaries, nonormal, normal)``, each holding column labels.
    """
    binaries = is_binary(df, df.columns)
    categorial, nonormal, normal = [], [], []
    for column in df.columns:
        if column in binaries:
            # Binary columns are reported separately and never tested.
            continue
        dtype = df[column].dtypes
        if dtype.name == "object" or dtype.name == 'category':
            categorial.append(column)
        if dtype == "int64" or dtype == "float64":
            _, p_value = stats.shapiro(df[column])
            target_group = nonormal if p_value < 0.05 else normal
            target_group.append(column)
    return categorial, binaries, nonormal, normal

def Xy(df,target):
    """
    Separate a frame into a feature matrix and an integer target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target column.
    target : label
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)``: every column of ``df`` other than ``target``, and the
        ``target`` column cast to ``int``.
    """
    feature_mask = df.columns != target
    features = df.loc[:, feature_mask]
    labels = df[target].astype('int')
    return features, labels

def grid_lr(X_train, y_train):
    """
    Tune a logistic regression by exhaustive grid search and return the best one.

    The grid covers the penalty type, the regularisation parameter ``C`` and
    several class-weight settings, always with the ``liblinear`` solver.
    Candidates are scored with F1 under repeated stratified k-fold
    cross-validation (15 splits, 3 repeats, fixed seed).

    Parameters
    ----------
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train : training labels; the class-weight dictionaries are keyed by 0 and 1.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    param_grid = {
        'solver': ['liblinear'],
        'penalty': ['l2', 'l1'],
        'C': [1000, 100, 10, 1.0, 0.1, 0.01, 0.001],
        'class_weight': [
            {0: 0.05, 1: 0.95},
            {0: 0.1, 1: 0.9},
            {0: 0.2, 1: 0.8},
            {0: 0.02, 1: 0.98},
            {0: 0.5, 1: 0.5},
        ],
    }
    folds = RepeatedStratifiedKFold(n_splits=15, n_repeats=3, random_state=1)
    search = GridSearchCV(
        estimator=LogisticRegression(random_state=666, max_iter=1000),
        param_grid=param_grid,
        n_jobs=-1,
        cv=folds,
        scoring='f1',
        # Raise instead of recording a placeholder score when a fit fails.
        error_score='raise',
    )
    return search.fit(X_train, y_train).best_estimator_

In [66]:
# Group the raw columns by type for the descriptive table.
# This runs before 'y' is recoded, so 'y' is still a 'yes'/'no' string column
# here and is therefore placed in `categorial`.
categorial, binaries, nonormal, normal = breakdown_vars(df)

In [67]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# Matching on the already-encoded value 1 as well keeps the cell idempotent:
# re-running it once 'y' is numeric no longer resets every label to 0.
df['y'] = ((df['y'] == 'yes') | (df['y'] == 1)).astype(int)

In [68]:
# Display the frame again to confirm that 'y' now holds 0/1 instead of 'no'/'yes'.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1


In [69]:
# Descriptive summary of the columns of df, split by the target 'y', with p-values.
# `categorial` and `nonormal` come from breakdown_vars(df) and tell TableOne
# which columns to treat as categorical and as non-normally distributed.
mytable = TableOne(df,
                   categorical=categorial,
                   nonnormal=nonormal,
                   groupby= 'y',
                   pval=True)


In [70]:
# Render the summary table as text using the "fancy_grid" box-drawing format.
print(mytable.tabulate(tablefmt = "fancy_grid"))

╒════════════════════════════════╤═════════════════════╤═══════════╤════════════════════════╤════════════════════════╤════════════════════════╤═══════════╕
│                                │                     │ Missing   │ Overall                │ 0                      │ 1                      │ P-Value   │
╞════════════════════════════════╪═════════════════════╪═══════════╪════════════════════════╪════════════════════════╪════════════════════════╪═══════════╡
│ n                              │                     │           │ 41188                  │ 36548                  │ 4640                   │           │
├────────────────────────────────┼─────────────────────┼───────────┼────────────────────────┼────────────────────────┼────────────────────────┼───────────┤
│ age, median [Q1,Q3]            │                     │ 0         │ 38.0 [32.0,47.0]       │ 38.0 [32.0,47.0]       │ 37.0 [31.0,50.0]       │ 0.016     │
├────────────────────────────────┼─────────────────────┼────────

In [71]:
# Remove the 'loan' and 'housing' features before modelling.
# NOTE(review): presumably dropped because the summary table showed no
# association with 'y' -- the rationale is not stated here; confirm.
# Rebinding with errors='ignore' (rather than inplace=True) lets the cell be
# re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=['loan', 'housing'], errors='ignore')

In [72]:
def normalize(array):
  """
  Standardise ``array``: subtract its mean, then divide by its standard deviation.

  Both statistics come from ``array``'s own ``mean()`` and ``std()`` methods,
  so the degrees of freedom follow the container type (NumPy arrays default
  to ddof=0, pandas objects to ddof=1).
  """
  centered = array - array.mean()
  return centered / array.std()

In [73]:
# Split the frame into features and target with the Xy helper, which is
# defined once alongside the other helper functions earlier in the notebook.
X , y = Xy(df,'y')

In [89]:
# Hold out 30% of the rows for testing. The split is shuffled with a fixed
# seed so it can be reproduced, and stratified on y so both parts keep the
# same proportion of each class.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    shuffle = True,
                                                    random_state = 123,
                                                    stratify = y)

In [91]:
ls

model.ipynb


In [95]:
# Persist the four splits as CSV files. The output directory is named once so
# it can be changed in a single place; the files written are the same as before.
DATA_DIR = '/workspaces/Logistic-Regression-Project/data'
splits_to_save = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'{DATA_DIR}/{split_name}.csv')

In [75]:
# Mean and standard deviation of every numeric feature, measured on the
# training split only. They are stored so the test split can later be scaled
# with exactly these values.
meanage, stdage = X_train['age'].mean(), X_train['age'].std()
meanduration, stdduration = X_train['duration'].mean(), X_train['duration'].std()
meancampaign, stdcampaign = X_train['campaign'].mean(), X_train['campaign'].std()
meanpdays, stdpdays = X_train['pdays'].mean(), X_train['pdays'].std()
meanprevious, stdprevious = X_train['previous'].mean(), X_train['previous'].std()
meanemp, stdemp = X_train['emp.var.rate'].mean(), X_train['emp.var.rate'].std()
meanconsp, stdconsp = X_train['cons.price.idx'].mean(), X_train['cons.price.idx'].std()
meanconsc, stdconsc = X_train['cons.conf.idx'].mean(), X_train['cons.conf.idx'].std()
meaneur, stdeur = X_train['euribor3m'].mean(), X_train['euribor3m'].std()
meannremp, stdnremp = X_train['nr.employed'].mean(), X_train['nr.employed'].std()

In [76]:
# Standardise every numeric training feature.
# The pandas Series is passed to normalize() directly instead of a NumPy
# array: normalize() uses the container's own std(), and a NumPy array would
# give the population value (ddof=0) while the statistics stored for scaling
# X_test come from pandas' std() (ddof=1). Using the Series keeps both
# splits on exactly the same scale.
num = normal + nonormal
for numerical in num:
  X_train[numerical] = normalize(X_train[numerical])

In [77]:
# Recompute the variable groups on the training features. X_train has no
# target column, so `categorial` now lists only feature columns; the one-hot
# encoding cells further down loop over it.
categorial, binaries, nonormal, normal = breakdown_vars(X_train)

In [78]:
# Replace the 'unknown' level of these features with a column-specific label,
# applying the (column, old label, new label) steps in order.
# NOTE(review): presumably the labels differ per column so the dummy columns
# created later do not share a name -- confirm. The two 'unkn' steps look like
# no-ops unless that value already exists in the data.
unknown_relabels = [
    ('job', 'unknown', 'unkn'),
    ('marital', 'unknown', 'unk'),
    ('marital', 'unkn', 'unk'),
    ('education', 'unknown', 'uk'),
    ('education', 'unkn', 'unk'),
]
for column, old_label, new_label in unknown_relabels:
    X_train[column] = np.where(X_train[column] == old_label, new_label, X_train[column])

In [79]:
# Apply to the test split the same relabelling of 'unknown' used on the
# training split, as ordered (column, old label, new label) steps.
# NOTE(review): the two 'unkn' steps look like no-ops unless that value
# already exists in the data -- confirm.
unknown_relabels = [
    ('job', 'unknown', 'unkn'),
    ('marital', 'unknown', 'unk'),
    ('marital', 'unkn', 'unk'),
    ('education', 'unknown', 'uk'),
    ('education', 'unkn', 'unk'),
]
for column, old_label, new_label in unknown_relabels:
    X_test[column] = np.where(X_test[column] == old_label, new_label, X_test[column])

In [80]:
# Standardise the test features with the mean and standard deviation measured
# on the training split (never the test split's own), so both splits share
# one scale and no test information leaks into preprocessing.
train_stats = {
    'age': (meanage, stdage),
    # 'duration' must be divided by its standard deviation like every other
    # feature; dividing by the mean would put it on a different scale from
    # the one the model was trained on.
    'duration': (meanduration, stdduration),
    'campaign': (meancampaign, stdcampaign),
    'pdays': (meanpdays, stdpdays),
    'previous': (meanprevious, stdprevious),
    'emp.var.rate': (meanemp, stdemp),
    'cons.price.idx': (meanconsp, stdconsp),
    'cons.conf.idx': (meanconsc, stdconsc),
    'euribor3m': (meaneur, stdeur),
    'nr.employed': (meannremp, stdnremp),
}
for column, (train_mean, train_std) in train_stats.items():
    X_test[column] = (X_test[column] - train_mean) / train_std

In [81]:
# One-hot encode each categorical training feature: build 0/1 indicator
# columns (the first level is dropped), attach them to the frame and discard
# the original text column.
for cat in categorial:
    dummies = pd.get_dummies(X_train[cat], drop_first=True, dtype=int)
    X_train = X_train.join(dummies).drop(columns=[cat])

In [82]:
# Remove the indicator column named 'yes' from the training features.
# NOTE(review): presumably this is a rare level that may be missing from the
# test split, which would leave the two frames with different columns --
# confirm which feature it comes from.
X_train.drop(columns=['yes'], inplace=True)

In [83]:
# One-hot encode the categorical test features, then force the result onto
# the training columns.
# Every level is kept here (no drop_first): which level counts as "first"
# depends on the values present in the split being encoded, so dropping it
# independently on the test split could remove a different level than on the
# training split.
for cat in categorial:
    dummies = pd.get_dummies(X_test[cat], dtype = int)
    X_test = X_test.join(dummies)
    X_test.drop(columns = [cat], inplace = True)

# Keep exactly the columns the model is trained on, in the same order:
# indicators absent from X_train are discarded, and indicators that do not
# occur in the test split are added as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

Modelado y optimización

In [84]:
import warnings

# Silence library warnings during the grid search through the standard
# warnings filter rather than by replacing warnings.warn with a no-op, which
# would permanently break the function for every module in the process.
warnings.filterwarnings("ignore")

# Tune and fit the logistic regression on the training split, then evaluate
# it on the held-out test split.
best_model = grid_lr(X_train, y_train)
preds = best_model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94     10965
           1       0.52      0.78      0.62      1392

    accuracy                           0.89     12357
   macro avg       0.74      0.84      0.78     12357
weighted avg       0.92      0.89      0.90     12357



In [86]:
pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [87]:
# Resample the training split with imblearn's RandomUnderSampler (fixed seed).
# Only X_train / y_train are resampled; X_test / y_test are left untouched.
# Note that X_train and y_train are overwritten: from here on they refer to
# the resampled data, not the original training split.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=1234)
X_train, y_train = rus.fit_resample(X_train, y_train)

In [88]:
# Repeat the grid search on the resampled training data and evaluate on the
# same test split as before. `best_model` and `preds` are overwritten, so the
# first model is no longer available after this cell.
best_model = grid_lr(X_train, y_train) 
preds = best_model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86     10965
           1       0.34      0.98      0.50      1392

    accuracy                           0.78     12357
   macro avg       0.67      0.87      0.68     12357
weighted avg       0.92      0.78      0.82     12357



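El primer modelo parece predecir mejor el comportamiento de los datos según el f1-score de la clase positiva. Sin embargo, si queremos ser más restrictivos y priorizar que no se escapen casos positivos, podemos usar el segundo, que tiene mayor sensibilidad (recall), aunque a costa de una menor precisión.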