In [207]:
import pandas as pd
import requests
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [208]:
# Getting the data 1
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
dataset_path = '../datasets/'
dataset_file = 'bank+marketing.zip'
dataset_full_path = os.path.join(dataset_path, dataset_file)
response = requests.get(url)
with open(f'{dataset_path}bank+marketing.zip', 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        file.write(chunk)

with zipfile.ZipFile(f'{dataset_path}bank+marketing.zip') as zip_ref:
    zip_ref.extractall(f'{dataset_path}/bank+marketing')

with zipfile.ZipFile(f'{dataset_path}/bank+marketing/bank.zip') as zip_ref:
    zip_ref.extractall(f'{dataset_path}/bank+marketing')

In [209]:
bank = pd.read_csv(f'{dataset_path}/bank+marketing/bank-full.csv', sep=';')

In [210]:
df = bank[
    ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign',
     'pdays', 'previous', 'poutcome', 'y']]

In [211]:
df.loc[:, 'y'] = df['y'].map({'yes': 1, 'no': 0})

In [212]:
# Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution. Use train_test_split function for that with random_state=1
df_test, df_train = train_test_split(df, test_size=0.6, random_state=1)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=1)
len(df), len(df_train), len(df_test), len(df_val)

(45211, 27127, 9042, 9042)

In [213]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [214]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [215]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [216]:
# Question 1: ROC AUC feature importance
# ROC AUC could also be used to evaluate feature importance of numerical variables.
# 
# Let's do that
# 
# For each numerical variable, use it as score (aka prediction) and compute the AUC with the y variable as ground truth.
# Use the training dataset for that
# If your AUC is < 0.5, invert this variable by putting "-" in front
# 
# (e.g. -df_train['engine_hp'])
# 
# AUC can go below 0.5 if the variable is negatively correlated with the target variable. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.
# 
# Which numerical variable (among the following 4) has the highest AUC?
# 
# balance
# day
# duration
# previous


In [243]:
def train_logistic_regression(features, df_train, y_train, C=1.0):
    dv = DictVectorizer(sparse=False)
    dict = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(dict)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    return dv, model

In [244]:
def predict_logistic_regression(features, df, dv, model):
    dict = df[features].to_dict(orient='records')
    X_val = dv.transform(dict)
    y_pred = model.predict_proba(X_val)[:, 1]
    return y_pred

In [245]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')

In [246]:
all_features = categorical + numerical
all_without_balance = list(filter(lambda x: 'balance' not in x, all_features))
all_without_day = list(filter(lambda x: 'day' not in x, all_features))
all_without_duration = list(filter(lambda x: 'duration' not in x, all_features))
all_without_previous = list(filter(lambda x: 'previous' not in x, all_features))

In [251]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
features = all_without_balance

dv, model = train_logistic_regression(features, df_train, y_train, C=0.001)
y_pred = predict_logistic_regression(features, df_val, dv, model)
roc_auc_score(y_val, y_pred)

np.float64(0.8705406780924718)

In [252]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
features = all_without_day

dv, model = train_logistic_regression(features, df_train, y_train, C=0.001)
y_pred = predict_logistic_regression(features, df_val, dv, model)
roc_auc_score(y_val, y_pred)

np.float64(0.8723618961654769)

In [249]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
features = all_without_duration

dv, model = train_logistic_regression(features, df_train, y_train, C=0.001)
y_pred = predict_logistic_regression(features, df_val, dv, model)
roc_auc_score(y_val, y_pred)

np.float64(0.7268139273176828)

In [250]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
features = all_without_previous

dv, model = train_logistic_regression(features, df_train, y_train, C=0.001)
y_pred = predict_logistic_regression(features, df_val, dv, model)
roc_auc_score(y_val, y_pred)

np.float64(0.8704997370045363)

In [ ]:
# Question 2: Training the model
# Apply one-hot-encoding using DictVectorizer and train the logistic regression with these parameters:
# 
# LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
# What's the AUC of this model on the validation dataset? (round to 3 digits)
# 
# 0.69
# 0.79
# 0.89
# 0.99

In [254]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
features = all_features

dv, model = train_logistic_regression(features, df_train, y_train)
y_pred = predict_logistic_regression(features, df_val, dv, model)
roc_auc_score(y_val, y_pred).round(3)

np.float64(0.913)

In [238]:
# Question 3: Precision and Recall
# Now let's compute precision and recall for our model.
# 
# Evaluate the model on all thresholds from 0.0 to 1.0 with step 0.01
# For each threshold, compute precision and recall
# Plot them
# At which threshold precision and recall curves intersect?
# 
# 0.265
# 0.465
# 0.665
# 0.865

In [219]:
y_pred = model.predict_proba(X_val)[:, 1]
decision = (y_pred >= 0.5).astype(int)
(y_val == decision).mean()

np.float64(0.8927228489272285)

In [220]:
accuracy_score(y_val, decision)

0.8927228489272285

In [221]:
t = 0.5
predict_positive = (y_pred >= t)
predict_negative = (y_pred < t)

actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

tp = (predict_positive & actual_positive).sum()
tn = (predict_negative & actual_negative).sum()

fp = (predict_positive & actual_negative).sum()
fn = (predict_negative & actual_positive).sum()

In [222]:
tpr = tp / (tp + fn)
tpr

np.float64(0.17002881844380405)

In [223]:
fpr = fp / (fp + tn)
fpr

np.float64(0.013248343957005373)

In [224]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)

In [225]:
auc(fpr, tpr)

np.float64(0.8557011545506858)

In [226]:
roc_auc_score(y_val, y_pred)

np.float64(0.8557011545506858)