In [102]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [103]:
# Getting the data
# Download the UCI "Bank Marketing" archive and unpack it. The outer archive
# contains a nested bank.zip, which is unpacked into the same folder.
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
dataset_path = '../datasets/'
dataset_file = 'bank+marketing.zip'
dataset_full_path = os.path.join(dataset_path, dataset_file)
extract_path = os.path.join(dataset_path, 'bank+marketing')

# The target folder may not exist yet on a fresh checkout.
os.makedirs(dataset_path, exist_ok=True)

# stream=True makes iter_content fetch the body chunk by chunk instead of
# loading it into memory first; raise_for_status prevents an HTTP error page
# from being written to disk as if it were the zip archive.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(dataset_full_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            file.write(chunk)

with zipfile.ZipFile(dataset_full_path) as zip_ref:
    zip_ref.extractall(extract_path)

with zipfile.ZipFile(os.path.join(extract_path, 'bank.zip')) as zip_ref:
    zip_ref.extractall(extract_path)

In [104]:
# Load the full extract; the file uses ';' as its field separator.
bank_csv_path = os.path.join(dataset_path, 'bank+marketing', 'bank-full.csv')
bank = pd.read_csv(bank_csv_path, sep=';')

In [105]:
# Columns kept for this exercise (features plus the target `y`).
selected_columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]
# Explicit copy: `df` is modified later, and writing into a column selection of
# `bank` can trigger SettingWithCopyWarning.
df = bank[selected_columns].copy()

In [106]:
# Question 1
# What is the most frequent observation (mode) for the column education?
#
# unknown
# primary
# secondary
# tertiary
#
# value_counts() lists categories from most to least frequent, so the first
# row is the mode. Answer from the recorded output: secondary (23202 rows).
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [107]:
# Display the selected frame (the rendering is truncated for long frames).
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [108]:
# Question 2
# Create the correlation matrix for the numerical features of your dataset.
# In a correlation matrix, you compute the correlation coefficient between
# every pair of features.
#
# What are the two features that have the biggest correlation?
#
# age and balance
# day and campaign
# day and pdays
# pdays and previous

# Column groups reused by later cells.
numerical = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]
categorical = [
    'job', 'marital', 'education', 'housing',
    'contact', 'month', 'poutcome',
]

# Pairwise correlation between the numerical columns only.
numeric_part = df[numerical]
correlation_matrix = numeric_part.corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [109]:
# Print the coefficient for each candidate pair from Question 2, in the order
# the options are listed. From the recorded output, pdays/previous is largest.
candidate_pairs = [
    ('age', 'balance'),
    ('day', 'campaign'),
    ('day', 'pdays'),
    ('pdays', 'previous'),
]
for first_feature, second_feature in candidate_pairs:
    print(correlation_matrix.loc[first_feature, second_feature])


0.09778273937134807
0.16249021632619218
-0.0930440737729405
0.4548196354805043


In [110]:
# Target encoding
# Now we want to encode the y variable.
# Let's replace the values yes/no with 1/0.
#
# Build a new frame rather than assigning through .loc: `df` is a column
# selection of `bank`, so writing into it can raise SettingWithCopyWarning and
# can leave `y` stored as object dtype.
# Mapping from the raw `bank['y']` (same index as `df`) keeps the cell
# idempotent: re-running it does not turn already-encoded 1/0 values into NaN.
df = df.assign(y=bank['y'].map({'yes': 1, 'no': 0}))

In [111]:
# Split the data
# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value y is not in your dataframe.
#
# Total row count before splitting.
len(df)

45211

In [112]:
# Two-step split: first hold out the test set, then carve the validation set
# out of the remainder (about a quarter of the remaining ~80% is ~20% overall).
# NOTE(review): the fractions sit just below 0.2 / 0.25 — presumably so the
# rounded test and validation sizes come out equal (9042 each in the recorded
# output). Confirm before changing them: the split sizes, and therefore every
# number below, depend on these exact values.
df_full_train, df_test = train_test_split(df, test_size=0.19999, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.24999, random_state=42)
# Displayed in the order (train, test, val).
len(df_train), len(df_test), len(df_val)

(27127, 9042, 9042)

In [113]:
# Renumber each split from 0, discarding the shuffled original row labels.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [114]:
# Move the target out of each feature frame: pop() returns the `y` column and
# removes it from the frame in one step, so `y` cannot be used as a feature.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

In [115]:
# Renumber the combined train+validation frame from 0. Unlike df_train and
# df_val, this frame still carries the `y` column, which later cells read.
df_full_train = df_full_train.reset_index(drop=True)

In [116]:
# Question 3
# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?
#
# contact
# education
# housing
# poutcome
#
# NOTE(review): the scores in the following cells are computed on
# df_full_train (train + validation rows), not on df_train alone — confirm
# this matches the "training set only" instruction.
# Answer from the rounded scores printed further down: poutcome (0.03).
#
# Preview of the categorical columns being scored.
df_full_train[categorical]

Unnamed: 0,job,marital,education,housing,contact,month,poutcome
0,technician,married,secondary,no,unknown,jun,unknown
1,blue-collar,married,primary,yes,unknown,may,unknown
2,technician,married,primary,yes,cellular,jul,unknown
3,admin.,married,secondary,no,cellular,jul,unknown
4,management,single,tertiary,yes,unknown,jun,unknown
...,...,...,...,...,...,...,...
36164,housemaid,single,primary,no,unknown,jun,unknown
36165,student,single,tertiary,no,cellular,sep,failure
36166,technician,divorced,tertiary,yes,cellular,may,unknown
36167,retired,married,secondary,no,unknown,may,unknown


In [117]:
# Mutual information between the target and a single categorical column ('job').
mutual_info_score(df_full_train.y, df_full_train['job'])

np.float64(0.007765171743301404)

In [118]:
# Score every categorical column against the target and print it rounded to
# two decimals, one line per column.
for feature in categorical:
    score = mutual_info_score(df_full_train.y, df_full_train[feature])
    print(f'{feature}: {round(score, 2)}')

job: 0.01
marital: 0.0
education: 0.0
housing: 0.01
contact: 0.01
month: 0.02
poutcome: 0.03


In [119]:
# Question 4
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?
# 
# 0.6
# 0.7
# 0.8
# 0.9

In [120]:
# Turn the feature rows into a dense numeric matrix via DictVectorizer.
# The vectorizer is fitted on the training rows only; validation rows are
# transformed with the already-fitted vocabulary.
model_features = categorical + numerical
dv = DictVectorizer(sparse=False)

train_dicts = df_train[model_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[model_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

X_train

array([[ 3.600e+01,  7.220e+02,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 3.600e+01, -5.470e+02,  1.000e+00, ...,  0.000e+00,  0.000e+00,
         2.000e+00],
       [ 2.900e+01,  3.400e+02,  6.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       ...,
       [ 4.000e+01,  1.498e+03,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 5.000e+01,  1.260e+03,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 3.700e+01,  4.150e+02,  4.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00]])

In [121]:
# Cast the target to int before fitting.
# NOTE(review): presumably needed because `y` is still object dtype after the
# encoding step — confirm.
y_train = y_train.astype('int')
# Parameters are the ones prescribed by the exercise text (Question 4) for
# reproducibility across scikit-learn versions.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [122]:
# Second column of predict_proba, taken here as the probability of class 1.
y_pred = model.predict_proba(X_val)[:, 1]
# Hard prediction: positive when the probability reaches 0.5.
decision = (y_pred >= 0.5)

In [123]:
# Side-by-side view of the predicted probability, the thresholded prediction
# and the true label for every validation row.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': decision.astype(int),
    'actual': y_val,
})
df_pred

Unnamed: 0,probability,prediction,actual
0,0.013610,0,0
1,0.057271,0,0
2,0.135607,0,1
3,0.007369,0,0
4,0.008754,0,0
...,...,...,...
9037,0.161242,0,1
9038,0.005796,0,0
9039,0.018819,0,0
9040,0.031830,0,0


In [124]:
# Flag the rows where prediction and label agree; the mean of that boolean
# column is the validation accuracy (rounded to 2 decimals for Question 4).
is_correct = df_pred['prediction'] == df_pred['actual']
df_pred['correct'] = is_correct
is_correct.mean().round(decimals=2)

np.float64(0.9)

In [125]:
# Same accuracy computed directly from the arrays, without the helper frame.
(y_val == decision).mean().round(decimals=2)

np.float64(0.9)

In [126]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of the following features has the smallest difference?
# 
# age
# balance
# marital
# previous
# Note: The difference doesn't have to be positive.

In [127]:
# Full feature list for Question 5: categorical columns first, then numerical.
all_features = categorical + numerical
all_features

['job',
 'marital',
 'education',
 'housing',
 'contact',
 'month',
 'poutcome',
 'age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous']

In [128]:
# One feature list per candidate from Question 5, each with exactly that
# feature removed. Compare names for equality: a substring test such as
# `'age' not in x` would also drop any other feature whose name merely
# contains the excluded name.
all_without_age = [feature for feature in all_features if feature != 'age']
all_without_balance = [feature for feature in all_features if feature != 'balance']
all_without_marital = [feature for feature in all_features if feature != 'marital']
all_without_previous = [feature for feature in all_features if feature != 'previous']

In [129]:
def logistic_regression_accuracy(features, y_target, df_train, df_val, C_val, y_val_target=None):
    """Train a logistic regression on `features` and return validation accuracy.

    Parameters
    ----------
    features : list of str
        Column names taken from both `df_train` and `df_val`.
    y_target : array-like
        Training labels, aligned row by row with `df_train`; cast to int.
    df_train, df_val : pandas.DataFrame
        Training and validation feature frames.
    C_val : float
        Value passed as `C` to `LogisticRegression`.
    y_val_target : array-like, optional
        Validation labels, aligned row by row with `df_val`. When omitted, the
        notebook-level `y_val` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    float
        Share of validation rows whose thresholded prediction
        (probability >= 0.5) equals the label.
    """
    if y_val_target is None:
        # Backward-compatible fallback for existing five-argument calls.
        y_val_target = y_val

    # Fit the encoder on the training rows only, then reuse it for validation.
    vectorizer = DictVectorizer(sparse=False)
    X_train = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))
    X_val = vectorizer.transform(df_val[features].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=C_val, max_iter=1000, random_state=42)
    model.fit(X_train, np.asarray(y_target).astype('int'))

    positive_probability = model.predict_proba(X_val)[:, 1]
    decision = positive_probability >= 0.5
    # Cast labels to int so the comparison does not depend on the label dtype.
    return (np.asarray(y_val_target).astype('int') == decision).mean()

In [130]:
# Validation accuracy with all features, then with each candidate feature
# left out; C stays at 1.0 as in Question 4.
(
    accuracy_original,
    accuracy_without_age,
    accuracy_without_balance,
    accuracy_without_marital,
    accuracy_without_previous,
) = [
    logistic_regression_accuracy(feature_set, y_train, df_train, df_val, 1.0)
    for feature_set in (
        all_features,
        all_without_age,
        all_without_balance,
        all_without_marital,
        all_without_previous,
    )
]

In [131]:
# Accuracy change from dropping each feature, printed in the order
# age, balance, marital, previous. In the recorded output, balance has the
# smallest difference (it is the only negative one).
for accuracy_reduced in (
    accuracy_without_age,
    accuracy_without_balance,
    accuracy_without_marital,
    accuracy_without_previous,
):
    print(accuracy_original - accuracy_reduced)

0.0009953550099535136
-0.00033178500331787486
0.0008847600088476293
0.00044238000442375913


In [132]:
# Question 6. Smallest `C` that leads to the best accuracy on the validation set
# 0.01
# 0.1
# 1
# 10
# 100

# Same model and features each time; only the regularization parameter C varies.
accuracy_001, accuracy_01, accuracy_1, accuracy_10, accuracy_100 = [
    logistic_regression_accuracy(all_features, y_train, df_train, df_val, c_value)
    for c_value in (0.01, 0.1, 1.0, 10.0, 100.0)
]

In [133]:
# Validation accuracy for C = 0.01, 0.1, 1, 10, 100 (in that order).
# In the recorded output the highest unrounded value belongs to C = 10.
for sweep_accuracy in (accuracy_001, accuracy_01, accuracy_1, accuracy_10, accuracy_100):
    print(sweep_accuracy)

0.8974784339747843
0.8989161689891617
0.8993585489935855
0.8995797389957974
0.8990267639902676
