# 03 Classification: Homework

In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

## Data Import

In [75]:
# Load the raw data; the file is semicolon-separated, hence the explicit `sep`.
df = pd.read_csv('bank-full.csv', sep=';')

In [76]:
# Preview the first rows to check that the columns were parsed correctly.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [77]:
# (rows, columns) of the raw frame.
df.shape

(45211, 17)

## Data Preparation

In [78]:
# Columns kept for the analysis: fourteen features followed by the target 'y'.
# The raw file's 'default' and 'loan' columns are intentionally not listed.
col_list = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

In [79]:
# Keep only the selected columns. The explicit copy makes `df` an independent
# frame, so the later recoding of `y` does not write to a slice of the raw
# data (the situation pandas flags with SettingWithCopyWarning).
df = df[col_list].copy()

In [80]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [81]:
# Most frequent value of the `education` column.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [82]:
# Pairwise correlation matrix of the numeric columns only.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [83]:
# Encode the target as 1 for 'yes' and 0 for everything else.
# The dtype guard keeps the cell idempotent: re-running it on the already
# encoded column would compare integers with 'yes' and zero out every label.
# `assign` builds a new frame instead of setting an attribute on a frame that
# was produced by column selection.
if not pd.api.types.is_integer_dtype(df['y']):
  df = df.assign(y=(df['y'] == 'yes').astype(int))

In [84]:
# Class balance of the encoded target.
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

## Data Preprocessing

In [85]:
# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (i.e. 20% of all rows) for validation. The target column is
# dropped first so the feature frames never contain it.
df_train_full, df_test = train_test_split(
    df.drop(columns='y'), test_size=0.2, random_state=42
)
df_train, df_val = train_test_split(
    df_train_full, test_size=0.25, random_state=42
)

In [86]:
# One shape per line: train, validation, test.
print(df_train.shape, df_val.shape, df_test.shape, sep='\n')

(27126, 14)
(9042, 14)
(9043, 14)


In [87]:
# Look the labels up by row label. This is only valid while the split frames
# still carry the index they inherited from `df`.
y_train = df['y'].loc[df_train.index]
y_val = df['y'].loc[df_val.index]
y_test = df['y'].loc[df_test.index]

In [88]:
# One shape per line: train, validation, test labels.
print(y_train.shape, y_val.shape, y_test.shape, sep='\n')

(27126,)
(9042,)
(9043,)


In [89]:
# Give every split a fresh 0..n-1 index. Rebinding (instead of `inplace=True`)
# avoids mutating the objects returned by `train_test_split`.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Reset the labels as well, so features and labels share the same index and
# any index-aligned operation between them pairs the right rows (this matches
# what `split_dataset` does further down).
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Feature Importance

In [90]:
def compute_y_mutual_info(series: pd.Series) -> float:
  """Return the mutual information between `series` and the training target.

  The target is read from the notebook-level `y_train`, so `series` must hold
  one value per training row. The score is rounded to two decimal places.
  """
  score = mutual_info_score(y_train, series)
  return round(score, 2)

In [91]:
# Score every non-numeric training column against the target, then list the
# most informative columns first.
mutual_scores = (
    df_train
    .select_dtypes(exclude='number')
    .apply(compute_y_mutual_info)
)
mutual_scores.sort_values(ascending=False)

poutcome     0.03
month        0.03
job          0.01
contact      0.01
housing      0.01
education    0.00
marital      0.00
dtype: float64

## Model Training

In [92]:
def encode_categorical_vars(
    df: pd.DataFrame,
    encoder: "OneHotEncoder | None" = None,
) -> "tuple[pd.DataFrame, OneHotEncoder]":
  """One-hot encode the non-numeric columns of `df`.

  Args:
    df: Feature frame. Numeric columns pass through unchanged.
    encoder: Already-fitted encoder to reuse (e.g. for validation/test data).
      When omitted, a new `OneHotEncoder` is fitted on the non-numeric columns
      of `df`.

  Returns:
    A `(df_encoded, encoder)` tuple: the numeric columns followed by the
    one-hot columns, carrying the same index as `df`, and the encoder that
    produced them.
  """
  df_categorical = df.select_dtypes(exclude='number')
  df_numerical = df.select_dtypes(include='number')

  # Test for None explicitly instead of relying on the encoder's truthiness.
  if encoder is None:
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64, handle_unknown='infrequent_if_exist')
    encoder.fit(df_categorical)

  # The encoder returns a bare array; re-attach the caller's index so rows stay
  # paired with the numeric columns even when `df` does not have a 0..n-1 index.
  df_categorical_encoded = pd.DataFrame(
    data=encoder.transform(df_categorical),
    columns=encoder.get_feature_names_out(),
    index=df.index,
  )

  # Both parts share one index, so a column-wise concat lines them up row by row.
  df_encoded = pd.concat([df_numerical, df_categorical_encoded], axis=1)

  return df_encoded, encoder

In [93]:
# Fit the encoder on the training split only, then reuse that same encoder for
# the validation and test splits.
X_train, encoder = encode_categorical_vars(df_train)
X_val, _ = encode_categorical_vars(df_val, encoder=encoder)
X_test, _ = encode_categorical_vars(df_test, encoder=encoder)

In [94]:
# One shape per line: encoded train, validation, test matrices.
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')

(27126, 47)
(9042, 47)
(9043, 47)


In [95]:
# Baseline: logistic regression trained on all features.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [96]:
# Validation accuracy of the all-features model: the share of matching
# predictions. It is kept as the reference for the feature-elimination step.
pred_val = model.predict(X_val)
full_model_val_accuracy = (y_val == pred_val).mean()
full_model_val_accuracy

0.9013492590134926

## Feature Selection

In [97]:
def split_dataset(df: pd.DataFrame, target: str = 'y', test_size: float = .2,
                  val_size: float = .25, random_state: int = 42):
  """Split `df` into train/validation/test features and labels.

  Args:
    df: Frame holding the feature columns and the `target` column.
    target: Name of the label column; it is removed from the feature frames.
    test_size: Fraction of all rows held out for the test split.
    val_size: Fraction of the remaining (non-test) rows used for validation.
    random_state: Seed passed to both `train_test_split` calls.

  Returns:
    `(df_train, y_train), (df_val, y_val), (df_test, y_test)`, every part
    re-indexed from 0 so features and labels line up positionally and by index.
  """
  features = df.drop(columns=target)
  labels = df[target]

  # Splitting features and labels in the same call keeps them paired without
  # a second lookup by index label, which would mis-pair rows if `df` carried
  # duplicate index labels.
  X_train_full, X_test, y_train_full, y_test = train_test_split(
    features, labels, test_size=test_size, random_state=random_state)
  X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=val_size, random_state=random_state)

  def _fresh_index(part):
    # Return a re-indexed copy instead of mutating the split in place.
    return part.reset_index(drop=True)

  return (
    (_fresh_index(X_train), _fresh_index(y_train)),
    (_fresh_index(X_val), _fresh_index(y_val)),
    (_fresh_index(X_test), _fresh_index(y_test)),
  )

In [98]:
# Candidate features for the elimination experiment (the target 'y' is not
# included here).
features_list = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

In [99]:
# Leave-one-feature-out: retrain without each feature in turn and report how
# far validation accuracy falls below the all-features model.
for col in features_list:
  # every feature except `col`, with the target appended last
  features_subset = [name for name in features_list if name != col] + ['y']

  # split the reduced dataset
  df_subset = df[features_subset]
  train_part, val_part, test_part = split_dataset(df_subset)
  df_train, y_train = train_part
  df_val, y_val = val_part
  df_test, y_test = test_part

  # encode with an encoder fitted on this iteration's training split
  X_train, encoder = encode_categorical_vars(df_train)
  X_val, _ = encode_categorical_vars(df_val, encoder)
  X_test, _ = encode_categorical_vars(df_test, encoder)

  # train and evaluate
  model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)
  preds_val = model.predict(X_val)

  val_accuracy = (y_val == preds_val).mean()
  accuracy_drop = full_model_val_accuracy - val_accuracy

  print(f'Dropped feature: {col} | Accuracy drop: {accuracy_drop} ')

Dropped feature: age | Accuracy drop: 0.0005529750055297544 
Dropped feature: job | Accuracy drop: 0.0001105950011059953 
Dropped feature: marital | Accuracy drop: 0.00033178500331787486 
Dropped feature: education | Accuracy drop: 0.00033178500331787486 
Dropped feature: balance | Accuracy drop: 0.0005529750055297544 
Dropped feature: housing | Accuracy drop: 0.00044238000442387015 
Dropped feature: contact | Accuracy drop: 0.000774165007741634 
Dropped feature: day | Accuracy drop: 0.0011059500110595089 
Dropped feature: month | Accuracy drop: 0.001548330015483268 
Dropped feature: duration | Accuracy drop: 0.011612475116124732 
Dropped feature: campaign | Accuracy drop: 0.0011059500110595089 
Dropped feature: pdays | Accuracy drop: 0.00033178500331787486 
Dropped feature: previous | Accuracy drop: 0.00022119000221187957 
Dropped feature: poutcome | Accuracy drop: 0.007852245078522446 


## Parameter Tuning

In [100]:
# Regularisation strengths to compare (in scikit-learn, a smaller C means
# stronger regularisation).
C_values = [0.01, 0.1, 1, 10, 100]

# Rebuild the splits from the full feature set.
(df_train, y_train), (df_val, y_val), (df_test, y_test) = split_dataset(df)

# Encode once; the encoder is fitted on the training split and reused.
X_train, encoder = encode_categorical_vars(df_train)
X_val, _ = encode_categorical_vars(df_val, encoder)
X_test, _ = encode_categorical_vars(df_test, encoder)

# Train one model per C and report validation accuracy to three decimals.
for C_val in C_values:
  model = LogisticRegression(solver='liblinear', C=C_val, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)
  preds_val = model.predict(X_val)

  hit_rate = (y_val == preds_val).mean()
  val_accuracy = round(hit_rate, 3)
  print(f'{C_val}: {val_accuracy}')

0.01: 0.899
0.1: 0.901
1: 0.901
10: 0.901
100: 0.901
