# 03 Classification: Homework

In [366]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## Data Import

In [367]:
# Read the semicolon-separated CSV into the working DataFrame.
df = pd.read_csv('bank-full.csv', delimiter=';')

In [368]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Data Preparation

In [369]:
# Columns kept for the analysis; 'y' is the target column.
col_list = [
  'age', 'job', 'marital', 'education', 'balance',
  'housing', 'contact', 'day', 'month', 'duration',
  'campaign', 'pdays', 'previous', 'poutcome',
  'y',
]

In [370]:
# Keep only the selected columns. The explicit copy makes `df` an independent
# frame, so later column assignments are not flagged by pandas as writes to a
# slice of the loaded frame (SettingWithCopyWarning).
df = df[col_list].copy()

In [371]:
# Count missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [372]:
# Most frequent value(s) of the education column.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [373]:
# Pairwise correlation between the numeric columns.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [374]:
# Binarise the target: 'yes' -> 1, anything else -> 0.
# Accepting an already-encoded 1 keeps the cell idempotent: re-running it no
# longer compares integers against 'yes' and silently zeroes every label.
# `assign` builds a new frame instead of writing into the existing one.
df = df.assign(y=df['y'].isin(['yes', 1]).astype(int))

In [375]:
# Share of each target class, rounded to two decimals.
(df['y'].value_counts() / len(df)).round(2)

y
0    0.88
1    0.12
Name: count, dtype: float64

## Data Preprocessing

In [376]:
def split_dataset(df: pd.DataFrame, target_col: str, val_ratio: float = .2, test_ratio: float = .2, seed: int = 42):
  """Split a frame into train/validation/test feature frames and target series.

  Args:
    df: Frame holding the feature columns and the target column. It is not modified.
    target_col: Name of the target column; it is removed from the feature frames.
    val_ratio: Fraction of all rows assigned to the validation split.
    test_ratio: Fraction of all rows assigned to the test split.
    seed: Random state passed to both shuffles.

  Returns:
    ``(df_train, y_train), (df_val, y_val), (df_test, y_test)``; every frame
    and series is re-indexed from 0.

  Raises:
    ValueError: If the ratios are not fractions that leave room for a
      non-empty training split.
  """
  if not 0 < test_ratio < 1 or not 0 < val_ratio < 1 - test_ratio:
    raise ValueError(
      'val_ratio and test_ratio must be positive fractions whose sum is below 1, '
      f'got val_ratio={val_ratio}, test_ratio={test_ratio}'
    )

  features = df.drop(columns=target_col)
  target = df[target_col]

  # The validation rows are carved out of what remains after the test split,
  # so their share has to be expressed relative to that remainder.
  relative_val_ratio = val_ratio / (1 - test_ratio)

  # Splitting features and target in the same call keeps rows and labels
  # aligned by position, which also holds when the index has duplicate labels
  # (a label-based lookup of the target would return extra rows in that case).
  df_train_full, df_test, y_train_full, y_test = train_test_split(
    features, target, test_size=test_ratio, random_state=seed
  )
  df_train, df_val, y_train, y_val = train_test_split(
    df_train_full, y_train_full, test_size=relative_val_ratio, random_state=seed
  )

  splits = ((df_train, y_train), (df_val, y_val), (df_test, y_test))
  return tuple(
    (part_X.reset_index(drop=True), part_y.reset_index(drop=True))
    for part_X, part_y in splits
  )


In [377]:
# Split with the function defaults (20% validation, 20% test, seed 42).
(
  (df_train, y_train),
  (df_val, y_val),
  (df_test, y_test),
) = split_dataset(df, target_col='y')

In [378]:
# Sanity check: features and target of each split have the same number of rows.
for part_X, part_y in ((df_train, y_train), (df_val, y_val), (df_test, y_test)):
  print(part_X.shape, part_y.shape)

(27126, 14) (27126,)
(9042, 14) (9042,)
(9043, 14) (9043,)


## Exploratory Data Analysis

In [379]:
def target_mutual_info(series, target=None):
  """Mutual information between one column and the target labels.

  Args:
    series: Column values, one per row.
    target: Labels aligned row-by-row with ``series``. When omitted, the
      notebook-level ``y_train`` is used, so existing single-argument calls
      (e.g. via ``DataFrame.apply``) behave as before.

  Returns:
    The value returned by ``mutual_info_score(series, target)``.
  """
  if target is None:
    # Fall back to the training labels currently defined in the notebook.
    target = y_train
  return mutual_info_score(series, target)

In [380]:
# Rank the non-numeric training columns by mutual information with the target.
(
  df_train
  .select_dtypes(exclude='number')
  .apply(target_mutual_info)
  .sort_values(ascending=False)
  .round(2)
)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

## Model Training

In [381]:
def encode_vars(df: pd.DataFrame, dv: 'DictVectorizer | None' = None):
  """Encode a frame with a DictVectorizer and return the result as a DataFrame.

  Args:
    df: Frame to encode; each row is passed to the vectorizer as a dict.
    dv: Vectorizer to reuse. When ``None``, a new dense ``DictVectorizer`` is
      created and fitted on ``df``.

  Returns:
    ``(df_encoded, dv)``: the encoded frame, with columns named after the
    vectorizer's output features, and the vectorizer that produced it.
  """
  records = df.to_dict(orient='records')

  # Compare against None explicitly: a truthiness test would discard (and
  # refit) any supplied vectorizer that happens to evaluate as falsy.
  if dv is None:
    dv = DictVectorizer(sparse=False)
    dv.fit(records)

  df_encoded = pd.DataFrame(
    data=dv.transform(records),
    columns=dv.get_feature_names_out()
  )

  return df_encoded, dv

In [382]:
# Fit the vectorizer on the training rows, then reuse it for the other splits.
X_train, dv = encode_vars(df_train)
X_val, _ = encode_vars(df_val, dv=dv)
X_test, _ = encode_vars(df_test, dv=dv)

In [383]:
# All three matrices should have the same number of encoded columns.
for encoded_part in (X_train, X_val, X_test):
  print(encoded_part.shape)

(27126, 47)
(9042, 47)
(9043, 47)


In [384]:
# Baseline: logistic regression on every feature, scored by validation accuracy.
model = LogisticRegression(
  solver='liblinear',
  C=1.0,
  max_iter=1000,
  random_state=42,
).fit(X_train, y_train)

preds_val = model.predict(X_val)
# Fraction of validation rows whose predicted label equals the true label.
full_model_val_accuracy = (y_val == preds_val).mean()
full_model_val_accuracy

0.9015704490157045

## Feature Selection

In [385]:
# Candidate features for the leave-one-out comparison (the target is not included).
features_list = [
  'age', 'job', 'marital', 'education', 'balance',
  'housing', 'contact', 'day', 'month', 'duration',
  'campaign', 'pdays', 'previous', 'poutcome',
]

In [386]:
# Split once, outside the loop. train_test_split selects rows by position from
# the row count and the seed, so dropping a column after splitting gives the
# same train/validation rows as re-splitting each reduced frame.
(df_train, y_train), (df_val, y_val), (df_test, y_test) = split_dataset(df[features_list + ['y']], 'y')

for feature in features_list:
  # Use loop-local names so the full-feature matrices, vectorizer and model
  # defined earlier in the notebook are not overwritten by reduced versions.
  X_train_subset, dv_subset = encode_vars(df_train.drop(columns=feature))
  X_val_subset, _ = encode_vars(df_val.drop(columns=feature), dv_subset)

  subset_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
  subset_model.fit(X_train_subset, y_train)

  subset_preds_val = subset_model.predict(X_val_subset)
  val_accuracy = (y_val == subset_preds_val).mean()
  # Positive values mean the model got worse without this feature.
  accuracy_drop = full_model_val_accuracy - val_accuracy

  print(f'Dropped feature: {feature} | Accuracy drop: {accuracy_drop} ')

Dropped feature: age | Accuracy drop: 0.0005529750055297544 
Dropped feature: job | Accuracy drop: 0.00022119000221187957 
Dropped feature: marital | Accuracy drop: 0.000774165007741634 
Dropped feature: education | Accuracy drop: 0.0009953550099535136 
Dropped feature: balance | Accuracy drop: 0.0008847600088476293 
Dropped feature: housing | Accuracy drop: 0.00044238000442375913 
Dropped feature: contact | Accuracy drop: 0.0008847600088476293 
Dropped feature: day | Accuracy drop: 0.00033178500331787486 
Dropped feature: month | Accuracy drop: 0.0016589250165892633 
Dropped feature: duration | Accuracy drop: 0.012386640123866366 
Dropped feature: campaign | Accuracy drop: 0.0005529750055297544 
Dropped feature: pdays | Accuracy drop: 0.0005529750055297544 
Dropped feature: previous | Accuracy drop: 0.00033178500331787486 
Dropped feature: poutcome | Accuracy drop: 0.007631055076310567 


## Parameter Tuning

In [387]:
# Values of the LogisticRegression `C` parameter to compare.
C_vals = [
  0.01,
  0.1,
  1,
  10,
  100,
]

In [388]:
# The split and the encoding do not depend on C, so they are built once
# instead of being recomputed on every iteration.
(df_train, y_train), (df_val, y_val), (df_test, y_test) = split_dataset(df, 'y')

X_train, dv = encode_vars(df_train)
X_val, _ = encode_vars(df_val, dv)
X_test, _ = encode_vars(df_test, dv)

for C_val in C_vals:
  model = LogisticRegression(solver='liblinear', C=C_val, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  preds_val = model.predict(X_val)
  # Validation accuracy, rounded to three decimals for comparison across C.
  val_accuracy = round((y_val == preds_val).mean(), 3)
  print(C_val, val_accuracy)

0.01 0.898
0.1 0.9
1 0.902
10 0.901
100 0.901
