# Data Classification

## 0. Imports

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly import graph_objects as go, express as px, subplots
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing  import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from math import ceil
from itertools import permutations
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

## 1. Data Preparation

In [11]:
df = pd.read_csv('data/bank_fin.csv', sep=';')
df.tail(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
11159,32,technician,single,secondary,no,"29,00 $",no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0 $,no,yes,cellular,8,may,9,2,172,5,failure,no
11161,34,technician,married,secondary,no,0 $,no,no,cellular,9,jul,628,1,-1,0,unknown,no


In [12]:
for col in df:
    df[col] = df[col].apply(lambda x: np.nan if x == 'unknown' else x)
gaps = df.isna().sum()
gaps = gaps[gaps > 0]
display(gaps)

job            70
education     497
balance        25
contact      2346
poutcome     8326
dtype: int64

In [13]:
df['job'] = df['job'].fillna(df['job'].mode()[0])
df['education'] = df['education'].fillna(df['education'].mode()[0])

subplots.make_subplots(rows=1, cols=2, subplot_titles=('Occupation', 'Education'))\
    .add_trace(go.Histogram(x=df['job'], marker_color='rgb(26, 118, 255)'), row=1, col=1)\
    .add_trace(go.Histogram(x=df['education'], marker_color='darkblue'), row=1, col=2)\
    .update_layout(showlegend=False)\
    .show()

In [14]:
df['balance'] = df['balance'].apply(lambda x: float(str(x).replace(' ','').replace('$','').replace(',','.'))) /1000
df['balance'] = df['balance'].fillna(df['balance'].median())
hist1 = go.Histogram(x=df['balance'], marker_color='rgb(26, 118, 255)')

bounds = [np.percentile(df['balance'], 25), np.percentile(df['balance'], 75)]
lower_bound, upper_bound = [(5*x-3*y)/2 for x,y in set(permutations(bounds))]
df = df[(df['balance']<=upper_bound) & (df['balance']>=lower_bound)]
hist2 = go.Histogram(x=df['balance'], marker_color='darkblue')

df['debt'] = df['balance'].apply(lambda x: 1 if x<0 else 0)

subplots.make_subplots(rows=1, cols=2, subplot_titles=('Original (With Outliers)', 'Clean (No Outliers)', 'Debt', 'Savings'))\
    .add_trace(hist1, row=1, col=1).add_trace(hist2, row=1, col=2)\
    .update_layout(title_text='Balance Distribution', showlegend=False)\
    .show()

In [15]:
df['pdays'] = df['pdays'] /20
df['duration'] = np.log(df['duration'])
hist1 = go.Histogram(x=df['duration'], marker_color='rgb(26, 118, 255)')

df['age'] = np.log(df['age'])
hist2 = go.Histogram(x=df['age'], nbinsx=80, marker_color='darkblue')

subplots.make_subplots(rows=1, cols=2, subplot_titles=('Log(Duration)', 'Log(Age)'))\
    .add_trace(hist1,row=1,col=1).add_trace(hist2,row=1,col=2)\
    .update_layout(title_text='Lognormal Features', showlegend=False)\
    .show()

In [16]:
df['age_group'] = df['age'].apply(lambda x: ('<30','30-40','40-50','50-60','60+')[min(max(ceil((np.exp(x)-29)/10), 0), 4)])
go.Figure().add_trace(go.Histogram(x=df['age_group'], marker_color='darkblue')).update_layout(title_text='Age Distribution').show()

## 2. EDA

In [17]:
df.groupby(['deposit']).count()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,debt,age_group
deposit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
no,5424,5424,5424,5424,5424,5424,5424,5424,3749,5424,5424,5424,5424,5424,5424,862,5424,5424
yes,4681,4681,4681,4681,4681,4681,4681,4681,4195,4681,4681,4681,4681,4681,4681,1673,4681,4681


In [18]:
fig = go.Figure()
for i, col in enumerate(['age','balance','day','duration','campaign','pdays','previous']):
    fig.add_trace(go.Box(y=df[col], name=col, marker_color=f"rgb({14*i},{17*i},{139+17*i})"))
fig.update_layout(height=600, title_text="Numerical Columns Distributions", showlegend=False).show()
df = df.drop('age', axis=1)

In [19]:
df.describe(include='object')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit,age_group
count,10105,10105,10105,10105,10105,10105,7944,10105,2535,10105,10105
unique,11,3,3,2,2,2,2,12,3,2,5
top,management,married,secondary,no,no,no,cellular,may,failure,no,30-40
freq,2315,5715,5517,9939,5243,8712,7283,2617,1109,5424,4274


In [20]:
graphs = np.array([
    ['poutcome','previous campaign outcome'], ['marital','marital status'], ['education','education level'], 
    ['default','default'], ['housing','housing status'], ['loan','loan status'], 
    ['job', 'occupation'], ['contact','means of contact'], ['month','month'] 
])
fig = subplots.make_subplots(rows=3, cols=3, subplot_titles=['...'+t for t in graphs[:,1]])
for id, graph in enumerate(graphs):
    a = df.groupby(graph[0])['deposit'].value_counts()
    x = df[graph[0]].dropna().unique()
    y = [a[multiindex,'yes']/(a[multiindex,'yes']+a[multiindex,'no']) for multiindex in x]
    fig.add_trace(go.Bar(x=x, y=y, marker_color='rgb(26, 118, 255)' if id%2==1 else 'darkblue'), row=id//3+1, col=id%3+1)
fig.update_layout(height=800, title_text="Campaign Success Rate by...", showlegend=False).show()

In [21]:
deposited, not_deposited = df[df['deposit'] == 'yes'], df[df['deposit'] == 'no']
table_1 = pd.pivot_table(deposited, index='education', columns='marital', values='deposit', aggfunc='count')
table_2 = pd.pivot_table(not_deposited, index='education', columns='marital', values='deposit', aggfunc='count')
table = table_1/(table_1+table_2)
px.imshow(table).update_layout(title='Success Rate by Social Group')

## 3. Feature Transformation/Extraction

In [22]:
print(df.shape[1], '->', end=' ')

for label_col in ('education','age_group'):
    df[label_col] = LabelEncoder().fit_transform(df[label_col])

for bin_col in ('deposit','default','housing','loan'):
    df[bin_col] = df[bin_col].apply(lambda x: {'yes':1,'no':0}[x])

df = pd.get_dummies(df)

print(df.shape[1], 'features')

18 -> 44 features


In [23]:
correlation = df.corr()
for col in correlation:
    correlation[col] = correlation[col].apply(lambda x: np.nan if (x==1) else x)
px.imshow(correlation).update_layout(title='Feature Correlation')

In [24]:
X, y = df.drop(['deposit'], axis=1), np.array(df['deposit'])

print(X.shape[1], '->', end=' ')
X = PolynomialFeatures(2).fit_transform(X)
print(X.shape[1], 'features')

k = 700
if k < X.shape[1]:
    print(X.shape[1], f"-> {k} features")
    selector = SelectKBest(score_func=f_classif, k=k).fit(X, y)
    X = X[:,selector.get_support(indices=True)]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42, test_size = 0.2)

43 -> 990 features
990 -> 700 features


## 4: Simple Models: LogReg & Decision Trees

In [25]:
Metrics = pd.DataFrame({'model':[], 'sample':[], 'accuracy':[],'precision':[],'recall':[],'f1':[]})
keys = ('accuracy', 'precision', 'recall', 'f1')

def metrics(model_name, sample, model, X, y, round_to=3, return_row=False):
    y_pred = model.predict(X)
    A, P, R, F1, _ = accuracy_score(y, y_pred), *precision_recall_fscore_support(y, y_pred)
    row = {'model':[model_name], 'sample':[sample], 'accuracy':[round(A,round_to)], \
         'precision':[round(P.mean(),round_to)], 'recall':[round(R.mean(),round_to)], 'f1':[round(F1.mean(),round_to)]}
    global Metrics
    Metrics = pd.concat([Metrics, pd.DataFrame(row)], ignore_index=True)
    print(row['sample'][0].title(), ': ', end='')
    print(*[metric+': '+str(row[metric][0]) for metric in keys], sep=', ')
    if return_row:
        return row

def graph_model(model, fig=None, shift=0.75):
    th = np.array(['train_'+x for x in keys]+['test_'+x for x in keys])
    rs = np.concatenate([np.array(Metrics[(Metrics['model']==model)&(Metrics['sample']==sample)].iloc[0,2:]) \
         for sample in ('train','test')]) - np.array([shift]*8)
    if fig is None:
        fig = go.Figure()
    fig.add_trace(go.Scatterpolar(r=rs, theta=th, fill='toself', name=model)).update_layout(title_text="Model Comparison")
    return fig

LogReg

In [26]:
lr = LogisticRegression(solver='sag', random_state=42, max_iter=1000)
lr.fit(X_train, y_train)

metrics('lr', 'train', lr, X_train, y_train)
metrics('lr', 'test', lr, X_test, y_test)

Train : accuracy: 0.888, precision: 0.887, recall: 0.888, f1: 0.887
Test : accuracy: 0.844, precision: 0.843, recall: 0.843, f1: 0.843


Decision Tree

In [27]:
train_As, test_As = [], []
trees = range(3,18)
for i in trees:
    dt = DecisionTreeClassifier(criterion='entropy', random_state = 42, max_depth=i)
    dt.fit(X_train, y_train)
    train_As.append(accuracy_score(y_train, dt.predict(X_train)))
    test_As.append(accuracy_score(y_test, dt.predict(X_test)))
go.Figure()\
    .add_trace(go.Scatter(x=list(trees), y=train_As, name=f"Train", marker_color='rgb(26, 118, 255)'))\
    .add_trace(go.Scatter(x=list(trees), y=test_As, name=f"Test", marker_color='darkblue'))\
    .update_layout(barmode='overlay', title_text="Accuracy by Tree Depth", xaxis_title_text='Max Depth', yaxis_title_text='Accuracy')\
    .show()

In [28]:
dt = DecisionTreeClassifier(criterion='entropy', random_state = 42, max_depth=trees[np.array(test_As).argmax()])
dt.fit(X_train, y_train)

metrics('dt', 'train', dt, X_train, y_train)
metrics('dt', 'test', dt, X_test, y_test)

Train : accuracy: 0.893, precision: 0.895, recall: 0.891, f1: 0.892
Test : accuracy: 0.818, precision: 0.817, recall: 0.816, f1: 0.817


Comparison

In [29]:
graph_model('dt', graph_model('lr')).show()

## 5. Ensembles: Random Forest, Boosting, Stacking

Random Forest

In [30]:
def optune(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200, 2)
    max_depth = trial.suggest_int('max_depth', 1, 30, 2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 15, 1)
    criterion = trial.suggest_categorical('criterion', ["gini", "entropy"])
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return f1_score(y_train, model.predict(X_train))
study = optuna.create_study(study_name="rf", direction="maximize", )
study.optimize(optune, n_trials=50)
print('Best parameters', end=': ')
print(*[k+': '+str(v) for k,v in study.best_params.items()], sep=', ')

Best parameters: n_estimators: 174, max_depth: 27, min_samples_leaf: 4, criterion: gini


In [31]:
rf = RandomForestClassifier(**study.best_params, random_state=42)
rf.fit(X_train, y_train)

metrics('rf', 'train', rf, X_train, y_train)
metrics('rf', 'test', rf, X_test, y_test)

Train : accuracy: 0.93, precision: 0.929, recall: 0.932, f1: 0.93
Test : accuracy: 0.851, precision: 0.851, recall: 0.853, f1: 0.851


Gradient Boosting

In [32]:
def optune(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200, 2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5, 1)
    model = GradientBoostingClassifier(random_state=42, learning_rate=0.5, max_depth=3)
    model.fit(X_train, y_train)
    return f1_score(y_train, model.predict(X_train))
study = optuna.create_study(study_name="gb", direction="maximize")
study.optimize(optune, n_trials=50)
print('Best parameters', end=': ')
print(*[k+': '+str(v) for k,v in study.best_params.items()], sep=', ')

Best parameters: n_estimators: 170, min_samples_leaf: 2


In [33]:
gb = GradientBoostingClassifier(**study.best_params, random_state=42, learning_rate=0.5, max_depth=3)
gb.fit(X_train, y_train)

metrics('gb', 'train', gb, X_train, y_train)
metrics('gb', 'test', gb, X_test, y_test)

Train : accuracy: 0.958, precision: 0.957, recall: 0.958, f1: 0.957
Test : accuracy: 0.844, precision: 0.843, recall: 0.845, f1: 0.844


Stacking

In [34]:
st = StackingClassifier(estimators=[('rf',rf), ('gb',gb)])
st.fit(X_train, y_train)

metrics('st', 'train', st, X_train, y_train)
metrics('st', 'test', st, X_test, y_test)

Train : accuracy: 0.951, precision: 0.95, recall: 0.951, f1: 0.95
Test : accuracy: 0.853, precision: 0.852, recall: 0.854, f1: 0.853


Comparison

In [35]:
graph_model('st', graph_model('gb', graph_model('rf'))).show()

Total Comparison

In [36]:
graph_model('st', graph_model('gb', graph_model('rf', graph_model('dt', graph_model('lr'))))).show()