In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [2]:
data = pd.read_csv("../datasets/uci_repo/bank-additional/bank-additional/bank-additional-full.csv", sep=";")
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [3]:
target = data.pop("y")
target

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

In [4]:
data['poutcome'].value_counts()

nonexistent    35563
failure         4252
success         1373
Name: poutcome, dtype: int64

In [5]:
data['pdays'].value_counts()

999    39673
3        439
6        412
4        118
9         64
2         61
7         60
12        58
10        52
5         46
13        36
11        28
1         26
15        24
14        20
8         18
0         15
16        11
17         8
18         7
19         3
22         3
21         2
20         1
25         1
26         1
27         1
Name: pdays, dtype: int64

In [6]:
target.value_counts()

no     36548
yes     4640
Name: y, dtype: int64

In [7]:
target[(data['pdays'] == 999) & (data['poutcome'] != 'nonexistent')].value_counts()

no     3578
yes     532
Name: y, dtype: int64

In [8]:
# Categorical columns
data[data.select_dtypes(exclude=['number']).columns]

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent
...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent


In [9]:
# Encode for string labels (target)
label_encoder = LabelEncoder().fit(target)
y = label_encoder.transform(target)
y

array([0, 0, 0, ..., 0, 1, 0])

In [10]:
data[data.select_dtypes(exclude=['number']).columns]

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent
...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent


In [11]:
# Encoding
enc = OneHotEncoder()
enc.fit(data[data.select_dtypes(exclude=['number']).columns])
print(enc.categories_)
encoded_array = enc.transform(data[data.select_dtypes(exclude=['number']).columns]).toarray()
# enc.inverse_transform(encoded_array)

[array(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'], dtype=object), array(['divorced', 'married', 'single', 'unknown'], dtype=object), array(['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
       'professional.course', 'university.degree', 'unknown'],
      dtype=object), array(['no', 'unknown', 'yes'], dtype=object), array(['no', 'unknown', 'yes'], dtype=object), array(['no', 'unknown', 'yes'], dtype=object), array(['cellular', 'telephone'], dtype=object), array(['apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct',
       'sep'], dtype=object), array(['fri', 'mon', 'thu', 'tue', 'wed'], dtype=object), array(['failure', 'nonexistent', 'success'], dtype=object)]


In [12]:
pd.get_dummies(data[data.select_dtypes(exclude=['number']).columns])

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41184,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41185,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41186,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [13]:
# Final data frame
train = np.concatenate(
    (
    np.array(data[data.select_dtypes(exclude=['object']).columns]),
    encoded_array),
    axis=1
)

In [14]:
train.shape

(41188, 63)

### Model Training

#### Split Data into train and test



# split data into train and test sets

seed = 1
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.25, random_state=seed)

#### Stratified K-fold Cross Validation

In [15]:
def train_model(train_array, 
                target_array,
                test_array,
                weights=1,
                eta_=0.3, 
                gamma_=3, 
                reg_lambda_=0,
                reg_alpha_=0):
    
    model = XGBClassifier(scale_pos_weight=weights,
                          use_label_encoder=False,  # Use of Label encoder will be deprecated
                          booster="gbtree",
                          verbosity=0,
                          objective="binary:logistic",
                          eval_metric="logloss", 
                          eta=eta_, 
                          gamma=gamma_,
                          reg_lambda=reg_lambda_,
                          reg_alpha=reg_alpha_)
    model.fit(train_array, target_array)
    predictions = model.predict(test_array)
    pred_prob = model.predict_proba(test_array)
    
    return predictions, pred_prob


def calculate_metrics(predictions,
                      prediction_probabilities,
                      target_test):
    precision = metrics.precision_score(y_true=target_test, y_pred=predictions)
    recall = metrics.recall_score(y_true=target_test, y_pred=predictions)
    accuracy = metrics.accuracy_score(y_true=target_test, y_pred=predictions)
    f1_score = 2*precision*recall/(precision + recall)
    lr_auc = metrics.roc_auc_score(y_true=target_test, y_score=prediction_probabilities[:, 1])
    
    cm = metrics.confusion_matrix(y_true=target_test, y_pred=predictions)
    
    return {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "f1_score": f1_score,
        "auc": lr_auc,
        "cm": cm
    }

In [19]:
kfold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

In [27]:
precision_list = []
recall_list = []
accuracy_list = []
f1_list = []
auc_list = []

for _train, test in kfold.split(train, y):
#     print(_train, test)
    X_train, X_test = train[_train], train[test]
    y_train, y_test = y[_train], y[test]
#     counts = np.unique(y_train, return_counts=True)
    weightage = 1
    pred, pred_prob = train_model(train_array=X_train, 
                                  target_array=y_train,
                                  test_array=X_test, 
                                  weights=weightage)
    output_metrics = calculate_metrics(pred, pred_prob, y_test)
    precision_list.append(output_metrics["precision"])
    recall_list.append(output_metrics["recall"])
    accuracy_list.append(output_metrics["accuracy"])
    f1_list.append(output_metrics["f1_score"])
    auc_list.append(output_metrics["auc"])

print('Results for model not handling imbalanced classes.')
print(f'\nPrecision -> {np.mean(precision_list)}')
print(f'\nRecall -> {np.mean(recall_list)}')
print(f'\nAccuracy -> {np.mean(accuracy_list)}')
print(f'\nAUC -> {np.mean(auc_list)}')
print(f'\nf1_score -> {np.mean(f1_list)}')


Results for model not handling imbalanced classes.

Precision -> 0.644358717217264

Recall -> 0.5422413793103449

Accuracy -> 0.9146840044376733

AUC -> 0.9486126762344828

f1_score -> 0.5887383596485491


In [44]:
import xgboost as xgb
def train_model(train_array, 
                target_array,
                test_array,
                weights=1,
                eta_=0.3, 
                gamma_=3, 
                reg_lambda_=0,
                reg_alpha_=0):
    data_d = xgb.DMatrix(train_array, target_array)
    data_t = xgb.DMatrix(test_array)
    model = xgb.train({'scale_pos_weight':weights,
                          'booster':"gbtree",
                          'verbosity':0,
                          'objective':"binary:logistic",
                          'eval_metric':"logloss", 
                          'eta':eta_, 
                          'gamma':gamma_,
                          'reg_lambda':reg_lambda_,
                          'reg_alpha':reg_alpha_},
                          dtrain=data_d)
#     model.fit(train_array, target_array)
    predictions = model.predict(data_t)
    pred_prob = model.predict_proba(data_t)
    
    return predictions, pred_prob

precision_list = []
recall_list = []
accuracy_list = []
f1_list = []
auc_list = []

for _train, test in kfold.split(train, y):
#     print(_train, test)
    X_train, X_test = train[_train], train[test]
    y_train, y_test = y[_train], y[test]
#     counts = np.unique(y_train, return_counts=True)
    weightage = 1
    pred, pred_prob = train_model(train_array=X_train, 
                                  target_array=y_train,
                                  test_array=X_test, 
                                  weights=weightage)
    output_metrics = calculate_metrics(pred, pred_prob, y_test)
    precision_list.append(output_metrics["precision"])
    recall_list.append(output_metrics["recall"])
    accuracy_list.append(output_metrics["accuracy"])
    f1_list.append(output_metrics["f1_score"])
    auc_list.append(output_metrics["auc"])

print('Results for model not handling imbalanced classes.')
print(f'\nPrecision -> {np.mean(precision_list)}')
print(f'\nRecall -> {np.mean(recall_list)}')
print(f'\nAccuracy -> {np.mean(accuracy_list)}')
print(f'\nAUC -> {np.mean(auc_list)}')
print(f'\nf1_score -> {np.mean(f1_list)}')



AttributeError: 'Booster' object has no attribute 'predict_proba'

In [16]:
precision_list = []
recall_list = []
accuracy_list = []
f1_list = []
auc_list = []

for _train, test in kfold.split(train, y):
#     print(_train, test)
    X_train, X_test = train[_train], train[test]
    y_train, y_test = y[_train], y[test]
    
    counts = np.unique(y_train, return_counts=True)
    weightage = counts[1][0]/ counts[1][1]
    pred, pred_prob = train_model(train_array=X_train, 
                                  target_array=y_train,
                                  test_array=X_test, 
                                  weights=weightage)
    output_metrics = calculate_metrics(pred, pred_prob, y_test)
    precision_list.append(output_metrics["precision"])
    recall_list.append(output_metrics["recall"])
    accuracy_list.append(output_metrics["accuracy"])
    f1_list.append(output_metrics["f1_score"])
    auc_list.append(output_metrics["auc"])
#     metrics.ConfusionMatrixDisplay(output_metrics["cm"]).plot()
#     print(output_metrics["precision"])
#     print(output_metrics["recall"])
print('Results for model handling imbalanced classes.')
print(f'\nPrecision -> {np.mean(precision_list)}')
print(f'\nRecall -> {np.mean(recall_list)}')
print(f'\nAccuracy -> {np.mean(accuracy_list)}')
print(f'\nAUC -> {np.mean(auc_list)}')
print(f'\nf1_score -> {np.mean(f1_list)}')


7.876676245210728
7.876676245210728
7.876676245210728
7.876676245210728
7.876676245210728
7.876676245210728
7.876676245210728
7.876676245210728
7.876915708812261
7.876915708812261
Results for model handling imbalanced classes.

Precision -> 0.4923083420205101

Recall -> 0.8523706896551724

Accuracy -> 0.8842867739627103

AUC -> 0.9428184412124498

f1_score -> 0.6240257505700715


In [21]:
# Make a dataframe using these values for weightages ranging from 1 to 8
out_dict = {}
for _counts in [1,2,3,4,5,6,7,8]:
    precision_list = []
    recall_list = []
    accuracy_list = []
    f1_list = []
    auc_list = []
    for _train, test in kfold.split(train, y):
        X_train, X_test = train[_train], train[test]
        y_train, y_test = y[_train], y[test]

        weightage = _counts
        pred, pred_prob = train_model(train_array=X_train, 
                                      target_array=y_train,
                                      test_array=X_test, 
                                      weights=weightage)
        output_metrics = calculate_metrics(pred, pred_prob, y_test)
        precision_list.append(output_metrics["precision"])
        recall_list.append(output_metrics["recall"])
        accuracy_list.append(output_metrics["accuracy"])
        f1_list.append(output_metrics["f1_score"])
        auc_list.append(output_metrics["auc"])
    
    out_dict[f'Weightage_{_counts}'] = [
        np.mean(precision_list), np.mean(recall_list), np.mean(accuracy_list), np.mean(auc_list), np.mean(f1_list)]

_data_frame = pd.DataFrame(out_dict, index=['Precision', 'Recall', 'Accuracy', 'AUC', 'F1_score'])
_data_frame

Unnamed: 0,Weightage_1,Weightage_2,Weightage_3,Weightage_4,Weightage_5,Weightage_6,Weightage_7,Weightage_8
Precision,0.644359,0.591842,0.552326,0.536382,0.516004,0.512368,0.501243,0.496241
Recall,0.542241,0.709914,0.756681,0.792241,0.830172,0.827802,0.83944,0.853879
Accuracy,0.914684,0.912183,0.903443,0.899413,0.8931,0.891765,0.887734,0.885865
AUC,0.948613,0.947812,0.945811,0.945347,0.945469,0.94375,0.943081,0.944225
F1_score,0.588738,0.64544,0.638387,0.639553,0.636233,0.632847,0.627567,0.627621


In [24]:
# _data_frame.to_csv('scores_for_various_weightages.csv')

In [22]:
precision_list = []
recall_list = []
accuracy_list = []
f1_list = []
auc_list = []

for _train, test in kfold.split(train, y):
#     print(_train, test)
    X_train, X_test = train[_train], train[test]
    y_train, y_test = y[_train], y[test]
    
    counts = np.unique(y_train, return_counts=True)
    weightage = 9
    pred, pred_prob = train_model(train_array=X_train, 
                                  target_array=y_train,
                                  test_array=X_test, 
                                  weights=weightage)
    output_metrics = calculate_metrics(pred, pred_prob, y_test)
    precision_list.append(output_metrics["precision"])
    recall_list.append(output_metrics["recall"])
    accuracy_list.append(output_metrics["accuracy"])
    f1_list.append(output_metrics["f1_score"])
    auc_list.append(output_metrics["auc"])
#     metrics.ConfusionMatrixDisplay(output_metrics["cm"]).plot()
#     print(output_metrics["precision"])
#     print(output_metrics["recall"])
print('Results for model handling imbalanced classes.')
print(f'\nPrecision -> {np.mean(precision_list)}')
print(f'\nRecall -> {np.mean(recall_list)}')
print(f'\nAccuracy -> {np.mean(accuracy_list)}')
print(f'\nAUC -> {np.mean(auc_list)}')
print(f'\nf1_score -> {np.mean(f1_list)}')


Results for model handling imbalanced classes.

Precision -> 0.4853543847317902

Recall -> 0.8614224137931036

Accuracy -> 0.8813975758343247

AUC -> 0.9424792071805264

f1_score -> 0.6207478119634195


In [None]:
# Important observations
# 1. Precision fell - Positives are found more often as the classes are balanced now. Hence Precision fell
# 2. Recall increased - Positives are found more often which has reduced the chances of predicting False negatives.
#                       Hence Recall has increased
# 3. AUC reduced slightly
# 4. Accuracy reduced slightly - As class imbalance has been accounted for, the accuracy has reduced.

In [None]:
# 5. ALIFT is yet to be calculated

In [None]:
# 2 to 8 for scale_pos_weight

#### Rough section

In [None]:
a = pd.DataFrame([['aum', 'pandya'], ['nester', 'joseph'], ['shreyas', 'seshadri']], columns=['name', 'surname'])
a

In [None]:
# Encoding
enc = OneHotEncoder()
enc.fit(a)
print(enc.categories_)
enc.transform(a).toarray()

In [None]:
a = np.array([[1,2], [3,4]])
b = np.array([[1,2,3], [1,2,3]])
b.shape

In [None]:
a
b
np.concatenate((a,b), axis=1)