In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
#!pip install imblearn
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score

In [30]:
#!pip install imblearn
#from imblearn.under_sampling import RandomUnderSampler
#!pip install -U scikit-learn


#### Ask why it is giving an error

In [4]:
df = pd.read_csv(r"C:\Users\brzro\concordia_bootcamp\bank_marketing\data\bank-additional\bank-additional-full.csv", sep = ';')
pd.set_option('display.max_columns', None)

In [5]:
df = df.drop_duplicates().reset_index()

In [6]:

df['y'].value_counts()

no     36537
yes     4639
Name: y, dtype: int64

### Train_Test Split & Stratifying:

since the number of positive answers is importatn in this analysis, we should have similar percentage of positives in either of the test and train sets as the percentage of positives in the original dataset. 

In [7]:
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42 )
for train_index, test_index in split.split(df, df['y']):
    strat_train = df.loc[train_index]
    strat_test = df.loc[test_index]

In [8]:
print('perdentage of yes in the original dataset', 
      round(len(df[df['y']=='yes'])*100/len(df), 3))

print('perdentage of yes in strat_train', 
      round(len(strat_train[strat_train['y']=='yes'])*100/len(strat_train),3))

print('perdentage of yes in strat_test', 
      round(len(strat_test[strat_test['y']=='yes'])*100/len(strat_test),3))


# it is stratified successfully

perdentage of yes in the original dataset 11.266
perdentage of yes in strat_train 11.266
perdentage of yes in strat_test 11.268


In [9]:
strat_train_X = strat_train.drop('y', axis = 1)
strat_train_y = strat_train['y']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(strat_train_X, strat_train_y, test_size=0.2, random_state=42, stratify = strat_train_y)

In [14]:
# checking the stratification is done correctly

print('percentage of yes and no in the train set\n', 
      100*y_train.value_counts()/len(y_train))
print('percentage of yes and no in the test set\n',
      y_test.value_counts()/len(y_test))
print('number of yes and no in the train set\n', 
      y_train.value_counts()) # data is highly imbalanced, tuning is required



percentage of yes and no in the train set
 no     88.733303
yes    11.266697
Name: y, dtype: float64
percentage of yes and no in the test set
 no     0.887371
yes    0.112629
Name: y, dtype: float64
number of yes and no in the train set
 no     23383
yes     2969
Name: y, dtype: int64


### How to deal with the imbalanced data set?

Most Machine Learning models work well when a classification problem is relatively balanced. Since the data is highly imbalanced, we should use downsampling, upsampling, adding class weights or provide a reason why we believe we shoud keep the data as it is.

For this project, I use undersampling and adding class weight.

### Performance measurement for an imbalanced data set:

1- Balanced Accuracy

2- AUC

3- F1 score which will consider both precision and recall

### Undersampling dataset:

In [34]:
yes = strat_train[strat_train['y']=='yes'] # the train that y == yes
#yes

In [36]:
no = strat_train[strat_train['y']=='no'] # the train that y== no
#no

In [42]:
no_sample = no.sample(random_state = 42, frac = 0.30) # undersampling the No s
#no_sample

In [45]:
under = pd.concat([yes, no_sample])  # create a new dataset and shuffle it
under = shuffle(under, random_state = 42)

In [51]:
under_X = under.drop('y', axis = 1) # the undersampled X

In [55]:
under_y = under.y # the undersampled y
#under_y



Now the sample is ready for further analysis. We have created an undersampled dataset, and also we have the original dataset (we will use both with and without class weight). Therefore, we can compare their performance.

### Feature Engineering

The main purpose of this part is to create dummies for categorical variables, and deal with some outliers.

In [None]:
# ### age:
# under_X.loc[under_X['age']> 65, 'age'] = 65 

# ### campaign:
# under_X.loc[under_X['campaign']> 9, 'campaign'] = 9### the max is 56 and it is not a reasonable number so we jsut put a cap on the max 


In [59]:
#under_X = under_X.drop('index', axis = 1)
under_X = under_X.drop('pdays', axis = 1)

In [64]:
numerics = ['age', 'duration' , 'campaign', 'previous', 'emp.var.rate',
            'cons.price.idx','cons.conf.idx', 'euribor3m', 'nr.employed']
categoricals = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'day_of_week', 'poutcome']

In [66]:
# making dummies:

under_X_d = pd.get_dummies(under_X, columns = categoricals, drop_first=True)
#under_X_d

In [72]:
under_y_new = np.where(under_y == 'yes', 1, 0)

In [75]:
#under_y

In [76]:
#under_y_new

now the under_sampled data is ready for further analysis

### Logistic Regression

In [69]:
scalar = StandardScaler()
under_X_scaled = scalar.fit_transform(under_X_d)

In [71]:
LR = LogisticRegression(random_state=42, max_iter=1_000_000)

In [77]:
LR.fit(under_X_scaled, under_y_new)

LogisticRegression(max_iter=1000000, random_state=42)

In [78]:
preds = LR.predict(under_X_scaled)

In [79]:
print(classification_report(under_y_new, preds))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91      8769
           1       0.80      0.74      0.77      3711

    accuracy                           0.87     12480
   macro avg       0.85      0.83      0.84     12480
weighted avg       0.87      0.87      0.87     12480

