# Packages

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

from plotnine import *

# Match plot styling to the notebook theme (affects x/y axis labels and ticks).
from jupyterthemes import jtplot

jtplot.style(
    theme='monokai',
    context='notebook',
    ticks=True,
    grid=False,
)

ModuleNotFoundError: No module named 'joblib'

# Data Preparation

## Data Import

The dataset is provided by the UCI Machine Learning Repository and covers the direct-marketing campaigns of a bank. The target variable `y` indicates whether a customer subscribed to a deposit (1) or not (0).

In [None]:
# Load the raw direct-marketing data from the local data folder.
banking = pd.read_csv(filepath_or_buffer="./data/direct_marketing.csv")

## Data Exploration

In [None]:
# Summary statistics of the raw data as a first sanity check.
banking.describe()

In [None]:
# Peek at the first rows to see the available columns and their values.
banking.head(n=5)

Analyse the target variable:

In [None]:
# Bar chart with one bar per value of the target column `y`.
(
    ggplot(banking, aes(x='y'))
    + geom_bar()
    + labs(
        title="Target Variable Count",
        x="Target Variable",
        y="Count",
    )
)

We see that the data is highly imbalanced: far more customers did not subscribe (0) than did (1).

# Modeling

Select a subset of columns for convenience.

In [None]:
# Feature columns used for modeling, followed by the target column `y`.
cols_to_keep = [
    'age',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'emp_var_rate',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'nr_employed',
    'y',
]
banking_filt = banking.loc[:, cols_to_keep]

In [None]:
# Separate the target column from the feature matrix.
y = banking_filt["y"]
X = banking_filt.drop(columns=["y"])

We need to balance the classes to get better results. We can use SMOTE from the imbalanced-learn package; only the training split is oversampled, so the test set keeps the original class distribution.

In [9]:
# Split first, then oversample only the training portion so that the test
# set keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

# `fit_resample` replaces the `fit_sample` alias, which was deprecated and
# later removed from imbalanced-learn.
over_sampling = SMOTE(random_state=123)
os_data_X, os_data_y = over_sampling.fit_resample(X_train, y_train)

# Wrap the resampled output in DataFrames with explicit column names.
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Compute the class counts once and reuse them in the report below.
n_total = len(os_data_X)
n_class_0 = int((os_data_y['y'] == 0).sum())
n_class_1 = int((os_data_y['y'] == 1).sum())

print("Total length oversampled data: ", n_total)
print("Number of class 0 in oversampled data", n_class_0)
print("Number of class 1 in oversampled data", n_class_1)
print("Proportion of class 0 in oversampled data is ", n_class_0 / n_total)
print("Proportion of class 1 in oversampled data is ", n_class_1 / n_total)

NameError: name 'SMOTE' is not defined

In [25]:
# Standardise the features, then fit a logistic regression on them.
steps = [
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression())
]

pipeline = Pipeline(steps)

# NOTE(review): the model is trained on the original training split; the
# SMOTE-oversampled `os_data_X` / `os_data_y` are not used here -- confirm
# whether that is intended.
# Casting to float explicitly avoids the implicit int-to-float conversion
# warnings that the scaler emits for integer-typed columns.
clf = pipeline.fit(X_train.astype(float), y_train)

# Predict the class labels for the held-out test data.
y_pred = clf.predict(X_test.astype(float))


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


# Model Evaluation

## Baseline Classifier

In [40]:
# Accuracy of always predicting class 0: one minus the share of positives.
1 - y_test.sum() / y_test.shape[0]

0.88864611151574

## Our Classifier

Create the confusion matrix

In [28]:
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm

array([[10724,   257],
       [  840,   536]], dtype=int64)

In [30]:
# Per-class precision, recall and F1; printed because the report is a preformatted string.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     10981
           1       0.68      0.39      0.49      1376

   micro avg       0.91      0.91      0.91     12357
   macro avg       0.80      0.68      0.72     12357
weighted avg       0.90      0.91      0.90     12357



In [31]:
# Overall accuracy on the test set, to compare against the baseline above.
accuracy_score(y_test, y_pred)

0.9112244072185806

31880    0
38177    0
2459     0
756      0
11275    0
29677    0
13016    0
1518     0
34983    0
24965    0
26818    0
40946    0
10622    0
20440    0
31820    0
2968     0
10669    0
16107    0
2008     0
32114    0
40215    0
39170    0
39328    0
38224    0
10383    0
28740    0
33125    0
8550     0
37376    0
11585    0
        ..
33523    0
31921    0
24675    0
2496     0
7599     0
1871     1
18430    0
7877     0
37619    0
5072     0
2163     0
38804    0
6921     0
38984    0
27469    0
16921    0
35665    0
24152    0
18983    1
32230    1
17089    0
14650    0
39512    1
15430    0
14935    0
20757    0
32103    0
30403    1
21243    0
2732     0
Name: y, Length: 28831, dtype: int64