<a href="https://colab.research.google.com/github/abdoulayegk/Machine-learning/blob/master/bank_term_deposit_marketing_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
from google.colab import files
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Prompt for a file upload through google.colab; the result is indexed by
# file name in the next cell to obtain the raw CSV bytes.
uploaded = files.upload()

In [None]:
# Wrap the uploaded bytes in an in-memory buffer so pandas can parse them as a file.
csv_buffer = io.BytesIO(uploaded['bank_term_deposit_marketing_analysis.csv'])
df = pd.read_csv(csv_buffer)

# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics for every column (one row per column), shaded with a
# colour gradient to make large/small values easier to spot.
summary = df.describe(include='all').transpose()
summary.style.background_gradient()

In [None]:
# Class balance of the target column.
# The column is passed by keyword: recent seaborn releases bind the first
# positional argument to `data`, not `x`, so `sns.countplot(df.LOAN)` no longer
# draws the per-class counts.
sns.countplot(x='LOAN', data=df)

In [None]:
# Pairwise correlation of the numeric columns only.
# `df` still contains string columns at this point (e.g. LOAN, MARITAL, JOB),
# and pandas >= 2.0 raises on them instead of silently skipping them, so the
# numeric columns are selected explicitly.
corr = df.select_dtypes(include='number').corr()
corr.style.background_gradient()

In [None]:
# Loan counts split by marital status, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='MARITAL', hue='LOAN', ax=ax)
ax.set_title("MARITAL with LOAN")
ax.set_xlabel("MARITAL")
ax.set_ylabel("Count")

In [None]:
# Loan counts split by job category.
plt.figure(figsize=(10,7))
sns.countplot(x='JOB', hue='LOAN', data=df)
# Rotate the tick labels seaborn derived from the data instead of overwriting
# them with a hard-coded list: a hand-written list silently mislabels the bars
# as soon as the category order (or the set of categories) in `df` changes.
plt.xticks(rotation=50)
plt.show()

In [None]:
# Distinct values found in the CONTACT column.
df.CONTACT.unique()

In [None]:
# Columns excluded from the rest of the analysis.
columns_to_drop = [
    'COUNT',
    'PDAYS',
    'PREVIOUS',
    'ID',
    'POUTCOME',
    'CONTACT',
]

# Working frame from here on (rebinds `df` without the columns above).
df = df.drop(columns=columns_to_drop)
df.head()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# String columns (e.g. LOAN, MARITAL, JOB, EDUCATION) are still present here
# and make `DataFrame.corr` raise on pandas >= 2.0, so they are filtered out
# before computing the matrix.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10,6))
sns.heatmap(numeric_corr, annot=True)
plt.title("Correlation matrix between columns")
plt.show()

There are many ways to transform categorical data into numbers. One simple approach:
1. Create a dictionary that maps each class label to a number.
2. Apply it to the column with the pandas **map** method.

In [None]:
# List the distinct labels of the column; these become the keys of the
# mapping dictionary.
df.EDUCATION.unique()

In [None]:
# Label -> integer code for EDUCATION (keys taken from the unique values
# listed by the previous cell).
dic_education = {
    'unknown': 0,
    'primary': 1,
    'secondary': 2,
    'tertiary': 3,
}

# Replace the text labels with their codes; the same pattern works for any
# other non-numeric column.
education_codes = df['EDUCATION'].map(dic_education)
df['EDUCATION'] = education_codes

In [None]:
# Check which codes the column holds now that the mapping has been applied.
df['EDUCATION'].unique()

In [None]:
# Preview the frame after EDUCATION has been replaced by its integer codes.
df.head()

In [None]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
loan_codes = {'no': 0, 'yes': 1}
df['LOAN'] = df.LOAN.map(loan_codes)

In [None]:
import numpy as np
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import (StratifiedGroupKFold, GroupKFold,
                                     StratifiedKFold, RepeatedStratifiedKFold, KFold)
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [None]:
# Separate the target (LOAN) from the predictor columns.
y = df['LOAN']
X = df.drop(columns='LOAN')

In [None]:
# One-hot encode the remaining categorical predictors, then preview the
# widened feature matrix.
X = pd.get_dummies(data=X)
X.head(5)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the target classes by oversampling with SMOTE.
# The resampling is done once (the original cell repeated it on the already
# resampled data) and is seeded so the synthetic rows are reproducible.
# NOTE(review): resampling the full data set before the cross-validation cells
# below means each fold's training data can contain synthetic rows derived
# from that fold's test rows, which likely inflates the reported scores -
# consider resampling inside each training fold instead.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)

# Confirm the classes are balanced now (column passed by keyword, as recent
# seaborn releases do not accept it positionally as `x`).
plt.title("Balanced classes")
sns.countplot(x=y)
plt.show()

Using KFold with LightGBM

In [None]:
# 5-fold shuffled cross-validation of a default LightGBM classifier;
# prints the accuracy of every fold and the mean over all folds.
scores = []

fold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in fold.split(X, y):
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
  m1 = LGBMClassifier()
  # Early stopping and per-iteration logging are requested through callbacks:
  # the `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
  # removed in LightGBM 4.
  # NOTE(review): the held-out fold is also the early-stopping validation set,
  # so the score below is not measured on data unseen during model selection.
  m1.fit(X_train, y_train,
         eval_set=[(X_train, y_train), (X_test, y_test)],
         eval_metric='auc',
         callbacks=[early_stopping(stopping_rounds=500),
                    log_evaluation(period=1)])
  pred = m1.predict(X_test)
  fold_accuracy = accuracy_score(y_test, pred)
  print("score", fold_accuracy)
  scores.append(fold_accuracy)
print("The score for Kfold is: ", np.mean(scores))


KFold with XGBoost

In [None]:
# 5-fold shuffled cross-validation of an XGBoost classifier;
# prints the accuracy of every fold and the mean over all folds.
scores = []

fold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in fold.split(X, y):
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
  # `eval_metric` and `early_stopping_rounds` are estimator parameters in
  # current XGBoost; passing them to fit() is deprecated/removed there.
  # NOTE(review): the held-out fold is also the early-stopping validation set,
  # so the score below is not measured on data unseen during model selection.
  m2 = XGBClassifier(eval_metric='auc', early_stopping_rounds=500)
  m2.fit(X_train, y_train,
         eval_set=[(X_train, y_train), (X_test, y_test)],
         verbose=1)
  pred = m2.predict(X_test)
  fold_accuracy = accuracy_score(y_test, pred)
  print("score", fold_accuracy)
  scores.append(fold_accuracy)
print("The score for Kfold is: ", np.mean(scores))


Using StratifiedKFold

In [None]:
# 5-fold stratified, shuffled cross-validation of a default LightGBM
# classifier; prints the accuracy of every fold and the mean over all folds.
scores = []

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in fold.split(X, y):
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
  m3 = LGBMClassifier()
  # Early stopping and per-iteration logging are requested through callbacks:
  # the `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
  # removed in LightGBM 4.
  # NOTE(review): the held-out fold is also the early-stopping validation set,
  # so the score below is not measured on data unseen during model selection.
  m3.fit(X_train, y_train,
         eval_set=[(X_train, y_train), (X_test, y_test)],
         eval_metric='auc',
         callbacks=[early_stopping(stopping_rounds=500),
                    log_evaluation(period=1)])
  pred = m3.predict(X_test)
  fold_accuracy = accuracy_score(y_test, pred)
  print("score", fold_accuracy)
  scores.append(fold_accuracy)
print("The score for Kfold is: ", np.mean(scores))


In [None]:
# 5-fold shuffled cross-validation of a random forest;
# prints the accuracy of every fold and the mean over all folds.
scores = []

fold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in fold.split(X, y):
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
  # Seed the forest (same seed as the splitter) so reruns give the same scores.
  m4 = RandomForestClassifier(random_state=42)
  m4.fit(X_train, y_train)
  pred = m4.predict(X_test)
  fold_accuracy = accuracy_score(y_test, pred)
  print("score", fold_accuracy)
  scores.append(fold_accuracy)
print("The score for Kfold is: ", np.mean(scores))


In [None]:
# Repeated stratified cross-validation (5 folds x 10 repeats) of a default
# LightGBM classifier, tracking accuracy and F1 for every fold.
scores = []
f1_scores = []

fold = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

for train_idx, test_idx in fold.split(X, y):
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
  m1 = LGBMClassifier()
  # Early stopping and per-iteration logging are requested through callbacks:
  # the `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
  # removed in LightGBM 4.
  # NOTE(review): the held-out fold is also the early-stopping validation set,
  # so the scores below are not measured on data unseen during model selection.
  m1.fit(X_train, y_train,
         eval_set=[(X_train, y_train), (X_test, y_test)],
         eval_metric='auc',
         callbacks=[early_stopping(stopping_rounds=500),
                    log_evaluation(period=1)])
  pred = m1.predict(X_test)
  fold_accuracy = accuracy_score(y_test, pred)
  print("score", fold_accuracy)
  scores.append(fold_accuracy)
  # The original never filled this list, so the mean printed below was nan.
  f1_scores.append(f1_score(y_test, pred))
print("The score for Kfold is: ", np.mean(scores))
print("F1-score of Repeated Stratified KFold is: ", np.mean(f1_scores))


In [None]:
!pip -q install catboost

In [None]:
# Stratified 10-fold cross-validation of a default CatBoost classifier,
# tracking accuracy and F1 for every fold.
from catboost import CatBoostClassifier

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores, f1_scores = [], []

for train_idx, test_idx in fold.split(X, y):
  # Slice the rows belonging to this fold.
  X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
  X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

  m1 = CatBoostClassifier()
  m1.fit(X_train, y_train, verbose=1)
  pred = m1.predict(X_test)

  fold_accuracy = accuracy_score(y_test, pred)
  print("Accuracy score", fold_accuracy)
  scores.append(fold_accuracy)
  f1_scores.append(f1_score(y_test, pred))

print("The mean Accuracy score for Kfold is: ", np.mean(scores))
print("F1-score of Stratified KFold is: ", np.mean(f1_scores))

In [None]:
from sklearn.metrics import classification_report

# `y_test` and `pred` are whatever the most recently executed cross-validation
# loop left behind, so this report describes only that loop's final fold.
last_fold_report = classification_report(y_test, pred)
print(last_fold_report)