#### We are given a bank dataset with the following columns: age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome, deposit

#### `deposit` is the target; it is binary (yes / no)

#### We will use classification algorithms to build our model

###  Step1 - EDA
###  Step2 - Data Preprocessing
###  Step3 - Creating Model
###  Step4 - Checking Model Accuracy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport


In [2]:
import os

# Location of the raw bank CSV. Defaults to the original author's path; set the
# BANK_CSV environment variable to run the notebook on another machine without
# editing this cell.
DATA_PATH = os.environ.get(
    "BANK_CSV",
    r"C:\Users\Onkar\My_Python_Workspace\Machine_Learning_Class_Notes\bank.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [4]:
# Check for null values and get an overview of the column dtypes and row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [None]:
# Build a pandas-profiling report for `df` and render it inline as an iframe.
# NOTE(review): the `pandas_profiling` package has reportedly been renamed to
# `ydata-profiling` — confirm which one the target environment provides.
profile = ProfileReport(df)
profile.to_notebook_iframe()

In [5]:
#Creating a preprocessing function to transform our dataset
def preprocessing(df_fun):
    """Return a model-ready copy of the raw bank DataFrame.

    Drops the columns this analysis treats as irrelevant or redundant and
    encodes the ``deposit`` target as integers. The input frame is left
    untouched, so the calling cell can be re-run safely.

    Parameters
    ----------
    df_fun : pandas.DataFrame
        Raw data containing at least ``contact``, ``day``, ``month``,
        ``previous``, ``poutcome`` and ``deposit``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns, in which ``deposit`` is
        mapped ``'yes'`` -> 1 and ``'no'`` -> 0 (any other value becomes NaN).

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``deposit``, is missing.
    """
    # Rationale for the dropped columns (author's notes):
    #   contact            - irrelevant
    #   day, month         - considered unnecessary because `duration` is kept
    #   previous, poutcome - highly correlated with `pdays`, which is kept
    # NOTE(review): `duration` looks like a call length rather than a date, so
    # the day/month rationale should be double-checked.
    dropped_columns = ['contact', 'day', 'month', 'previous', 'poutcome']

    # drop() without inplace returns a new frame, so the caller's data is not mutated.
    processed = df_fun.drop(columns=dropped_columns)
    processed['deposit'] = processed['deposit'].map({'yes': 1, 'no': 0})
    return processed
    

In [6]:
# Preprocess a copy so the raw `df` loaded above is kept as-is for inspection and re-runs.
df_copy = df.copy()
processed_df = preprocessing(df_copy)

In [7]:
# Preview the preprocessed data: fewer columns, `deposit` encoded as a number.
processed_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,duration,campaign,pdays,deposit
0,59,admin.,married,secondary,no,2343,yes,no,1042,1,-1,1
1,56,admin.,married,secondary,no,45,no,no,1467,1,-1,1
2,41,technician,married,secondary,no,1270,yes,no,1389,1,-1,1
3,55,services,married,secondary,no,2476,yes,no,579,1,-1,1
4,54,admin.,married,tertiary,no,184,no,no,673,2,-1,1


# Modelling 
- Feature and Target values - X, y 
- One hot encode any categorical features
- Train, holdout split
- Train on a bunch of algos

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
# Target: the `deposit` column.
y = processed_df['deposit']
# Features: every column except the target, with categorical columns one-hot encoded.
X = pd.get_dummies(processed_df.drop(columns='deposit'))

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

### Setup ML Pipelines

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [12]:
# One pipeline (scaler + classifier) per candidate model. The dictionary keys are
# reused to look up each model's hyperparameter grid and to store the fitted model.
# NOTE(review): tree-based ensembles generally do not need feature scaling, so the
# StandardScaler step is probably unnecessary — confirm before removing it.
pipelines = {
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=1234)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1234)),
    # Decision-tree pipeline is currently disabled:
    # 'dt' : make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=1234))
}

In [13]:
# List the parameters of a default RandomForestClassifier (presumably to pick grid-search keys).
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
# Hyperparameter search space, keyed like `pipelines`. Parameter names use the
# `<pipeline step name>__<parameter>` form so they reach the classifier step.
grid = {
    'rf': {'randomforestclassifier__n_estimators': [100, 200, 300]},
    'gb': {'gradientboostingclassifier__n_estimators': [100, 200, 300]},
}

In [15]:
# Inspect the (name, pipeline) pairs that the training loop iterates over.
pipelines.items()

dict_items([('rf', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=1234))])), ('gb', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(random_state=1234))]))])

In [16]:
# Fitted grid-search objects, keyed by algorithm name.
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f'Training the {algo} model.')
    # 10-fold cross-validated search over this algorithm's grid, on 3 workers.
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=grid[algo],
        n_jobs=3,
        cv=10,
    )
    # fit() returns the search object itself, so it can be stored directly.
    fit_models[algo] = model.fit(X_train, y_train)

Training the rf model.
Training the gb model.


## Evaluate Performance on Test Data

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [18]:
# Score every tuned model on the held-out test set.
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    # The three metrics are independent of each other; all compare truth vs. prediction.
    recall = recall_score(y_test, yhat)
    precision = precision_score(y_test, yhat)
    accuracy = accuracy_score(y_test, yhat)
    print(f'Metrics for {algo}: accuracy- {accuracy}, recall- {recall}, precision- {precision}')

Metrics for rf: accuracy- 0.8016121809225257, recall- 0.8059424326833797, precision- 0.7876588021778584
Metrics for gb: accuracy- 0.8172861621137483, recall- 0.8152274837511606, precision- 0.8077276908923643


#### GradientBoostingClassifier gives better results on the test set (accuracy ≈ 0.817 vs ≈ 0.802 for RandomForestClassifier)


## Make Predictions

In [19]:
# Predict test-set labels with the tuned gradient-boosting search object
# (GridSearchCV.predict uses the refitted best estimator when refit is left at its default).
predictions = fit_models['gb'].predict(X_test)

In [20]:
# Display the predicted labels (1 = deposit 'yes', 0 = deposit 'no').
predictions

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)