# Comparing Classification Algorithms on califonia housing Dataset for different prediction parameters

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import xgboost as xgb

In [2]:
import warnings #suppressing future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
data = pd.read_csv('/content/bank-additional-full.csv', sep= ';')
print("Shape of data : {}".format(data.shape))
print("Name of columns : {}".format(list(data.columns)))
data.head()

Shape of data : (41188, 21)
Name of columns : ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


***Data Preprocessing***

In [4]:
#converting the target column to binary 
data['y'] = data.y.apply(lambda x:0 if x=='no' else 1)
display(data.head())
data.info()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [5]:
#grouping basic- 9y,6y,4y into a single category 
data.education.replace(['basic.9y', 'basic.6y', 'basic.4y'], 'basic', inplace=True)
data['education'].unique()

array(['basic', 'high.school', 'professional.course', 'unknown',
       'university.degree', 'illiterate'], dtype=object)

In [6]:
#creating dummies
df = pd.get_dummies(data) 
display(df.head())

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 62 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41188 non-null  int64  
 1   duration                       41188 non-null  int64  
 2   campaign                       41188 non-null  int64  
 3   pdays                          41188 non-null  int64  
 4   previous                       41188 non-null  int64  
 5   emp.var.rate                   41188 non-null  float64
 6   cons.price.idx                 41188 non-null  float64
 7   cons.conf.idx                  41188 non-null  float64
 8   euribor3m                      41188 non-null  float64
 9   nr.employed                    41188 non-null  float64
 10  y                              41188 non-null  int64  
 11  job_admin.                     41188 non-null  uint8  
 12  job_blue-collar                41188 non-null 

In [8]:
# We observe that the categorical-columns consist of unknown data which is redundant
# so droping them before modeling
df.drop(list(df.filter(regex = '_unknown')), axis = 1, inplace = True)

***Data Scaling and Spliting*** 

In [9]:
#sliplting data and target
X = df.drop('y',axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state = 42,stratify= y)

#scaling the data using Standard Scalar
stdscaler = StandardScaler()
X_train_scal = stdscaler.fit_transform(X_train)
X_test_scal = stdscaler.fit_transform(X_test)

* **LOGISITIC REGRESSION** 

In [10]:
logReg = LogisticRegression(max_iter = 200,random_state= 42)
logReg.fit(X_train_scal,y_train)
y_pred_logReg = logReg.predict(X_test_scal)
#confusion matrix
confusion_mat_logReg= confusion_matrix(y_test, y_pred_logReg)
confusion_logReg = pd.DataFrame(confusion_mat_logReg, index=['Deposit Refused','Deposit Accepted'], columns=['Predicted Refusal','Predicted Acceptance'])
#classifcaiton report
cls_logReg =classification_report(y_test, y_pred_logReg,output_dict=True)

confusion_logReg

Unnamed: 0,Predicted Refusal,Predicted Acceptance
Deposit Refused,10677,288
Deposit Accepted,802,590


* **RANDOM FOREST** 

In [11]:
randF= RandomForestClassifier(random_state=42, n_estimators= 50) 
randF.fit(X_train_scal,y_train)
y_pred_randF =randF.predict(X_test_scal)
#confusion matrix
confusion_mat_randF= confusion_matrix(y_test, y_pred_randF)
confusion_randF = pd.DataFrame(confusion_mat_randF, index=['Deposit Refused','Deposit Accepted'], columns=['Predicted Refusal','Predicted Acceptance'])
#classifcaiton report
cls_randF =classification_report(y_test, y_pred_randF,output_dict=True)

confusion_randF

Unnamed: 0,Predicted Refusal,Predicted Acceptance
Deposit Refused,10643,322
Deposit Accepted,750,642


* **DECISION TREES**

In [12]:
decTree= DecisionTreeClassifier(random_state= 42)
decTree.fit(X_train_scal,y_train)
y_pred_decTree =decTree.predict(X_test_scal)
#confusion matrix
confusion_mat_decTree= confusion_matrix(y_test, y_pred_decTree)
confusion_decTree = pd.DataFrame(confusion_mat_decTree, index=['Deposit Refused','Deposit Accepted'], columns=['Predicted Refusal','Predicted Acceptance'])
#classifcaiton report
cls_decTree =classification_report(y_test, y_pred_decTree,output_dict=True)

confusion_decTree

Unnamed: 0,Predicted Refusal,Predicted Acceptance
Deposit Refused,10257,708
Deposit Accepted,655,737


* **XGBOOST**

In [13]:
xgb= xgb.XGBClassifier(max_depth = 10)
xgb.fit(X_train_scal,y_train)
y_pred_xgb =xgb.predict(X_test_scal)

#confusion matrix
confusion_mat_xgb= confusion_matrix(y_test, y_pred_xgb)
confusion_xgb = pd.DataFrame(confusion_mat_xgb, index=['Deposit Refused','Deposit Accepted'], columns=['Predicted Refusal','Predicted Acceptance'])
#classifcaiton report
cls_xgb =classification_report(y_test, y_pred_xgb,output_dict=True)

confusion_xgb

Unnamed: 0,Predicted Refusal,Predicted Acceptance
Deposit Refused,10548,417
Deposit Accepted,638,754


In [14]:
#function for creating a table with all metrics for easy comparison
def compare_metrics():
    metric_table = pd.DataFrame({'Logistic Regression':[cls_logReg['accuracy'],
                                                        cls_logReg['macro avg']['precision'],
                                                        cls_logReg['macro avg']['recall'],
                                                        cls_logReg['macro avg']['f1-score'],
                                                        cls_logReg['macro avg']['support']],
                                                                              
                                      'Random Forest':[cls_randF['accuracy'],
                                                      cls_randF['macro avg']['precision'],
                                                      cls_randF['macro avg']['recall'],
                                                      cls_randF['macro avg']['f1-score'],
                                                      cls_randF['macro avg']['support']],
                                 
                                      'Decision Tree':[cls_decTree['accuracy'],
                                                      cls_decTree['macro avg']['precision'],
                                                      cls_decTree['macro avg']['recall'],
                                                      cls_decTree['macro avg']['f1-score'],
                                                      cls_decTree['macro avg']['support']],
                                       
                                      'XGBoost':[cls_xgb['accuracy'],
                                                cls_xgb['macro avg']['precision'],
                                                cls_xgb['macro avg']['recall'],
                                                cls_xgb['macro avg']['f1-score'],
                                                cls_xgb['macro avg']['support']]},
                                      
                                      index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Support'])
    
    metric_table['Best Algorithm'] = metric_table.idxmax(axis=1)
    
    return(metric_table)

compare_metrics()

Unnamed: 0,Logistic Regression,Random Forest,Decision Tree,XGBoost,Best Algorithm
Accuracy,0.911791,0.913248,0.889698,0.914623,XGBoost
Precision,0.801058,0.800073,0.725004,0.793429,Logistic Regression
Recall,0.698793,0.71592,0.732442,0.751818,XGBoost
F1 Score,0.735629,0.748522,0.72863,0.770373,XGBoost
Support,12357.0,12357.0,12357.0,12357.0,Logistic Regression



## **Comparing the Algorithms**

***Comparison Table for metrics***
<table style="width: 612.4px; height: 166px;" border="1">
<tbody>
<tr style="height: 40px;">
<td style="width: 94px; height: 40px;">&nbsp;</td>
<td style="width: 103px; height: 40px;"><strong>Logistic Regression</strong></td>
<td style="width: 104px; height: 40px;"><strong>Random Forest</strong></td>
<td style="width: 82px; height: 40px;"><strong>Decision Tree</strong></td>
<td style="width: 84px; height: 40px;"><strong>XGBoost</strong></td>
<td style="width: 143.4px; height: 40px;"><strong>Best Algorithm&nbsp;</strong></td>
</tr>
<tr style="height: 20px;">
<td style="width: 94px; height: 20px;"><strong>Accuracy</strong></td>
<td style="width: 103px; height: 20px;">&nbsp;0.911791</td>
<td style="width: 104px; height: 20px;">0.913248</td>
<td style="width: 82px; height: 20px;">0.889698</td>
<td style="width: 84px; height: 20px;">0.914623</td>
<td style="width: 143.4px; height: 20px;">&nbsp;XGBoost</td>
</tr>
<tr style="height: 20px;">
<td style="width: 94px; height: 20px;"><strong>Precision</strong></td>
<td style="width: 103px; height: 20px;">0.801058</td>
<td style="width: 104px; height: 20px;">0.800073</td>
<td style="width: 82px; height: 20px;">0.725004</td>
<td style="width: 84px; height: 20px;">0.793429</td>
<td style="width: 143.4px; height: 20px;">Logistic Regression</td>
</tr>
<tr style="height: 20px;">
<td style="width: 94px; height: 20px;"><strong>Recall</strong></td>
<td style="width: 103px; height: 20px;">&nbsp;0.698793</td>
<td style="width: 104px; height: 20px;">&nbsp;0.715920</td>
<td style="width: 82px; height: 20px;">&nbsp;0.732442</td>
<td style="width: 84px; height: 20px;">0.751818</td>
<td style="width: 143.4px; height: 20px;">&nbsp;XGBoost</td>
</tr>
<tr style="height: 20.2px;">
<td style="width: 94px; height: 20.2px;"><strong>F1 Score</strong></td>
<td style="width: 103px; height: 20.2px;">&nbsp;0.735629</td>
<td style="width: 104px; height: 20.2px;">&nbsp;0.748522</td>
<td style="width: 82px; height: 20.2px;">&nbsp;0.728630</td>
<td style="width: 84px; height: 20.2px;">0.770373</td>
<td style="width: 143.4px; height: 20.2px;">&nbsp;XGBoost</td>
</tr>
<tr style="height: 20px;">
<td style="width: 94px; height: 20px;"><strong>Support</strong></td>
<td style="width: 103px; height: 20px;">&nbsp;12357.000000</td>
<td style="width: 104px; height: 20px;">&nbsp;12357.000000</td>
<td style="width: 82px; height: 20px;">&nbsp;12357.000000</td>
<td style="width: 84px; height: 20px;">12357.000000</td>
<td style="width: 143.4px; height: 20px;">&nbsp;Logistic Regression</td>
</tr>
</tbody>
</table>
<BR><Br>   
    
***Comparison Table for Confusion Matrix***
    
<p>&nbsp;</p>
<table style="height: 423px; width: 550px; margin-left: auto; margin-right: auto;" border="1">
<tbody>
<tr style="height: 187px;">
<td style="width: 272.889px; height: 187px;"><strong>&nbsp; &nbsp; &nbsp; &nbsp; LOGISITIC REGRESSION</strong>
<table class="dataframe" style="width: 238px;" border="1">
<thead>
<tr>
<th style="width: 74px;">&nbsp;</th>
<th style="width: 78px;">Predicted Refusal</th>
<th style="width: 84px;">Predicted Acceptance</th>
</tr>
</thead>
<tbody>
<tr>
<th style="width: 74px;">Deposit Refused</th>
<td style="width: 78px;">10677</td>
<td style="width: 84px;">288</td>
</tr>
<tr>
<th style="width: 74px;">Deposit Accepted</th>
<td style="width: 78px;">802</td>
<td style="width: 84px;">590</td>
</tr>
</tbody>
</table>
</td>
<td style="width: 275.556px; height: 187px;"><strong>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;RANDOM FOREST</strong><br />
<table class="dataframe" style="height: 117px; width: 251px;" border="1">
<thead>
<tr>
<th style="width: 75px;">&nbsp;</th>
<th style="width: 79.2222px;">Predicted Refusal</th>
<th style="width: 94.7778px;">Predicted Acceptance</th>
</tr>
</thead>
<tbody>
<tr>
<th style="width: 75px;">Deposit Refused</th>
<td style="width: 79.2222px;">10643</td>
<td style="width: 94.7778px;">322</td>
</tr>
<tr>
<th style="width: 75px;">Deposit Accepted</th>
<td style="width: 79.2222px;">750</td>
<td style="width: 94.7778px;">642</td>
</tr>
</tbody>
</table>
</td>
</tr>
<tr style="height: 187px;">
<td style="width: 272.889px; height: 187px;"><strong><strong>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; DECISION TREES</strong></strong>
<table class="dataframe" style="width: 240.444px;" border="1">
<thead>
<tr>
<th style="width: 71px;">&nbsp;</th>
<th style="width: 76px;">Predicted Refusal</th>
<th style="width: 91.4444px;">Predicted Acceptance</th>
</tr>
</thead>
<tbody>
<tr>
<th style="width: 71px;">Deposit Refused</th>
<td style="width: 76px;">10257</td>
<td style="width: 91.4444px;">708</td>
</tr>
<tr>
<th style="width: 71px;">Deposit Accepted</th>
<td style="width: 76px;">655</td>
<td style="width: 91.4444px;">737</td>
</tr>
</tbody>
</table>
</td>
<td style="width: 275.556px; height: 187px;"><strong>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;XGBOOST</strong><br />
<table class="dataframe" style="height: 117px; width: 251px;" border="1">
<thead>
<tr>
<th style="width: 75px;">&nbsp;</th>
<th style="width: 79.2222px;">Predicted Refusal</th>
<th style="width: 94.7778px;">Predicted Acceptance</th>
</tr>
</thead>
<tbody>
<tr>
<th style="width: 75px;">Deposit Refused</th>
<td style="width: 79.2222px;">10548</td>
<td style="width: 94.7778px;">417</td>
</tr>
<tr>
<th style="width: 75px;">Deposit Accepted</th>
<td style="width: 79.2222px;">638</td>
<td style="width: 94.7778px;">754</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
    
<br><br>
*From the above observation of metrics and confusion matrix we can conclude that **XGBoost Algorithm** has performed better than other 3 algorithms*
<br><br> 