<a href="https://colab.research.google.com/github/Chirag314/SVM-bank-data/blob/main/SVM_bank_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###This notebook is copied from exercises from book Ensemble Machine Learning Cookbook.

In [2]:
#import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import  roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import itertools
from sklearn import tree

In [3]:
# Read data from github. Use raw format and copy url# Note normal url and raw url will be different.
import pandas as pd
pd.options.display.max_rows=None
pd.options.display.max_columns=None
url = 'https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning-Cookbook/master/Chapter04/Support%20Vector%20Machines/bank.csv'
df_bankdata = pd.read_csv(url)
#df = pd.read_csv(url)
print(df_bankdata.head(5))

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


In [4]:
df_bankdata.shape

(4521, 17)

In [5]:
df_bankdata.dtypes
#df_backorder.describe()

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
# check missing values
df_bankdata.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

#####With the following command, we notice there are no missing values, so we can proceed with our next steps:

In [7]:
#We can check the class balance in our target variable as follows:
print("Total number of class labels: {}".format(df_bankdata.shape[0]))
print("Number of people opted for term deposit: {}".format(df_bankdata[df_bankdata.y=='yes'].shape[0]))
print("Number of people do not opted for term deposit: {}".format(df_bankdata[df_bankdata.y=='no'].shape[0]))

Total number of class labels: 4521
Number of people opted for term deposit: 521
Number of people do not opted for term deposit: 4000


In [8]:
#We can convert our target class to the binary values 1 and 0 with the following command:
df_bankdata['y']=(df_bankdata['y']=='yes').astype(int)

We can now perform one-hot encoding on our categorical variables. We only select variables that are categorical in nature. In the following code, we use category_column_names to provide the names of the non-numeric variables:

In [9]:
column_type=['object']
df_bankdata_category_columns=df_bankdata.select_dtypes(column_type)

category_column_names=df_bankdata_category_columns.columns.values.tolist()
category_column_names

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [10]:
#We run a loop over each of the non-numerical variables to perform one-hot encoding on them and add them back to the DataFrame. We will also delete the original non-numerical variables after performing one-hot encoding:

for each_col in category_column_names:
  dummy_var=pd.get_dummies(df_bankdata_category_columns[each_col],prefix=each_col)
  df_joindata=df_bankdata.join(dummy_var)
  df_joindata.drop([each_col],axis=1,inplace=True)
  df_bankdata=df_joindata

In [11]:
#We separate the predictor and response variables as follows:
X=df_bankdata.iloc[:,:-1]
Y=df_bankdata['y']

In [13]:
#We also split our data into training and testing datasets:
X_train, X_test, Y_train,Y_test=train_test_split(X, Y, test_size=0.2,random_state=1)


In [14]:
# We then build our first model using SVC with the default kernel, radial basis function (RBF):
svc_model=SVC(kernel='rbf')
svc_model.fit(X_train, Y_train)

SVC()

We check our training and testing accuracy via the SVC model built with the RBF kernel:

In [16]:
train_predictedvalues=svc_model.predict(X_train)
test_predictedvalues=svc_model.predict(X_test)

print("Train accuracy score :")
print(accuracy_score(Y_train, train_predictedvalues))

print("Test accuracy score :")
print(accuracy_score(Y_test,test_predictedvalues))

Train accuracy score :
0.8877212389380531
Test accuracy score :
0.8729281767955801


In [17]:
# We can rebuild our SVC model with a polynomial kernel as follows:
svc_model=SVC(kernel='poly')
svc_model.fit(X_train,Y_train)

train_predictedvalues=svc_model.predict(X_train)
test_predictedvaues=svc_model.predict(X_test)

print('Train Accuracy Score:')
print(accuracy_score(Y_train,train_predictedvalues))

print('Test Accuracy Score:')
print(accuracy_score(Y_test,test_predictedvalues))

Train Accuracy Score:
0.8877212389380531
Test Accuracy Score:
0.8729281767955801


In [18]:
#We can also build an SVC model with the linear kernel. Instead of kernel='ploy', we can replace this with kernel='linear' in the preceding code:
svc_model=SVC(kernel='linear')
svc_model.fit(X_train,Y_train)

train_predictedvalues=svc_model.predict(X_train)
test_predictedvaues=svc_model.predict(X_test)

print('Train Accuracy Score:')
print(accuracy_score(Y_train,train_predictedvalues))

print('Test Accuracy Score:')
print(accuracy_score(Y_test,test_predictedvalues))

Train Accuracy Score:
0.9972345132743363
Test Accuracy Score:
0.8729281767955801
