We will work through an implementation of a Support Vector Machine (SVM) classifier.
Data: predicting whether a customer will be granted a personal loan, based on the customer's demographic and financial attributes.

Note: when solving the continuous-value prediction (regression) problem, perform target-based encoding of the categorical attributes.

In [1]:
#### Import the necessary modules
## To read and manipulate the data/dataframe
import pandas as pd
import numpy as np

## For Data Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## For Modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, SVR

## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
## Load the bank customer data; '?' and '#' entries are read as missing values.
data = pd.read_csv(
    "UnivBank.csv",
    na_values=["?", "#"],
)
data.shape

(5000, 14)

In [3]:
## List the column names of the loaded frame.
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [4]:
## Separate the target column ('Personal Loan') from the predictor columns.
y = data['Personal Loan']
X = data.drop(columns=['Personal Loan'])

In [4]:
## 70/30 train/test split with a fixed random_state so the split is repeatable.
## Reuses the X / y built in the preceding cell instead of re-deriving the
## same feature/target selection from `data` a second time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [5]:
## Show the shapes of the four split pieces, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(3500, 13)
(3500,)
(1500, 13)
(1500,)


In [6]:
## Inspect the column dtypes of the training features before any conversion.
X_train.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage              float64
Securities Account    float64
CD Account            float64
Online                  int64
CreditCard              int64
dtype: object

In [7]:
#### Column groups used by the preprocessing cells
## Columns that are removed from the feature frames.
to_drop = ['ID', 'ZIP Code']
## Columns that are cast to the categorical dtype and later one-hot encoded.
cat = [
    'Family',
    'Education',
    'Securities Account',
    'CD Account',
    'Online',
    'CreditCard',
]

In [8]:
## Type conversion on train
## Build a new frame instead of assigning columns into X_train: X_train comes
## out of train_test_split, and column assignment on such a frame can trigger
## pandas' SettingWithCopyWarning. astype with a column -> dtype mapping
## converts only the listed columns and leaves the rest as they are.
X_train = X_train.astype(dict.fromkeys(cat, 'category'))

In [9]:
## Type conversion on test
## Build a new frame instead of assigning columns into X_test: X_test comes
## out of train_test_split, and column assignment on such a frame can trigger
## pandas' SettingWithCopyWarning. astype with a column -> dtype mapping
## converts only the listed columns and leaves the rest as they are.
X_test = X_test.astype(dict.fromkeys(cat, 'category'))

In [10]:
## Confirm that the columns listed in `cat` now have the category dtype.
X_train.dtypes

ID                       int64
Age                      int64
Experience               int64
Income                   int64
ZIP Code                 int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [11]:
### Dropping attributes
## Reassign rather than drop in place: the in-place form mutates frames that
## earlier cells already displayed, and re-running it fails with a KeyError
## because the columns are already gone. errors='ignore' keeps the cell safe
## to re-run.
X_train = X_train.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

In [12]:
## Check the remaining columns and their dtypes after the drop.
X_train.dtypes

Age                      int64
Experience               int64
Income                   int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [13]:
## Count the missing values per column in the training features.
X_train.isna().sum()

Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              1
Securities Account    2
CD Account            1
Online                0
CreditCard            0
dtype: int64

In [14]:
### Imputers
## Numeric columns are filled with the mean strategy, categorical columns
## with the most_frequent strategy.
si_cat = SimpleImputer(strategy="most_frequent")
si_num = SimpleImputer(strategy="mean")

In [15]:
## Split the training features: the columns listed in `cat` on one side,
## every other column on the other.
X_train_cat = X_train[cat]
X_train_num = X_train.drop(columns=cat)

### Same split on test
X_test_cat = X_test[cat]
X_test_num = X_test.drop(columns=cat)

In [16]:
## Inspect the dtypes of the numeric training columns.
X_train_num.dtypes

Age             int64
Experience      int64
Income          int64
CCAvg         float64
Mortgage      float64
dtype: object

In [17]:
## Inspect the dtypes of the categorical training columns.
X_train_cat.dtypes

Family                category
Education             category
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [18]:
## Fit the numeric imputer on the training numerics and fill their gaps.
## The result is wrapped back into a DataFrame with the original column
## names; no index is passed to the constructor.
X_train_num = pd.DataFrame(
    data=si_num.fit_transform(X_train_num),
    columns=X_train_num.columns,
)


In [19]:
## Verify that no missing values remain in the numeric training columns.
X_train_num.isna().sum()

Age           0
Experience    0
Income        0
CCAvg         0
Mortgage      0
dtype: int64

In [20]:
## Fit the categorical imputer on the training categoricals and fill their
## gaps; the result is wrapped back into a DataFrame with the original
## column names.
X_train_cat = pd.DataFrame(
    data=si_cat.fit_transform(X_train_cat),
    columns=X_train_cat.columns,
)

In [21]:
## Verify that no missing values remain in the categorical training columns.
X_train_cat.isna().sum()

Family                0
Education             0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [22]:
## Fill the test numerics with the already-fitted numeric imputer
## (transform only, no refit), then confirm nothing is left missing.
X_test_num = pd.DataFrame(
    data=si_num.transform(X_test_num),
    columns=X_test_num.columns,
)
X_test_num.isna().sum()

Age           0
Experience    0
Income        0
CCAvg         0
Mortgage      0
dtype: int64

In [23]:
## Fill the test categoricals with the already-fitted categorical imputer
## (transform only, no refit).
X_test_cat = pd.DataFrame(
    data=si_cat.transform(X_test_cat),
    columns=X_test_cat.columns,
)

In [24]:
### Standardization of the numeric data
## The scaler is fitted on the training numerics only and then applied,
## without refitting, to the test numerics.
std = StandardScaler()
X_train_num = pd.DataFrame(
    data=std.fit_transform(X_train_num),
    columns=X_train_num.columns,
)
X_test_num = pd.DataFrame(
    data=std.transform(X_test_num),
    columns=X_test_num.columns,
)

In [25]:
## Preview the first 10 rows of the scaled numeric training columns.
X_train_num.head(10)

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage
0,0.066285,0.087957,-1.320429,-0.706115,-0.551912
1,-0.463392,-0.530394,0.396236,-0.306566,-0.551912
2,1.390479,1.50133,-0.71199,-0.649036,-0.551912
3,-1.346188,-1.23708,-0.690261,0.092982,-0.551912
4,-1.964145,-1.855431,-0.016633,0.378374,-0.551912
5,1.390479,1.50133,-0.299122,0.035904,-0.551912
6,-1.434467,-1.502088,0.374506,0.835001,-0.551912
7,0.507683,0.441301,2.15636,-0.820271,-0.551912
8,1.12564,1.236323,-0.299122,-0.192409,-0.551912
9,-1.169629,-1.060408,-0.299122,-0.420723,-0.551912


In [26]:
## One-hot encoding of categorical data
## handle_unknown="ignore" asks the encoder not to raise on categories at
## transform time that were not seen during fit.
ohe = OneHotEncoder(handle_unknown="ignore")

In [27]:
## Fit the encoder on the training categoricals and expand them into
## indicator columns named by the encoder. `.toarray()` yields a plain
## ndarray; `.todense()` would yield the legacy np.matrix type, which NumPy
## no longer recommends.
X_train_cat = pd.DataFrame(
    ohe.fit_transform(X_train_cat).toarray(),
    columns=ohe.get_feature_names_out(),
)

In [28]:
## Preview the first 10 rows of the one-hot encoded training columns.
X_train_cat.head(10)

Unnamed: 0,Family_1.0,Family_2.0,Family_3.0,Family_4.0,Education_1.0,Education_2.0,Education_3.0,Securities Account_0.0,Securities Account_1.0,CD Account_0.0,CD Account_1.0,Online_0.0,Online_1.0,CreditCard_0.0,CreditCard_1.0
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
8,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [29]:
## Encode the test categoricals with the already-fitted encoder (transform
## only, no refit). `.toarray()` yields a plain ndarray; `.todense()` would
## yield the legacy np.matrix type, which NumPy no longer recommends.
X_test_cat = pd.DataFrame(
    ohe.transform(X_test_cat).toarray(),
    columns=ohe.get_feature_names_out(),
)

In [30]:
######### Combining Numeric and Categorical Data
## Place the numeric and the one-hot encoded training columns side by side.
Train = pd.concat(
    [X_train_num, X_train_cat],
    axis="columns",
)

In [31]:
## Check the row/column count of the combined training matrix.
Train.shape

(3500, 20)

In [32]:
## Preview the first 10 rows of the combined training matrix.
Train.head(10)

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage,Family_1.0,Family_2.0,Family_3.0,Family_4.0,Education_1.0,Education_2.0,Education_3.0,Securities Account_0.0,Securities Account_1.0,CD Account_0.0,CD Account_1.0,Online_0.0,Online_1.0,CreditCard_0.0,CreditCard_1.0
0,0.066285,0.087957,-1.320429,-0.706115,-0.551912,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,-0.463392,-0.530394,0.396236,-0.306566,-0.551912,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.390479,1.50133,-0.71199,-0.649036,-0.551912,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-1.346188,-1.23708,-0.690261,0.092982,-0.551912,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,-1.964145,-1.855431,-0.016633,0.378374,-0.551912,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
5,1.390479,1.50133,-0.299122,0.035904,-0.551912,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
6,-1.434467,-1.502088,0.374506,0.835001,-0.551912,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,0.507683,0.441301,2.15636,-0.820271,-0.551912,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
8,1.12564,1.236323,-0.299122,-0.192409,-0.551912,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
9,-1.169629,-1.060408,-0.299122,-0.420723,-0.551912,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [33]:
## Place the numeric and the one-hot encoded test columns side by side.
Test = pd.concat(
    [X_test_num, X_test_cat],
    axis="columns",
)

In [34]:
## Check the row/column count of the combined test matrix.
Test.shape

(1500, 20)

In [35]:
### Model
## Cast both target vectors to the categorical dtype before fitting.
y_train, y_test = y_train.astype('category'), y_test.astype('category')

In [51]:
## RBF-kernel support vector classifier with C=2, fitted on the combined
## training matrix.
mod = SVC(C=2, kernel="rbf")
mod.fit(Train, y_train)

SVC(C=2)

In [52]:
## Predict on the training matrix (compared against y_train below).
preds_train=mod.predict(Train)

In [53]:
## Predict on the held-out test matrix (compared against y_test below).
preds_test=mod.predict(Test)

In [54]:
## Confusion matrix on the training data: true labels vs. predictions.
confusion_matrix(y_true=y_train, y_pred=preds_train)

array([[3163,    3],
       [  35,  299]])

In [55]:
## Confusion matrix on the test data: true labels vs. predictions.
confusion_matrix(y_true=y_test, y_pred=preds_test)

array([[1351,    3],
       [  22,  124]])

In [56]:
## Per-class precision / recall / F1 on the test data.
print(classification_report(y_true=y_test, y_pred=preds_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1354
           1       0.98      0.85      0.91       146

    accuracy                           0.98      1500
   macro avg       0.98      0.92      0.95      1500
weighted avg       0.98      0.98      0.98      1500

