# Meghanjali Chennupati (U30308400)
# Conducting SVM on riding mover.csv data.

# STEP -01: Import all the standard libraries

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)

# step -02 Load the data

In [2]:
riding_mover=pd.read_csv(r"C:/Users/Meghanjali/Desktop/Data science programming/WE03 ASGT/RidingMowers.csv")

In [3]:
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner
5,110.1,19.2,Owner
6,108.0,17.6,Owner
7,82.8,22.4,Owner
8,69.0,20.0,Owner
9,93.0,20.8,Owner


# step-03: Conduct some intial exploration on the data

In [4]:
# look at the data
riding_mover.head(3) # note that we don't want to dump all the data to the screen

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [5]:
# generate a basic summary of the data
riding_mover.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [6]:
# generate a statistical summary of the numeric value in the data
riding_mover.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [7]:
# Check the missing values by summing the total na's for each variable
riding_mover.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

# Summary from intial exploration from data:

1. There are no missing values
2. There is one target variable which is categorical.
3. We need to encode using either dummmy variable or one hot encoding


In [8]:
riding_mover['Ownership'].unique

<bound method Series.unique of 0        Owner
1        Owner
2        Owner
3        Owner
4        Owner
5        Owner
6        Owner
7        Owner
8        Owner
9        Owner
10       Owner
11       Owner
12    Nonowner
13    Nonowner
14    Nonowner
15    Nonowner
16    Nonowner
17    Nonowner
18    Nonowner
19    Nonowner
20    Nonowner
21    Nonowner
22    Nonowner
23    Nonowner
Name: Ownership, dtype: object>

In [9]:
# Encoding the categorical variable using one hot encoding

In [9]:
dummies_df = pd.get_dummies(riding_mover['Ownership'], prefix='Ownership', drop_first=True)
riding_mover = riding_mover.join(dummies_df)
riding_mover.drop('Ownership', axis=1, inplace = True)

In [10]:
riding_mover.head(4)

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1


In [11]:
riding_mover['Ownership_Owner'].unique

<bound method Series.unique of 0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
Name: Ownership_Owner, dtype: uint8>

In [12]:
riding_mover.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Income           24 non-null     float64
 1   Lot_Size         24 non-null     float64
 2   Ownership_Owner  24 non-null     uint8  
dtypes: float64(2), uint8(1)
memory usage: 536.0 bytes


# step -04  Splitting the data in to training and testing 

In [13]:
X = riding_mover.loc[:,['Income','Lot_Size']]
y = riding_mover.loc[:,['Ownership_Owner']]

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3)

In [16]:
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)

# Creating a data frame with metrics as column to store the each model metric

In [15]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

# 4.0  Fit a SVM classification model using linear kernal

In [16]:
svm_lin_model = SVC(kernel="linear",probability=True)
svm_linsvc = svm_lin_model.fit(X_train, np.ravel(y_train))

In [17]:
model_preds_svclin = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds_svclin)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [19]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0


In [18]:
riding_mover["predicted"]=svm_linsvc.predict(X)
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted
0,60.0,18.4,1,0
1,85.5,16.8,1,1
2,64.8,21.6,1,1
3,61.5,20.8,1,1
4,87.0,23.6,1,1
5,110.1,19.2,1,1
6,108.0,17.6,1,1
7,82.8,22.4,1,1
8,69.0,20.0,1,1
9,93.0,20.8,1,1


In [20]:
riding_mover['pred_prob'] = svm_linsvc.predict_proba(X)[:,1]
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


# Fit a SVM classification model using rbf kernel

In [21]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
svm_rbfsvc = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [22]:
model_preds_svcrbf = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds_svcrbf)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [23]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667


In [24]:
riding_mover["predicted"]=svm_rbfsvc.predict(X)
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


In [25]:
riding_mover['pred_prob'] = svm_rbfsvc.predict_proba(X)[:,1]
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


# Fit a SVM classification model using polynomial kernal

In [26]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1.0, C=10,probability=True)
svm_polysvc= svm_poly_model.fit(X_train, np.ravel(y_train))

In [27]:
model_preds_svcpoly = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds_svcpoly)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [28]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8


In [32]:
riding_mover["predicted"]=svm_polysvc.predict(X)
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.558405
1,85.5,16.8,1,1,0.56233
2,64.8,21.6,1,1,0.565553
3,61.5,20.8,1,1,0.562769
4,87.0,23.6,1,1,0.578904
5,110.1,19.2,1,1,0.570517
6,108.0,17.6,1,1,0.565548
7,82.8,22.4,1,1,0.574482
8,69.0,20.0,1,1,0.56444
9,93.0,20.8,1,1,0.573509


In [33]:
riding_mover['pred_prob'] = svm_polysvc.predict_proba(X)[:,1]
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.558405
1,85.5,16.8,1,1,0.56233
2,64.8,21.6,1,1,0.565553
3,61.5,20.8,1,1,0.562769
4,87.0,23.6,1,1,0.578904
5,110.1,19.2,1,1,0.570517
6,108.0,17.6,1,1,0.565548
7,82.8,22.4,1,1,0.574482
8,69.0,20.0,1,1,0.56444
9,93.0,20.8,1,1,0.573509


In [34]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8


In [35]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


In [36]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,linear svm,1.0,1.0,1.0,1.0
0,poly svm,0.875,1.0,0.666667,0.8


In [37]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


In [38]:
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


# Analysis:
1. Firstly , I observe that the model Linear SVM is overfitting.
2. I tried to address this problem by introducing the reqularisation parameter.
3. How ever , the best way for doing this is grid search cv, cross validation  as mentioned . but as of our knowledge concerned in class materials I tried using gamma , c values but as we split the data and it is having less number of samples (24) It is overfitting.
4. So , I consider rbf and poly svm by eliminating linear svm as it is overfitting.
5. Among rbf and poly if we observe the metrics Poly is having higher values so I believe that Polynomial is the best model.and save the model in pickle file.
6. By using the pickle file in my text based interface file i developed interface and displaying the probability and result.
7. I used a sort of trial and error method like decrease gamma value and c value .

In [39]:
import pickle

# save model
pickle.dump(svm_polysvc, open(r'C:/Users/Meghanjali/Desktop/Data science programming/WE03 ASGT/best_svm_poly.pkl', "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))