# Meghanjali Chennupati (U30308400)
# Fitting SVM classifiers to the RidingMowers.csv data.

# STEP -01: Import all the standard libraries

In [30]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)

#  Step -02 Load the data

In [31]:
# Read the riding-mower ownership data from the local CSV file and preview the first five rows
csv_path = r"C:/Users/Meghanjali/Desktop/Data science programming/WE03 ASGT/RidingMowers.csv"
riding_mover = pd.read_csv(csv_path)
riding_mover.head(5)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner


# Step-03: Conduct some initial exploration on the data

In [32]:
# Print a basic summary of the frame: column names, non-null counts and dtypes
riding_mover.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [33]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
riding_mover.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [34]:
# Check for missing values by counting the NaN entries in each column
riding_mover.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [35]:

# List the distinct labels that occur in the target column 'Ownership'
print(riding_mover['Ownership'].unique())


['Owner' 'Nonowner']


# Summary from the initial exploration of the data:

1. There are no missing values.
2. There is one target variable (Ownership), which is categorical.
3. It needs to be encoded using either dummy-variable encoding or one-hot encoding.
4. I am going to use dummy-variable encoding via the `pd.get_dummies()` function.

In [36]:
# Encode the categorical target as a single indicator column.
# drop_first=True removes the first category in sorted order ('Nonowner'),
# leaving only 'Ownership_Owner'.
# dtype=int forces 0/1 integers; without it the dtype depends on the pandas
# version (boolean in pandas 2.x).
# The guard makes the cell safe to re-run: once 'Ownership' has been replaced,
# running it again is a no-op instead of raising KeyError.
if 'Ownership' in riding_mover.columns:
    dummies_df = pd.get_dummies(
        riding_mover['Ownership'], prefix='Ownership', drop_first=True, dtype=int
    )
    # Build a new frame rather than mutating in place with inplace=True
    riding_mover = riding_mover.join(dummies_df).drop(columns='Ownership')

In [37]:
# Preview the first ten rows after encoding the target column
riding_mover.head(10)

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1
4,87.0,23.6,1
5,110.1,19.2,1
6,108.0,17.6,1
7,82.8,22.4,1
8,69.0,20.0,1
9,93.0,20.8,1


In [38]:
# Preview the last ten rows after encoding the target column
riding_mover.tail(10)

Unnamed: 0,Income,Lot_Size,Ownership_Owner
14,64.8,17.2,0
15,43.2,20.4,0
16,84.0,17.6,0
17,49.2,17.6,0
18,59.4,16.0,0
19,66.0,18.4,0
20,47.4,16.4,0
21,33.0,18.8,0
22,51.0,14.0,0
23,63.0,14.8,0


In [39]:
# (rows, columns) of the encoded frame
riding_mover.shape

(24, 3)

# Step -04  Splitting the data into training and testing sets

In [40]:
# Features are the two numeric predictors; the target is the owner indicator.
# y stays two-dimensional here (a one-column DataFrame).
feature_cols = ['Income', 'Lot_Size']
target_cols = ['Ownership_Owner']
X = riding_mover[feature_cols]
y = riding_mover[target_cols]

In [41]:
# Hold out one third of the rows for testing.
# No random_state is passed here, so the shuffle draws from numpy's global RNG.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
)

In [13]:
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)

#  Analysis
The two input variables have values of a comparable magnitude (Income ranges from 33 to 110.1 and Lot_Size from 14 to 23.6), so I judged that scaling was not needed. Moreover, if we standardize them we get negative values, which are not meaningful on their own because income and lot size cannot be negative. However, I referred to the course content and learned that scaling is recommended for distance-based algorithms, so I tried it; with scaling, however, my model was overfitting and predicted "Owner" for every non-owner. So I removed the scaling step.

# Creating a data frame with metrics as column to store the each model metric

In [42]:
# Empty results table: one row per fitted model will be appended by later cells
metric_columns = ["model", "Accuracy", "Precision", "Recall", "F1"]
performance = pd.DataFrame({column: [] for column in metric_columns})

# 4.0  Fit an SVM classification model using a linear kernel

In [43]:
# Linear-kernel SVM; probability=True enables predict_proba for later cells
svm_lin_model = SVC(C=1.0, kernel="linear", probability=True)
# Flatten the one-column target frame to the 1-D label array that fit() expects
svm_lin_model.fit(X_train, y_train.to_numpy().ravel())
# fit() returns the estimator itself, so both names refer to the same fitted model
svm_linsvc = svm_lin_model

In [44]:
# Evaluate the linear-kernel SVM on the held-out test rows
model_preds_svclin = svm_lin_model.predict(X_test)
# Pin the label order to the classes seen during training so the matrix is
# always 2x2 (rows = actual, columns = predicted). Without `labels`, a small
# test split containing a single class yields a 1x1 matrix and the positional
# lookups below would raise IndexError.
c_matrix = confusion_matrix(y_test, model_preds_svclin, labels=svm_lin_model.classes_)
# Row-major order of the 2x2 matrix: [0][0], [0][1], [1][0], [1][1]
TN, FP, FN, TP = c_matrix.ravel()
linear_metrics = pd.DataFrame({'model': ["linear svm"],
                               'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                               'Precision': [TP/(TP+FP)],
                               'Recall': [TP/(TP+FN)],
                               'F1': [2*TP/(2*TP+FP+FN)]
                               })
# Drop any row left by an earlier run of this cell so re-running does not
# duplicate the model, and renumber the index so every row has a unique label.
performance = pd.concat([performance[performance['model'] != "linear svm"], linear_metrics],
                        ignore_index=True)

In [45]:
# Confusion-matrix counts for the linear model, printed in the order TN, TP, FN, FP
print(TN,TP,FN,FP)

5 3 0 0


# Predictions for linear kernel

In [46]:
# Add the linear model's predicted class for every row of X as a new column.
# X is the full feature set, so this includes the rows the model was trained on.
riding_mover["predicted"]=svm_linsvc.predict(X)
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted
0,60.0,18.4,1,0
1,85.5,16.8,1,1
2,64.8,21.6,1,1
3,61.5,20.8,1,1
4,87.0,23.6,1,1
5,110.1,19.2,1,1
6,108.0,17.6,1,1
7,82.8,22.4,1,1
8,69.0,20.0,1,1
9,93.0,20.8,1,1


In [47]:
# Add the linear model's estimated probability for the second class in classes_
# (column index 1 of predict_proba), for every row of X
riding_mover['pred_prob'] = svm_linsvc.predict_proba(X)[:,1]
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


# Fit a SVM classification model using rbf kernel

In [48]:
# RBF-kernel SVM with a fixed gamma; probability=True enables predict_proba for later cells.
# NOTE(review): the features are passed in unscaled; with gamma=0.7 the kernel may be
# too narrow for Income's range — worth confirming against scaled inputs.
svm_rbf_model = SVC(C=1.0, kernel="rbf", gamma=0.7, probability=True)
# Flatten the one-column target frame to the 1-D label array that fit() expects
svm_rbf_model.fit(X_train, y_train.to_numpy().ravel())
# fit() returns the estimator itself, so both names refer to the same fitted model
svm_rbfsvc = svm_rbf_model

In [49]:
# Evaluate the RBF-kernel SVM on the held-out test rows
model_preds_svcrbf = svm_rbf_model.predict(X_test)
# Pin the label order to the classes seen during training so the matrix is
# always 2x2 (rows = actual, columns = predicted). Without `labels`, a small
# test split containing a single class yields a 1x1 matrix and the positional
# lookups below would raise IndexError.
c_matrix = confusion_matrix(y_test, model_preds_svcrbf, labels=svm_rbf_model.classes_)
# Row-major order of the 2x2 matrix: [0][0], [0][1], [1][0], [1][1]
TN, FP, FN, TP = c_matrix.ravel()
rbf_metrics = pd.DataFrame({'model': ["rbf svm"],
                            'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                            'Precision': [TP/(TP+FP)],
                            'Recall': [TP/(TP+FN)],
                            'F1': [2*TP/(2*TP+FP+FN)]
                            })
# Drop any row left by an earlier run of this cell so re-running does not
# duplicate the model, and renumber the index so every row has a unique label.
performance = pd.concat([performance[performance['model'] != "rbf svm"], rbf_metrics],
                        ignore_index=True)

In [50]:
# Confusion-matrix counts for the RBF model, printed in the order TN, TP, FN, FP
print(TN,TP,FN,FP)

0 3 0 5


# Predictions for rbf kernel

In [51]:
# Overwrite the "predicted" column with the RBF model's predicted class for every row of X.
# NOTE: "pred_prob" is not touched here, so in this cell's output it still holds the
# linear model's probabilities next to the RBF model's class predictions.
riding_mover["predicted"]=svm_rbfsvc.predict(X)
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,1,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


In [52]:
# Overwrite "pred_prob" with the RBF model's estimated probability for the second
# class in classes_ (column index 1 of predict_proba), for every row of X.
# NOTE(review): SVC's probability estimates come from a separate calibration step and
# can disagree with predict() — presumably why rows predicted as 1 show very low
# probabilities below; confirm against the scikit-learn SVC documentation.
riding_mover['pred_prob'] = svm_rbfsvc.predict_proba(X)[:,1]
riding_mover

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,1,0.025602
1,85.5,16.8,1,1,0.025556
2,64.8,21.6,1,1,0.025521
3,61.5,20.8,1,1,0.553402
4,87.0,23.6,1,1,0.557253
5,110.1,19.2,1,1,0.025602
6,108.0,17.6,1,1,0.025602
7,82.8,22.4,1,1,0.025556
8,69.0,20.0,1,1,0.025601
9,93.0,20.8,1,1,0.025601


# Fit an SVM classification model using a polynomial kernel

In [53]:
# Degree-3 polynomial-kernel SVM; probability=True enables predict_proba
svm_poly_model = SVC(
    C=1.0,
    kernel="poly",
    degree=3,
    coef0=1.0,
    gamma="auto",
    probability=True,
)
# Flatten the one-column target frame to the 1-D label array that fit() expects
svm_poly_model.fit(X_train, y_train.to_numpy().ravel())
# fit() returns the estimator itself, so both names refer to the same fitted model
svm_polysvc = svm_poly_model

In [54]:
# Evaluate the polynomial-kernel SVM on the held-out test rows
model_preds_svcpoly = svm_poly_model.predict(X_test)
# Pin the label order to the classes seen during training so the matrix is
# always 2x2 (rows = actual, columns = predicted). Without `labels`, a small
# test split containing a single class yields a 1x1 matrix and the positional
# lookups below would raise IndexError.
c_matrix = confusion_matrix(y_test, model_preds_svcpoly, labels=svm_poly_model.classes_)
# Row-major order of the 2x2 matrix: [0][0], [0][1], [1][0], [1][1]
TN, FP, FN, TP = c_matrix.ravel()
poly_metrics = pd.DataFrame({'model': ["poly svm"],
                             'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                             'Precision': [TP/(TP+FP)],
                             'Recall': [TP/(TP+FN)],
                             'F1': [2*TP/(2*TP+FP+FN)]
                             })
# Drop any row left by an earlier run of this cell so re-running does not
# duplicate the model, and renumber the index so every row has a unique label.
performance = pd.concat([performance[performance['model'] != "poly svm"], poly_metrics],
                        ignore_index=True)

In [55]:
# Confusion-matrix counts for the polynomial model, printed in the order TN, TP, FN, FP
print(TN,TP,FN,FP)

4 3 0 1


# Displaying the performance metric values for each of the model

In [56]:
# Show the collected test-set metrics, one row per fitted model
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.375,0.375,1.0,0.545455
0,poly svm,0.875,0.75,1.0,0.857143


# Analysis:

1. Firstly, the data set is about predicting ownership for an individual based on income and lot size, so it has two input variables and one target variable.
2. For evaluating each model, the metrics used are accuracy, precision, recall and F1 score.
3. However, we can decide which metric is the most appropriate by looking at the TN, TP, FN and FP values.
4. For the linear-kernel SVM model we have (5,3,0,0) for TN,TP,FN,FP.
5. For the RBF kernel we have (0,3,0,5) for TN,TP,FN,FP.
6. For the polynomial kernel we have (4,3,0,1) for TN,TP,FN,FP.
7. As per the knowledge from the Data Mining class, if FN > FP then recall is the best metric.
8. If FP > FN then precision is the best metric.
9. If the classes are reasonably balanced – focus on accuracy.
10. If the classes are imbalanced – focus on F1 score.
11. Our data has 24 samples in total, of which 12 are owners and 12 are non-owners, so it is perfectly balanced and I am eliminating F1 score.
12. I can therefore consider accuracy.
13. For the linear SVM, both the FN and FP values are 0, which means our model is clearly identifying the relationship; however, in the real world this is not possible. This looks like overfitting. Moreover, ours is a small data set, so this might also be a reason.
14. For RBF, FN = 0 and FP = 5, so FP > FN and precision is the best metric.
15. For the polynomial kernel, FN = 0 and FP = 1, so FP > FN and precision is the best metric.
16. The problem is that we can clearly say it is overfitting.
17. Recall is 1.0 for all models, and the linear SVM scores 1.0 on every metric, so, as it is overfitting, I am considering only the remaining two models.
18. Comparing RBF and poly, poly has the better values for both precision and accuracy, so I am considering the poly SVM as the best model.
19. We can address the overfitting as follows; from the class material I understood that:
    1. We can control the regularisation parameter C and the gamma and coef0 parameters of SVC while fitting. I tried the trial-and-error method and observed that, after all, the poly SVM still performs best.

20. The best way to address this is grid search and cross-validation, as mentioned.
21. However, as you mentioned that we should use the class material covered so far to address this problem, I tried trial and error with C, gamma and other sets of values to evaluate this. When I look at the results, I get "Owner" predicted for non-owners as well.
22. So in the end I decided that the poly SVM is the best model and saved it to a pickle file.


In [57]:
import pickle

# save model
# Open the file in a context manager so the handle is flushed and closed even
# if pickle.dump raises; the bare open(...) call previously left it unclosed.
with open(r'C:/Users/Meghanjali/Desktop/Data science programming/WE03 ASGT/best_svm_poly.pkl', "wb") as model_file:
    pickle.dump(svm_polysvc, model_file)

# If you wish to load this model later, use pickle.load on the same path.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
#with open(r'C:/Users/Meghanjali/Desktop/Data science programming/WE03 ASGT/best_svm_poly.pkl', "rb") as model_file:
#    loaded_model = pickle.load(model_file)