## Import Python libraries

In [233]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
import pickle

# Seed NumPy's global random generator so that repeated runs give the same results.
SEED = 1
np.random.seed(SEED)

## Load the data

In [234]:
# Read the riding-mower dataset from a CSV file located next to the notebook.
DATA_PATH = "RidingMowers.csv"
lawnmower = pd.read_csv(DATA_PATH)

## Conduct initial exploration of the data

In [235]:
# Preview only the first few rows rather than dumping the whole frame to the screen.
preview_rows = 3
lawnmower.head(preview_rows)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [236]:
# Print a structural summary of the frame: number of rows, column names,
# non-null counts per column, dtypes and memory usage.
lawnmower.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [237]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = lawnmower.describe()
numeric_summary

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [238]:
# Count the missing values in each column (True counts as 1 when summed).
missing_mask = lawnmower.isna()
missing_mask.sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [239]:
# Collect the names of the categorical (object-dtype) columns into a plain list.
object_columns = lawnmower.select_dtypes(include='object').columns
category_var_list = object_columns.tolist()
category_var_list

['Ownership']

In [240]:
# Show the distinct values of every categorical column; typos and inconsistent
# spellings that need cleaning tend to surface here.
for column_name in category_var_list:
    distinct_values = lawnmower[column_name].unique()
    print(f"Category: {column_name} Values: {distinct_values}")

Category: Ownership Values: ['Owner' 'Nonowner']


## Encode our categorical variables

In [241]:
# Replace the text labels in 'Ownership' with integer class codes.
# NOTE(review): LabelEncoder numbers classes in sorted order, so 'Nonowner' should
# map to 0 and 'Owner' to 1 -- confirm with labelencoder.classes_.
labelencoder = LabelEncoder()
ownership_codes = labelencoder.fit_transform(lawnmower['Ownership'])
lawnmower['Ownership'] = ownership_codes

In [242]:
# Re-inspect the dataframe to verify the encoding: 'Ownership' should now be an
# integer column instead of object (no columns are dropped in this notebook).
lawnmower.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     int32  
dtypes: float64(2), int32(1)
memory usage: 608.0 bytes


### Split data (train/test)

In [243]:
# Separate the predictors from the target, then hold out 30% of the rows for testing.
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): the split is not stratified; with only 24 rows the class balance of
# the test set is left to chance -- consider passing stratify=target.
features = lawnmower.drop('Ownership', axis=1)
target = lawnmower['Ownership']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=45,
)

In [244]:
# Empty results table; each model-evaluation cell below appends one row of metrics to it.
metric_columns = ["model", "Accuracy", "Precision", "Recall", "F1"]
performance = pd.DataFrame({column: [] for column in metric_columns})

### SVM classification model using linear kernel

In [245]:
# Fit a linear-kernel SVM on the training split. probability=True additionally
# enables predict_proba on the fitted model. fit() returns the estimator itself,
# so chaining it keeps the cell free of output.
train_labels = np.ravel(y_train)
svm_lin_model = SVC(
    kernel="linear",
    probability=True,
).fit(X_train, train_labels)

In [246]:
# Evaluate the linear-kernel SVM on the held-out test set and record its metrics.
# Class 1 is treated as the positive class.
model_name = "linear svm"
model_preds = svm_lin_model.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 even when a class is missing from
# y_test/model_preds. Rows are true classes and columns are predicted classes,
# so ravel() yields TN, FP, FN, TP in that order.
c_matrix = confusion_matrix(y_test, model_preds, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()

# A metric with a zero denominator is undefined; report 0.0 rather than
# producing NaN together with a RuntimeWarning.
total = TP + TN + FP + FN
metrics_row = pd.DataFrame({
    'model': [model_name],
    'Accuracy': [(TP + TN) / total if total else 0.0],
    'Precision': [TP / (TP + FP) if (TP + FP) else 0.0],
    'Recall': [TP / (TP + FN) if (TP + FN) else 0.0],
    'F1': [2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) else 0.0],
})

# Drop any earlier row for this model so re-running the cell does not append a
# duplicate, skip empty frames (concatenating them is deprecated in recent
# pandas), and renumber the index so every row has a unique label.
previous_rows = performance[performance['model'] != model_name]
performance = pd.concat(
    [frame for frame in (previous_rows, metrics_row) if not frame.empty],
    ignore_index=True,
)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.625,0.25,1.0,0.4


### SVM classification model using rbf kernel

In [247]:
# Fit an RBF-kernel SVM (C=10, gamma='scale') on the training split.
# probability=True additionally enables predict_proba on the fitted model.
# fit() returns the estimator itself, so chaining it keeps the cell free of output.
train_labels = np.ravel(y_train)
svm_rbf_model = SVC(
    kernel="rbf",
    C=10,
    gamma='scale',
    probability=True,
).fit(X_train, train_labels)

In [248]:
# Evaluate the RBF-kernel SVM on the held-out test set and record its metrics.
# Class 1 is treated as the positive class.
model_name = "rbf svm"
model_preds = svm_rbf_model.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 even when a class is missing from
# y_test/model_preds. Rows are true classes and columns are predicted classes,
# so ravel() yields TN, FP, FN, TP in that order.
c_matrix = confusion_matrix(y_test, model_preds, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()

# A metric with a zero denominator is undefined; report 0.0 rather than
# producing NaN together with a RuntimeWarning.
total = TP + TN + FP + FN
metrics_row = pd.DataFrame({
    'model': [model_name],
    'Accuracy': [(TP + TN) / total if total else 0.0],
    'Precision': [TP / (TP + FP) if (TP + FP) else 0.0],
    'Recall': [TP / (TP + FN) if (TP + FN) else 0.0],
    'F1': [2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) else 0.0],
})

# Drop any earlier row for this model so re-running the cell does not append a
# duplicate, skip empty frames (concatenating them is deprecated in recent
# pandas), and renumber the index so every row has a unique label.
previous_rows = performance[performance['model'] != model_name]
performance = pd.concat(
    [frame for frame in (previous_rows, metrics_row) if not frame.empty],
    ignore_index=True,
)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.625,0.25,1.0,0.4
0,rbf svm,0.5,0.0,0.0,0.0


### SVM classification model using polynomial kernel

In [249]:
# Fit a degree-3 polynomial-kernel SVM (coef0=1, C=10) on the training split.
# probability=True additionally enables predict_proba on the fitted model.
# fit() returns the estimator itself, so chaining it keeps the cell free of output.
train_labels = np.ravel(y_train)
svm_poly_model = SVC(
    kernel="poly",
    degree=3,
    coef0=1,
    C=10,
    probability=True,
).fit(X_train, train_labels)

In [250]:
# Evaluate the polynomial-kernel SVM on the held-out test set and record its metrics.
# Class 1 is treated as the positive class.
model_name = "poly svm"
model_preds = svm_poly_model.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 even when a class is missing from
# y_test/model_preds. Rows are true classes and columns are predicted classes,
# so ravel() yields TN, FP, FN, TP in that order.
c_matrix = confusion_matrix(y_test, model_preds, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()

# A metric with a zero denominator is undefined; report 0.0 rather than
# producing NaN together with a RuntimeWarning.
total = TP + TN + FP + FN
metrics_row = pd.DataFrame({
    'model': [model_name],
    'Accuracy': [(TP + TN) / total if total else 0.0],
    'Precision': [TP / (TP + FP) if (TP + FP) else 0.0],
    'Recall': [TP / (TP + FN) if (TP + FN) else 0.0],
    'F1': [2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) else 0.0],
})

# Drop any earlier row for this model so re-running the cell does not append a
# duplicate, skip empty frames (concatenating them is deprecated in recent
# pandas), and renumber the index so every row has a unique label.
previous_rows = performance[performance['model'] != model_name]
performance = pd.concat(
    [frame for frame in (previous_rows, metrics_row) if not frame.empty],
    ignore_index=True,
)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.625,0.25,1.0,0.4
0,rbf svm,0.5,0.0,0.0,0.0
0,poly svm,0.75,0.333333,1.0,0.5


## Saving the 'winning' model to a pickle file.

In [251]:
# Persist the chosen model (the polynomial-kernel SVM) to disk with pickle.
# Only unpickle this file from a trusted location: loading a pickle can execute arbitrary code.
MODEL_PATH = "winning_model.pkl"
winning_model = svm_poly_model
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(winning_model, model_file)