## 1.0 Import and install python libraries

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

## 2.0 Set Random Seed

In [2]:
# set random seed to ensure that results are repeatable
# NOTE(review): the later train_test_split and SVC calls pass no random_state, so
# they presumably draw from this global NumPy RNG -- confirm against the sklearn docs
np.random.seed(1)

## 3.0 Load data 

In [3]:
# load data: read RidingMowers.csv (path relative to the working directory) into a DataFrame
RidingMowers = pd.read_csv("RidingMowers.csv")

## 4.0 Conduct initial exploration of the data

In [4]:
# generate a basic summary of the data (index range, column names, non-null counts, dtypes)
RidingMowers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [5]:
# preview the first three rows to sanity-check the loaded values
RidingMowers.head(3)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [6]:
# generate a statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
RidingMowers.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [7]:
# Count the missing (NA) entries in each column
RidingMowers.isnull().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [8]:
# collect the names of the categorical (object-dtype) columns into a plain list
object_columns = RidingMowers.select_dtypes(include='object').columns
category_var_list = object_columns.tolist()
category_var_list

['Ownership']

In [9]:
# list the distinct values of every categorical column -- typos and inconsistent
# spellings that need fixing usually show up here
for column_name in category_var_list:
    distinct_values = RidingMowers[column_name].unique()
    print(f"Category: {column_name} Values: {distinct_values}")

Category: Ownership Values: ['Owner' 'Nonowner']


## 5.0 Process the data
* Conduct any data preparation that should be done *BEFORE* the data split.
* Split the data.
* Conduct any data preparation that should be done *AFTER* the data split.

### 5.1  Conduct any data preparation that should be done *BEFORE* the data split
Tasks at this stage include:
1. Drop any columns/features 
2. Decide if you wish to exclude any observations (rows) due to missing values (NAs).
3. Conduct proper encoding of categorical variables
    1. You can transform them using dummy variable encoding, one-hot-encoding, or label encoding. 

In [10]:
# (rows, columns) of the data before any preprocessing
RidingMowers.shape

(24, 3)

#### Encode our categorical variables

In [11]:
# Replace the string labels in 'Ownership' with integer codes (overwrites the column in place).
# NOTE(review): LabelEncoder numbers classes in sorted order, so with the two values
# seen above this should give Nonowner -> 0, Owner -> 1 -- confirm via labelencoder.classes_
labelencoder = LabelEncoder()
RidingMowers['Ownership'] = labelencoder.fit_transform(RidingMowers['Ownership'])

In [12]:
# re-check the dataframe columns to verify the encoding: 'Ownership' should now be
# an integer column (no columns were dropped in this notebook)
RidingMowers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     int32  
dtypes: float64(2), int32(1)
memory usage: 608.0 bytes


### 5.2 Split data (train/test)

In [13]:
# name the target column and derive the predictor columns (everything else)
# once, so later cells do not have to repeat them
target = 'Ownership'
predictors = RidingMowers.columns.tolist()
predictors.remove(target)

# hold out 30% of the rows as the test set; the remaining 70% form the training set
train_df, test_df = train_test_split(RidingMowers, test_size=0.3)

### 5.3  Conduct any data preparation that should be done *AFTER* the data split

We will look at the following:
1) impute any missing numeric values using the mean of the variable/column
2) remove differences of scale by standardizing the numeric variables

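We don't have any missing values here, so no imputation is needed. Standardization is also skipped in this example. (Note: SVMs are generally sensitive to differences in feature scale, so on other datasets standardizing the numeric predictors is usually advisable.)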

## 6.0 Save the data

In [14]:
# separate the predictor columns (X) from the target column (y) for each partition;
# selecting a single column by name yields a Series, a list of names yields a DataFrame
# NOTE(review): nothing is written to disk in this cell -- the earlier comment had a
# `train_df.to_csv(...)` call fused into it, which never executed
train_X, train_y = train_df[predictors], train_df[target]
test_X, test_y = test_df[predictors], test_df[target]

# SVM Demonstration

In this tutorial we will demonstrate how to use the `SVC` class in `scikit-learn` to fit support vector machine (SVM) classification models on a dataset. 

NOTE: In this example the data is split (see section 5.2): each model below is fit on the training set and evaluated on the held-out test set, which is how you would normally assess a model.

## 7 Model the data

First, let's create a dataframe to load the model performance metrics into.

In [15]:
# start with an empty comparison table; one row of metrics is appended per model
metric_columns = ("model", "Accuracy", "Precision", "Recall", "F1")
performance = pd.DataFrame({column: [] for column in metric_columns})

### 7.1 Fit a SVM classification model using linear kernel

In [16]:
# linear-kernel SVC fitted on the training partition; probability=True makes
# predict_proba available on the fitted model (fit returns the estimator itself)
svm_lin_model = SVC(kernel="linear", probability=True).fit(train_X, train_y.to_numpy())

In [17]:
# score the linear-kernel model on the held-out test set
model_preds_l = svm_lin_model.predict(test_X)

# labels=[0, 1] pins the matrix to 2x2 (rows = actual class, columns = predicted
# class) even if one class is missing from this small test split; without it the
# matrix can shrink and the cell lookups would raise IndexError
c_matrix = confusion_matrix(test_y, model_preds_l, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()  # row-major order of the 2x2 matrix

# append this model's metrics as one row of the running comparison table
# (Precision is NaN when the model predicts no positives, i.e. TP + FP == 0)
performance = pd.concat([performance, pd.DataFrame({'model': "linear svm",
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                                                    'Precision': [TP/(TP+FP)],
                                                    'Recall': [TP/(TP+FN)],
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 7.2 Fit a SVM classification model using rbf kernel

In [18]:
# RBF-kernel SVC (C=10, gamma='scale') fitted on the training partition;
# fit returns the estimator itself, so construction and fitting are chained
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale', probability=True).fit(
    train_X, train_y.to_numpy()
)

In [19]:
# score the RBF-kernel model on the held-out test set
model_preds_r = svm_rbf_model.predict(test_X)

# labels=[0, 1] pins the matrix to 2x2 (rows = actual class, columns = predicted
# class) even if one class is missing from this small test split; without it the
# matrix can shrink and the cell lookups would raise IndexError
c_matrix = confusion_matrix(test_y, model_preds_r, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()  # row-major order of the 2x2 matrix

# append this model's metrics as one row of the running comparison table
# (Precision is NaN when the model predicts no positives, i.e. TP + FP == 0)
performance = pd.concat([performance, pd.DataFrame({'model': "rbf svm",
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                                                    'Precision': [TP/(TP+FP)],
                                                    'Recall': [TP/(TP+FN)],
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 7.3 Fit a SVM classification model using polynomial kernel

In [20]:
# degree-3 polynomial-kernel SVC (coef0=1, C=10) fitted on the training partition;
# fit returns the estimator itself, so construction and fitting are chained
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10, probability=True).fit(
    train_X, train_y.to_numpy()
)

In [21]:
# score the polynomial-kernel model on the held-out test set
model_preds_p = svm_poly_model.predict(test_X)

# labels=[0, 1] pins the matrix to 2x2 (rows = actual class, columns = predicted
# class) even if one class is missing from this small test split; without it the
# matrix can shrink and the cell lookups would raise IndexError
c_matrix = confusion_matrix(test_y, model_preds_p, labels=[0, 1])
TN, FP, FN, TP = c_matrix.ravel()  # row-major order of the 2x2 matrix

# append this model's metrics as one row of the running comparison table
# (Precision is NaN when the model predicts no positives, i.e. TP + FP == 0)
performance = pd.concat([performance, pd.DataFrame({'model': "poly svm",
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                                                    'Precision': [TP/(TP+FP)],
                                                    'Recall': [TP/(TP+FN)],
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## 8.0 Summary

In [22]:
# display the test-set metrics collected for each fitted model
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8


*Analysis*: The linear SVM model achieved the highest accuracy (1.0) on the test set, together with the best precision, recall, and F1 scores. Because accuracy measures the share of all observations that are predicted correctly, I treat it as the main criterion. On recall (true positives out of all actual positives), the linear SVM also performs best (1.0 versus 0.667 for the RBF and polynomial kernels). I therefore consider the linear SVM the better model for this situation. Note that the test set contains only 8 observations, so these estimates should be interpreted with caution.

## 9.0 Prediction using winning model

In [23]:
# attach the winning (linear) model's class predictions to the test rows.
# test_df came out of train_test_split as a subset of RidingMowers, so adding a
# column in place can trigger pandas' SettingWithCopyWarning; .assign builds a
# new frame instead and the name is rebound to it
test_df = test_df.assign(predicted=svm_lin_model.predict(test_X))
test_df.head(20)

Unnamed: 0,Income,Lot_Size,Ownership,predicted
13,52.8,20.8,0,0
18,59.4,16.0,0,0
3,61.5,20.8,1,1
14,64.8,17.2,0,0
20,47.4,16.4,0,0
17,49.2,17.6,0,0
10,51.0,22.0,1,1
4,87.0,23.6,1,1


In [24]:
# attach the linear model's estimated probability for the second class
# (column index 1 of predict_proba; presumably class 1 = Owner -- confirm via
# svm_lin_model.classes_). .assign returns a new frame, which avoids the
# SettingWithCopyWarning that in-place assignment on a split subset can raise.
# NOTE: SVC probabilities are calibrated separately from the decision function,
# so they can disagree with predict() (a probability above 0.5 with predicted == 0).
test_df = test_df.assign(pred_prob=svm_lin_model.predict_proba(test_X)[:, 1])
test_df.head(20)

Unnamed: 0,Income,Lot_Size,Ownership,predicted,pred_prob
13,52.8,20.8,0,0,0.505951
18,59.4,16.0,0,0,0.38321
3,61.5,20.8,1,1,0.583284
14,64.8,17.2,0,0,0.474402
20,47.4,16.4,0,0,0.299999
17,49.2,17.6,0,0,0.356866
10,51.0,22.0,1,1,0.536072
4,87.0,23.6,1,1,0.842604


## 10.0 Saving the 'winning' model to a pickle file

## Save the model to disk

Once you have trained a model, you may want to reuse it in other notebooks or applications. You can save the model to disk using the `pickle` module.

In [25]:
import pickle

# save model; the context manager closes the file handle even if dump() raises
with open('svm_lin_model_example01.pkl', "wb") as model_file:
    pickle.dump(svm_lin_model, model_file)

# If you wish to load this model later, use pickle.load on the same file.
# Only unpickle files you trust -- loading a pickle can execute arbitrary code.
# with open('svm_lin_model_example01.pkl', "rb") as model_file:
#     loaded_model = pickle.load(model_file)