### Working on a problem statement on Support Vector Machine (SVM)

In [1]:
### Importing libraries
import numpy as np
import pandas as pd
import IPython.display

### Visualization
import seaborn as sns
import matplotlib.pyplot as pp

### Model Building packages
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score,confusion_matrix

In [2]:
### Loading Data 
data=pd.read_csv('Spam.csv')
display(data.head(10))
display(data.shape)
display(data.isnull().sum())
print(f'The proportion of Spam mails: {data.spam.mean()}')

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_hash,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
5,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1
6,0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,...,0.0,0.054,0.0,0.164,0.054,0.0,1.671,4,112,1
7,0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,...,0.0,0.206,0.0,0.0,0.0,0.0,2.45,11,49,1
8,0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,...,0.0,0.271,0.0,0.181,0.203,0.022,9.744,445,1257,1
9,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,...,0.04,0.03,0.0,0.244,0.081,0.0,1.729,43,749,1


(4601, 58)

word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_telnet              0
word_fre

The proportion of Spam mails: 0.39404477287546186


The above are the fractional word counts for all the words in an email. Each record is a separate email.

~40% of the emails are classified as Spam, with this the data is not imbalanced. Hence, quite suitable for SVM.

In [3]:
### Features and labels
feats= data.drop('spam',axis=1)
label=data['spam'].astype(int).values

In [4]:
### Rescaling
### using scaler this time to get a standardization in a form of
### (x-mean(x))/std_dev(x)
rescaled_feat=scale(feats)

In [5]:
### Train test split
X_train, X_test, Y_train, Y_test=train_test_split(rescaled_feat,label,train_size=0.7,random_state=100)
### 
print(f'''
Training set spam %: {Y_train.mean()*100:.2f}%
Test set spam %: {Y_test.mean()*100:.2f}% 
''')


Training set spam %: 38.60%
Test set spam %: 41.27% 



### Model Building 

In [6]:
# help(SVC)\
### Initialize the model
model=SVC(C=1)
### Fit the model
model.fit(X_train,Y_train)
### Predict 
y_pred=model.predict(X_test)


In [7]:
### Confusion matrix
cm=confusion_matrix(Y_test,y_pred)
cm[0,0]

774

In [8]:
print(f"Accuracy of the model: {accuracy_score(Y_test,y_pred)}") ## (TP+TN/TP+FN+TN+FN)
print(f"Recall/sensitivity (% of Spams identified correctly): {recall_score(Y_test,y_pred)}") ## (TP/TP+FN)
print(f"Precision/Positive predictive power: {precision_score(Y_test,y_pred)}") ## (TP/TP+FP)
print(f"Specificity/ True Negative rate: {(cm[0,0]/(cm[0,0]+cm[0,1]))}") ## (TN/TN+FP)


Accuracy of the model: 0.9312092686459088
Recall/sensitivity (% of Spams identified correctly): 0.8982456140350877
Precision/Positive predictive power: 0.9326047358834244
Specificity/ True Negative rate: 0.9543773119605425


### Hyper-parameter Tuning

#### K-fold cross validation
A way to do simple or complex cross validation is through cross_val_score() from sklearn

In [9]:
### Creating K-Fold with 5 splits
folds=KFold(n_splits=5, shuffle=True, random_state=100)

## Initializing the model
model=SVC(C=1)

In [10]:
cv_results=cross_val_score(model,X_train, Y_train, cv=folds,scoring="accuracy")

In [11]:
print(cv_results)
print(f"Mean accuracy: {cv_results.mean()}")


[0.93167702 0.9068323  0.93944099 0.92546584 0.9378882 ]
Mean accuracy: 0.9282608695652174


### GridSearchCV

When you want to choose the optimal parameter i.e. choosing the best C that spearates the data. GridSearchCV takes in the list of the parameters and fit them

In [12]:
params={'C':[0.1,1,10,100,1000]}

model=SVC()

model_cv=GridSearchCV( estimator=model, 
                      param_grid=params,
                      scoring='accuracy',
                      cv=folds,
                      verbose=1,
                      return_train_score=True
)

In [None]:
### K-folds for each value of C
model_cv.fit(X_train,Y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [None]:
cv_results=pd.DataFrame(model_cv.cv_results_)

In [None]:
cv_results

In [None]:
pp.plot(cv_results.param_C, cv_results.mean_train_score,label="Train accuracy score")
pp.plot(cv_results.param_C, cv_results.mean_test_score,label="Test accuracy score")
pp.ylabel('accuracy score')
pp.xlabel('Cost function value on log scale')
pp.legend()
pp.xscale('log')

At C=10, we have a reasonable train and test accuracy score. As C increases, the model seems to be overfitted.

In [None]:
### Initialize the model with C=10
model_fin=SVC(C=10)

### Fit the model
model_fin.fit(X_train,Y_train)

### Predict the model
y_pred_fin=model_fin.predict(X_test)

print(f'Predicted labels after selecting the best value for C:{y_pred_fin.mean()}')


In [None]:
### Evaluation metrics
print(f"Accuracy of the model: {accuracy_score(Y_test,y_pred_fin)}") ## (TP+TN/TP+FN+TN+FN)
print(f"Recall/sensitivity (% of Spams identified correctly): {recall_score(Y_test,y_pred_fin)}") ## (TP/TP+FN)
print(f"Precision/Positive predictive power: {precision_score(Y_test,y_pred_fin)}") ## (TP/TP+FP)
print(f"Specificity/ True Negative rate: {(cm[0,0]/(cm[0,0]+cm[0,1]))}") ## (TN/TN+FP)
