# Importing the libraries 

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

# Load Data

In [2]:
dataset=pd.read_csv('emails.csv') 
dataset.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [3]:
dataset.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [4]:
# verify that there are no null columns in the dataset
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction


In [5]:
categorical = [var for var in dataset.columns if dataset[var].dtype=='O']
numerical = [var for var in dataset.columns if dataset[var].dtype!='O']

In [6]:
print('There are {} categorical variables : \n'.format(len(categorical)), categorical)

There are 1 categorical variables : 
 ['Email No.']


In [7]:
# check for cardinality in categorical variables
for var in categorical:
    print(var, ' contains ', len(dataset[var].unique()), ' labels')

Email No.  contains  5172  labels


In [8]:
# view summary statistics in numerical variables to check for outliers
print(round(dataset[numerical].describe()),2)

          the      to     ect     and     for      of       a     you     hou  \
count  5172.0  5172.0  5172.0  5172.0  5172.0  5172.0  5172.0  5172.0  5172.0   
mean      7.0     6.0     5.0     3.0     3.0     3.0    56.0     2.0     2.0   
std      12.0    10.0    14.0     6.0     5.0     6.0    88.0     4.0     7.0   
min       0.0     0.0     1.0     0.0     0.0     0.0     0.0     0.0     0.0   
25%       0.0     1.0     1.0     0.0     1.0     0.0    12.0     0.0     0.0   
50%       3.0     3.0     1.0     1.0     2.0     1.0    28.0     1.0     0.0   
75%       8.0     7.0     4.0     3.0     4.0     2.0    62.0     3.0     1.0   
max     210.0   132.0   344.0    89.0    47.0    77.0  1898.0    70.0   167.0   

           in  ...  connevey     jay  valued     lay  infrastructure  \
count  5172.0  ...    5172.0  5172.0  5172.0  5172.0          5172.0   
mean     11.0  ...       0.0     0.0     0.0     0.0             0.0   
std      19.0  ...       0.0     0.0     0.0     1.0  

# Data Preprocessing

In [9]:
# use LabelEncoder to replace purchased (dependent variable) with 0 and 1 
from sklearn.preprocessing import LabelEncoder
dataset['Email No.']= LabelEncoder().fit_transform(dataset['Email No.'])
dataset.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1111,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,2222,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3333,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,4444,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [10]:
y = dataset['Prediction']
x = dataset.drop(['Prediction'], axis=1)

In [11]:
print(x.head())

   Email No.  the  to  ect  and  for  of    a  you  hou  ...  enhancements  \
0          0    0   0    1    0    0   0    2    0    0  ...             0   
1       1111    8  13   24    6    6   2  102    1   27  ...             0   
2       2222    0   0    1    0    0   0    8    0    0  ...             0   
3       3333    0   5   22    0    5   1   51    2   10  ...             0   
4       4444    7   6   17    1    5   2   57    0    9  ...             0   

   connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  
0         0    0       0    0               0         0         0   0    0  
1         0    0       0    0               0         0         0   1    0  
2         0    0       0    0               0         0         0   0    0  
3         0    0       0    0               0         0         0   0    0  
4         0    0       0    0               0         0         0   1    0  

[5 rows x 3001 columns]


In [12]:
print(y.head())

0    0
1    0
2    0
3    0
4    0
Name: Prediction, dtype: int64


# Splitting the dataset into training and test set  

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state = 0) # func returns train and test data. It takes dataset and then split size test_size =0.3 means 30% data is for test and rest for training and random_state 


In [14]:
print(x_train.head())

      Email No.  the  to  ect  and  for  of    a  you  hou  ...  enhancements  \
3459       2735    5   4    1    2    2   3   27    7    0  ...             0   
1385        430    3   6    4    6    0   2   57    2    0  ...             0   
1380        425    1   0    1    0    0   1    8    0    0  ...             0   
4462       3849   37  16    5   22    5  19  228    6    2  ...             0   
3840       3158    2   1    1    0    2   0   10    0    0  ...             0   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  
3459         0    0       0    0               0         0         0   0    0  
1385         0    0       0    0               0         0         0   1    0  
1380         0    0       0    0               0         0         0   0    0  
4462         0    0       0    2               0         0         0   1    0  
3840         0    0       0    0               0         0         0   0    0  

[5 rows x 3001 columns]


In [15]:
print(x_test.head())

      Email No.  the  to  ect  and  for  of    a  you  hou  ...  enhancements  \
3324       2585   14   7    2    2    2   4   37    0    1  ...             0   
15          667    6   2    1    0    2   0   36    3    1  ...             0   
4950       4391   40   6    4    6    5  11  130    3    0  ...             0   
3964       3295   42  19    1    8    7   7  214    1    3  ...             0   
2315       1464    8  14    8    3    3   0   87    0    4  ...             0   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  
3324         0    0       0    0               0         0         0   0    0  
15           0    0       0    0               0         0         0   0    0  
4950         0    0       0    0               0         0         0   1    0  
3964         0    0       0    1               0         4         0   3    0  
2315         0    0       0    0               0         0         0   5    0  

[5 rows x 3001 columns]


In [16]:
print(y_train[:10])

3459    1
1385    0
1380    0
4462    1
3840    0
1452    1
2849    0
3145    0
1271    0
1021    0
Name: Prediction, dtype: int64


In [17]:
print(y_test[:10])

3324    0
15      0
4950    0
3964    1
2315    0
861     1
2350    0
1767    0
965     0
2630    1
Name: Prediction, dtype: int64


In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train=scaler.fit_transform(x_train) # apply on whole x data 
x_test=scaler.transform(x_test)

# Build Model 

In [19]:
from sklearn.svm import SVC 
svr_lin = SVC(kernel='linear', C=100, gamma='auto')
svr_lin.fit(x_train,y_train)
y_pred_lin= svr_lin.predict(x_test)

In [20]:
svr_rbf = SVC(kernel='rbf', C=100, gamma=0.1)
svr_rbf.fit(x_train,y_train)
y_pred_rbf= svr_rbf.predict(x_test)

In [21]:
# poly kernel type to be used in the algorithm
svr_poly = SVC(kernel='poly', C=100, gamma='auto', degree=3,  coef0=1)
svr_poly.fit(x_train,y_train)
y_pred_poly= svr_poly.predict(x_test)

In [22]:
# sigmoid kernel type to be used in the algorithm
svr_sig = SVC(kernel='sigmoid', C=1, gamma='auto',  coef0=1)
svr_sig.fit(x_train,y_train)
y_pred_sig= svr_sig.predict(x_test)

# Evaluate Model 

In [23]:
# Evaluate linear model

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, average_precision_score
cm = confusion_matrix(y_test,y_pred_lin)
print(cm)

[[1057   54]
 [  38  403]]


In [25]:
cr = classification_report(y_test,y_pred_lin)
print(cr)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1111
           1       0.88      0.91      0.90       441

    accuracy                           0.94      1552
   macro avg       0.92      0.93      0.93      1552
weighted avg       0.94      0.94      0.94      1552



In [26]:
accuracy_score(y_test,y_pred_lin)

0.9407216494845361

In [27]:
average_precision_score(y_test,y_pred_lin)

0.8303365632536637

In [28]:
# Evaluate rbf model

In [29]:
cm = confusion_matrix(y_test,y_pred_rbf)
print(cm)

[[1108    3]
 [ 412   29]]


In [30]:
cr = classification_report(y_test,y_pred_rbf)
print(cr)

              precision    recall  f1-score   support

           0       0.73      1.00      0.84      1111
           1       0.91      0.07      0.12       441

    accuracy                           0.73      1552
   macro avg       0.82      0.53      0.48      1552
weighted avg       0.78      0.73      0.64      1552



In [31]:
accuracy_score(y_test,y_pred_rbf)

0.7326030927835051

In [32]:
average_precision_score(y_test,y_pred_rbf)

0.32505858872758725

In [33]:
# Evaluate poly model

In [34]:
cm = confusion_matrix(y_test,y_pred_poly)
print(cm)

[[1067   44]
 [  44  397]]


In [35]:
cr = classification_report(y_test,y_pred_poly)
print(cr)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1111
           1       0.90      0.90      0.90       441

    accuracy                           0.94      1552
   macro avg       0.93      0.93      0.93      1552
weighted avg       0.94      0.94      0.94      1552



In [36]:
accuracy_score(y_test,y_pred_poly)

0.9432989690721649

In [37]:
average_precision_score(y_test,y_pred_poly)

0.8387587301481284

In [38]:
# Evaluate sigmoid model

In [39]:
cm = confusion_matrix(y_test,y_pred_sig)
print(cm)

[[1058   53]
 [ 158  283]]


In [40]:
cr = classification_report(y_test,y_pred_sig)
print(cr)

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1111
           1       0.84      0.64      0.73       441

    accuracy                           0.86      1552
   macro avg       0.86      0.80      0.82      1552
weighted avg       0.86      0.86      0.86      1552



In [41]:
accuracy_score(y_test,y_pred_sig)

0.8640463917525774

In [42]:
average_precision_score(y_test,y_pred_sig)

0.6423032598737416