In [1]:
# Antonio Emanuele Cinà
# Simple spam filter using Support Vector Machine classifier
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

TRAINING_SET = "spambase/spambase.data"

def load_csv(filename):
    fread = open(filename, "r")
    data = np.loadtxt(fread, delimiter=",")
    return data

def tfidf(email):
    ndoc = email.shape[0]
    idf = np.log10(ndoc/(email != 0).sum(0))
    return email/100.0*idf

In [2]:
email = load_csv(TRAINING_SET)
np.random.shuffle(email)

Y = email[:,57] # classes
X = email[:,:54] # values

X = tfidf(X)

In [1]:
########################
### Linear Kernel ######
########################
clf = SVC(kernel="linear", C = 1.0)
scores_ln = cross_val_score(clf, X, Y, cv = 10, n_jobs= 8)
print("Min Accuracy Linear Kernel: " + str(scores_ln.min())+"\n")
print("Mean Accuracy Linear Kernel: " + str(scores_ln.mean())+"\n")
print("Max Accuracy Linear Kernel: " + str(scores_ln.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_ln.var()) +" / " +str(scores_ln.std())+"\n")
print("=================================")

NameError: name 'SVC' is not defined

In [5]:
########################
###  Poly Kernel  ######
########################
clf_pl = SVC(kernel="poly", degree = 2, C = 1.0)
scores_pl = cross_val_score(clf_pl, X, Y, cv = 10, n_jobs= 8)
print("Min Accuracy Linear Kernel: " + str(scores_pl.min())+"\n")
print("Mean Accuracy Linear Kernel: " + str(scores_pl.mean())+"\n")
print("Max Accuracy Linear Kernel: " + str(scores_pl.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_pl.var()) +" / " +str(scores_pl.std())+"\n")
print("=================================")

Min Accuracy Linear Kernel: 0.6052060737527115

Mean Accuracy Linear Kernel: 0.6059555892945166

Max Accuracy Linear Kernel: 0.6065217391304348

Variance/Std Accuracy Linear Kernel: 3.457428732817126e-07 / 0.0005879990419054376



In [6]:
########################
### RBF Kernel #########
########################
clf_rbf = SVC(kernel="rbf", C = 1.0)
scores_rbf = cross_val_score(clf_rbf, X, Y, cv = 10, n_jobs= 8)
print("Min Accuracy RBF Kernel: " + str(scores_rbf.min())+"\n")
print("Mean Accuracy RBF Kernel: " + str(scores_rbf.mean())+"\n")
print("Max Accuracy RBF Kernel: " + str(scores_rbf.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_rbf.var()) +" / " +str(scores_rbf.std())+"\n")
print("=================================")

Min Accuracy RBF Kernel: 0.6052060737527115

Mean Accuracy RBF Kernel: 0.6059555892945166

Max Accuracy RBF Kernel: 0.6065217391304348

Variance/Std Accuracy Linear Kernel: 3.457428732817126e-07 / 0.0005879990419054376



In [7]:
norms = np.sqrt(((X+1e-128) ** 2).sum(axis=1, keepdims=True))
XX = np.where(norms > 0.0, X / norms, 0.)

In [8]:
########################
### Linear Kernel ######
########################
clf_a = SVC(kernel="linear", C = 1.0)
scores_ln_a = cross_val_score(clf_a, XX, Y, cv = 10, n_jobs= 8)
print("Min Accuracy Linear Kernel: " + str(scores_ln_a.min())+"\n")
print("Mean Accuracy Linear Kernel: " + str(scores_ln_a.mean())+"\n")
print("Max Accuracy Linear Kernel: " + str(scores_ln_a.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_ln_a.var()) +" / " +str(scores_ln_a.std())+"\n")
print("=================================")

Min Accuracy Linear Kernel: 0.9

Mean Accuracy Linear Kernel: 0.9215494443242417

Max Accuracy Linear Kernel: 0.9477124183006536

Variance/Std Accuracy Linear Kernel: 0.00016820496178325333 / 0.012969385559202615



In [9]:
############################
### Polynomial Kernel ######
############################
clf_poly_a = SVC(kernel="poly", degree = 2, C = 1.0)
scores_pl_a = cross_val_score(clf_poly_a, XX, Y,cv = 10, n_jobs= 8)
print("Min Accuracy Poly Kernel: " + str(scores_pl_a.min())+"\n")
print("Mean Accuracy Poly Kernel: " + str(scores_pl_a.mean())+"\n")
print("Max Accuracy Poly Kernel: " + str(scores_pl_a.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_pl_a.var()) +" / " +str(scores_pl_a.std())+"\n")
print("=================================")

Min Accuracy Poly Kernel: 0.6052060737527115

Mean Accuracy Poly Kernel: 0.6059555892945166

Max Accuracy Poly Kernel: 0.6065217391304348

Variance/Std Accuracy Linear Kernel: 3.457428732817126e-07 / 0.0005879990419054376



In [10]:
########################
### RBF Kernel #########
########################
clf_rbf_a = SVC(kernel="rbf", C = 1.0)
scores_rbf_a = cross_val_score(clf_rbf_a, XX, Y, cv = 10, n_jobs= 8)
print("Min Accuracy RBF Kernel: " + str(scores_rbf_a.min())+"\n")
print("Mean Accuracy RBF Kernel: " + str(scores_rbf_a.mean())+"\n")
print("Max Accuracy RBF Kernel: " + str(scores_rbf_a.max())+"\n")
print("Variance/Std Accuracy Linear Kernel: " + str(scores_rbf_a.var()) +" / " +str(scores_rbf_a.std())+"\n")
print("=================================")

Min Accuracy RBF Kernel: 0.8956521739130435

Mean Accuracy RBF Kernel: 0.9161193691430695

Max Accuracy RBF Kernel: 0.9411764705882353

Variance/Std Accuracy Linear Kernel: 0.0001661921510609118 / 0.01289155347740961



In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
xx_train, xx_test, yy_train, yy_test = train_test_split(XX, Y, test_size=0.3)
n = XX.shape[0]
n_train = xx_train.shape[0]
print(n_train)

3220


In [16]:
def print_model(clf, xx_train, yy_train):
    clf_fit = clf.fit(xx_train, yy_train)
    print(str(clf_fit.score(xx_train, yy_train)))
    print(str(clf_lna.n_support_)) # number of support vectors for each class
    print(str(clf_lna.support_vectors_)) # print support_vectors

In [17]:
clf_a = SVC(kernel= "linear", C = 1)
clf_lna = clf_a.fit(xx_train, yy_train)
print(str(clf_a.score(xx_train, yy_train)))

0.9236024844720497


In [18]:
### Parameters for linear angular kernel ###
print(str(clf_lna.support_vectors_.shape))
print(str(clf_lna.n_support_))
print(clf_lna.support_vectors_.shape[0]/n_train)
print(str(clf_lna.support_vectors_[0]))

(697, 54)
[346 351]
0
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.52271324 0.85250857]


In [20]:
### Parameters for linear kernel ###
clf_ln = clf.fit(x_train, y_train)
print(str(clf.score(x_train, y_train)))
print(str(clf_ln.support_vectors_.shape))
print(str(clf_ln.n_support_))
print(str(clf_lna.support_vectors_[0]))

0.6021739130434782
(2571, 54)
[1287 1284]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.52271324 0.85250857]


In [21]:
def print_model(clf, xx_train, yy_train):
    clf_fit = clf.fit(xx_train, yy_train)
    print(str(clf_fit.score(xx_train, yy_train)))
    print(str(clf_lna.n_support_)) # number of support vectors for each class
    print(str(clf_lna.support_vectors_)) # print support_vectors

In [24]:
# Polynomial kernel parameters
print_model(clf_pl, xx_train, yy_train)

0.6049689440993788
[346 351]
[[0.         0.         0.         ... 0.         0.52271324 0.85250857]
 [0.         0.1387799  0.         ... 0.07742835 0.         0.        ]
 [0.         0.         0.4868123  ... 0.25998987 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.0052838  0.         0.        ]
 [0.         0.44925875 0.         ... 0.03547229 0.05622225 0.        ]
 [0.         0.         0.         ... 0.09661426 0.46014426 0.        ]]


In [27]:
# Polynomial angular kernel parameters
print_model(clf_poly_a, xx_train, yy_train)
# Note that for standard polynomial and angular polynomial we obtain the same parameters and results

0.6049689440993788
[346 351]
[[0.         0.         0.         ... 0.         0.52271324 0.85250857]
 [0.         0.1387799  0.         ... 0.07742835 0.         0.        ]
 [0.         0.         0.4868123  ... 0.25998987 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.0052838  0.         0.        ]
 [0.         0.44925875 0.         ... 0.03547229 0.05622225 0.        ]
 [0.         0.         0.         ... 0.09661426 0.46014426 0.        ]]


In [28]:
# RBF kernel parameters
print_model(clf_rbf, xx_train, yy_train)

0.9139751552795031
[346 351]
[[0.         0.         0.         ... 0.         0.52271324 0.85250857]
 [0.         0.1387799  0.         ... 0.07742835 0.         0.        ]
 [0.         0.         0.4868123  ... 0.25998987 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.0052838  0.         0.        ]
 [0.         0.44925875 0.         ... 0.03547229 0.05622225 0.        ]
 [0.         0.         0.         ... 0.09661426 0.46014426 0.        ]]


In [29]:
# Angular RBF kernel parameters
print_model(clf_rbf_a, xx_train, yy_train)

0.9139751552795031
[346 351]
[[0.         0.         0.         ... 0.         0.52271324 0.85250857]
 [0.         0.1387799  0.         ... 0.07742835 0.         0.        ]
 [0.         0.         0.4868123  ... 0.25998987 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.0052838  0.         0.        ]
 [0.         0.44925875 0.         ... 0.03547229 0.05622225 0.        ]
 [0.         0.         0.         ... 0.09661426 0.46014426 0.        ]]
