# 1：SVM与垃圾邮件分类
### **by:MLZZY**
**实验描述：实现高斯内核的SVM并实现垃圾邮件分类**

**1：SVM**

**先使用第一个数据集——二维数据集**

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat

In [47]:
raw_data=loadmat('/home/mw/input/andrew_ml_ex67101/ex6data1.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y']=raw_data.get('y')
data.head()

Unnamed: 0,X1,X2,y
0,1.9643,4.5957,1
1,2.2753,3.8589,1
2,2.9781,4.5651,1
3,2.932,3.5519,1
4,3.5772,2.856,1


In [48]:
#进行数据可视化
def plot_init_data(data,fig,ax):
    positive=data[data['y'].isin([1])]
    negative=data[data['y'].isin([0])]
    ax.scatter(positive['X1'],positive['X2'],s=50,marker='o',label='Positive')
    ax.scatter(negative['X1'],negative['X2'],s=50,marker='x',label='Negative')

In [49]:
fig,ax=plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()

In [50]:
from sklearn import svm
#使用线性分类，并令参数C=1
svc=svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
svc

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [51]:
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])



0.9803921568627451

In [52]:
#分类边界可视化
def find_decision_boundary(svc,x1min,x1max,x2min,x2max,diff):
    x1=np.linspace(x1min,x1max,1000)
    x2=np.linspace(x2min,x2max,1000)
    cordinates=[(x,y) for x in x1 for y in x2]
    x_cord,y_cord=zip(*cordinates)
    c_val=pd.DataFrame({'x1':x_cord,'x2':y_cord})
    c_val['cval']=svc.decision_function(c_val[['x1','x2']])
    decision=c_val[np.abs(c_val['cval'])<diff]
    return decision.x1,decision.x2

In [53]:
x1,x2=find_decision_boundary(svc,0,4,1.5,5,2*10**-3)
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='Boundary')
plot_init_data(data,fig,ax)
ax.set_title('SVM (C=1) Decision Boundary')
ax.legend()
plt.show()

In [54]:
#上图的结果为C=1时，下面增大C值
svc2=svm.LinearSVC(C=100,loss='hinge',max_iter=1000)
svc2.fit(data[['X1','X2']],data['y'])
svc2.score(data[['X2','X2']],data['y'])



0.803921568627451

In [55]:
x1,x2=find_decision_boundary(svc2,0,4,1.5,5,2*10**-3)
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='Boundary')
plot_init_data(data,fig,ax)
ax.set_title('SVM (C=100) Decision Boundary')
ax.legend()
plt.show()

**下面实现高斯内核的SVM**

![Image Name](https://cdn.kesci.com/upload/image/pzk8sm30t3.png?imageView2/0/w/960/h/960)

In [56]:
def gaussian_kernel(x1,x2,sigma):
    return np.exp(-(np.sum((x1-x2)**2)/(2*(sigma**2))))

In [57]:
x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
sigma = 2
gaussian_kernel(x1, x2, sigma)

0.32465246735834974

**在数据集2上使用高斯内核SVM**

In [58]:
raw_data=loadmat('/home/mw/input/andrew_ml_ex67101/ex6data2.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y']=raw_data.get('y')
data.head()

Unnamed: 0,X1,X2,y
0,0.107143,0.60307,1
1,0.093318,0.649854,1
2,0.097926,0.705409,1
3,0.15553,0.784357,1
4,0.210829,0.866228,1


In [59]:
fig,ax=plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()

In [60]:
#不指定核函数，默认为高斯核函数
svc=svm.SVC(C=100,gamma=10,probability=True)
svc

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [61]:
svc.fit(data[['X1', 'X2']], data['y'])
svc.score(data[['X1', 'X2']], data['y'])

0.9698725376593279

In [62]:
x1, x2 = find_decision_boundary(svc, 0, 1, 0.4, 1, 0.01)
fig, ax = plt.subplots(figsize=(12,8))
plot_init_data(data, fig, ax)
ax.scatter(x1, x2, s=10)
plt.show()

**使用第三个数据集（包含训练集和验证集），使用验证集为SVM模型寻找最优超参数（C与sigma）**

In [63]:
raw_data=loadmat('/home/mw/input/andrew_ml_ex67101/ex6data3.mat')
#训练集
X = raw_data['X']
y = raw_data['y'].ravel()
#验证集
Xval = raw_data['Xval']
yval = raw_data['yval'].ravel()

In [64]:
fig, ax = plt.subplots(figsize=(12,8))
data = pd.DataFrame(raw_data.get('X'), columns=['X1', 'X2'])
data['y'] = raw_data.get('y')
plot_init_data(data, fig, ax)
ax.legend()
plt.show()

In [65]:
C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
best_score = 0
best_params = {'C': None, 'gamma': None}
for C in C_values:
    for gamma in gamma_values:
        svc = svm.SVC(C=C, gamma=gamma)
        svc.fit(X, y)
        score = svc.score(Xval, yval)      
        if score > best_score:
            best_score = score
            best_params['C'] = C
            best_params['gamma'] = gamma
best_score, best_params

(0.965, {'C': 0.3, 'gamma': 100})

In [66]:
svc = svm.SVC(C=best_params['C'], gamma=best_params['gamma'])
svc.fit(X, y)
x1, x2 = find_decision_boundary(svc, -0.6, 0.3, -0.7, 0.6, 0.005)
fig, ax = plt.subplots(figsize=(12,8))
plot_init_data(data, fig, ax)
ax.scatter(x1, x2, s=10)
ax.legend()
plt.show()

**2：垃圾邮件分类**
**使用SVM构建垃圾邮件分类器**

**预处理后的数据已提供**
**每封邮件已经转换为一个向量，其中1899个维，对应于词汇表中的1899个单词。 它们的值为0/1，表示邮件中是否存在单词。**

In [67]:
spam_train=loadmat('/home/mw/input/andrew_ml_ex67101/spamTrain.mat')
spam_test=loadmat('/home/mw/input/andrew_ml_ex67101/spamTest.mat')
spam_test

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13 14:27:39 2011',
 '__version__': '1.0',
 '__globals__': [],
 'Xtest': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'ytest': array([[1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
      

In [76]:
#训练集
X = spam_train['X']
y = spam_train['y'].ravel()
#测试集
Xtest = spam_test['Xtest']
ytest = spam_test['ytest'].ravel()
X.shape, y.shape, Xtest.shape, ytest.shape

((4000, 1899), (4000,), (1000, 1899), (1000,))

In [81]:
svc=svm.SVC(gamma='auto')
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [82]:
svc.fit(X,y)
print('Training accuracy = {0}%'.format(np.round(svc.score(X, y) * 100, 2)))
print('Test accuracy = {0}%'.format(np.round(svc.score(Xtest, ytest) * 100, 2)))

Training accuracy = 94.4%
Test accuracy = 95.3%


In [88]:
kw = np.eye(1899)
spam_val = pd.DataFrame({'idx':range(1899)})
spam_val.head()

Unnamed: 0,idx
0,0
1,1
2,2
3,3
4,4


In [89]:
spam_val['isspam'] = svc.decision_function(kw)

In [90]:
spam_val.head()

Unnamed: 0,idx,isspam
0,0,-0.611871
1,1,-0.608278
2,2,-0.627371
3,3,-0.615271
4,4,-0.654248


In [91]:
spam_val['isspam'].describe()

count    1899.000000
mean       -0.622564
std         0.021603
min        -0.763138
25%        -0.631616
50%        -0.624299
75%        -0.615918
max        -0.364164
Name: isspam, dtype: float64

In [92]:
decision = spam_val[spam_val['isspam'] > -0.55]
decision

Unnamed: 0,idx,isspam
173,173,-0.546054
297,297,-0.364164
478,478,-0.543913
529,529,-0.524535
680,680,-0.524313
738,738,-0.549962
774,774,-0.489291
1059,1059,-0.547064
1088,1088,-0.531602
1163,1163,-0.549382


In [96]:
path ='/home/mw/input/andrew_ml_ex67101/vocab.txt'
voc = pd.read_csv(path, header=None, names=['idx', 'voc'], sep = '\t')
voc.head()

Unnamed: 0,idx,voc
0,1,aa
1,2,ab
2,3,abil
3,4,abl
4,5,about


In [97]:
spamvoc = voc.loc[list(decision['idx'])]
spamvoc

Unnamed: 0,idx,voc
173,174,below
297,298,click
478,479,dollarnumb
529,530,email
680,681,free
738,739,guarante
774,775,here
1059,1060,monei
1088,1089,nbsp
1163,1164,offer
