In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 'svm' is a module in the 'sklearn' library that contains support vector machine algorithms.
from sklearn import svm
# 'classification_report' is a function in the 'sklearn.metrics' module that builds a text report showing the main classification metrics.
from sklearn.metrics import classification_report
# 'LogisticRegression' is a class in the 'sklearn.linear_model' module for logistic regression classification.
from sklearn.linear_model import LogisticRegression
# 'loadmat' is a function in the 'scipy.io' module that reads MATLAB mat files.
from scipy.io import loadmat
# 'GridSearchCV' is a class in the 'sklearn.model_selection' module that implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are available in the estimator used.
from sklearn.model_selection import GridSearchCV

In [59]:
vocab_path = r"D:\MyProjects\Python\Andrew Ng_ML_Exercise\python代码\ex6-SVM\data\vocab.txt"
train_path = r"D:\MyProjects\Python\Andrew Ng_ML_Exercise\python代码\ex6-SVM\data\spamTrain.mat"
test_path = r"D:\MyProjects\Python\Andrew Ng_ML_Exercise\python代码\ex6-SVM\data\spamTest.mat"

In [60]:
tr_data = loadmat(train_path)
te_data = loadmat(test_path)
X, y, Xtest, ytest = tr_data['X'], tr_data['y'].flatten(), te_data['Xtest'], te_data['ytest'].flatten()

In [None]:
[n.shape for n in [X, y, Xtest, ytest]]

### spam with SVM

In [61]:
svc = svm.SVC(kernel='linear')
svc.fit(X,y)
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [None]:
y_pre = svc.predict(Xtest)
res = classification_report(ytest, y_pre)
print(res)

### get the most probability feature for spam

In [88]:
k=20
fea_weight = svc.coef_
abs_fweight = np.mean(np.abs(fea_weight), axis=0)
vip_fdex = np.argsort(abs_fweight)[-k:]

In [92]:
with open(vocab_path, 'r') as file:
    mydict = {int(line.split()[0]):line.split()[1] for line in file}
vip_words = list(map(mydict.get,vip_fdex),)
vip_words

['clearli',
 'visa',
 'that',
 'remot',
 'numberanumb',
 'why',
 'flag',
 'http',
 'et',
 'urgent',
 'instant',
 'datapow',
 'spam',
 'steve',
 'addit',
 'otherwis',
 'kid',
 'round',
 'wrong',
 'studi']

In [106]:
su_index = svc.support_
su_fea = X[su_index]
fea_mean = np.mean(su_fea,axis=0)
vip_findex = np.argsort(np.abs(fea_mean))[-k:]
vip_fea = list(map(mydict.get,vip_findex))
vip_fea

['option',
 'bd',
 'friend',
 'thank',
 'young',
 'email',
 'issu',
 'wish',
 'old',
 'york',
 'these',
 'food',
 'irish',
 'octob',
 'improv',
 'analyst',
 'http',
 'null',
 'tm',
 'that']

### what about linear logistic regresion?

In [109]:
logi  = LogisticRegression()
logi.fit(X,y)
logi.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [113]:
y_pre = logi.predict(Xtest)
res = classification_report(ytest, y_pre)
print(res)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       692
           1       0.99      0.99      0.99       308

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000

