In [1]:
# SVM建立垃圾邮件分类器
import numpy as np
import pandas  as pd
from matplotlib import pyplot as plt
from scipy.io import loadmat
from sklearn import svm
import re # 正则
from nltk.stem import PorterStemmer # 自然语言处理


In [2]:
# 邮件预览
em = './spam/emailSample1.txt'
f = open(em,'r',encoding='utf-8')
email = f.read()
print(email)
f.close()


> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [3]:
# 邮件预处理

def preprocess(email):
    """做除了Word Stemming和Removal of non-words的所有处理"""
    # 大写转小写
    email = email.lower()
    # 移除html标签
    email = re.sub(r'<.*>','',email)
    # 移除url
    email = re.sub(r'(http|https)://[^\s]*','httpaddr',email)
    # 移除＄，解决dollar 和 number 的连接问题
    email = re.sub(r'[\$][0-9]+','dollar number',email)
    # 移除单个$
    email = re.sub(r'\$','dollar number',email)
    # 移除数字
    email = re.sub(r'[0-9]+', 'number', email)
    # 移除邮箱
    email = re.sub(r'[^\s]+@[^\s]+','emailaddr',email)
    return email


def preprocess2(email):
    """预处理数据 : 提取词干，去除非字符内容"""

    stemmer = PorterStemmer()
    email = preprocess(email)

    # 将邮件分割为单个单词，re.split()可以设置多种分隔符
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)

    # 遍历每个分割出来的内容
    tokenlist = []
    for token in tokens:
        #删除任何非字母的字符
        token = re.sub('[^a-zA-Z0-9]','',token)
        #提取词根
        stemmed = stemmer.stem(token)
        # 去除空字符串“:里面不包含任何字符
        if not len(token):continue
        tokenlist.append(stemmed)
    return tokenlist
email_preprocessed = preprocess2(email)
email_preprocessed

['anyon',
 'know',
 'how',
 'much',
 'it',
 'cost',
 'to',
 'host',
 'a',
 'web',
 'portal',
 'well',
 'it',
 'depend',
 'on',
 'how',
 'mani',
 'visitor',
 'you',
 're',
 'expect',
 'thi',
 'can',
 'be',
 'anywher',
 'from',
 'less',
 'than',
 'number',
 'buck',
 'a',
 'month',
 'to',
 'a',
 'coupl',
 'of',
 'dollar',
 'number',
 'you',
 'should',
 'checkout',
 'httpaddr',
 'or',
 'perhap',
 'amazon',
 'ecnumb',
 'if',
 'your',
 'run',
 'someth',
 'big',
 'to',
 'unsubscrib',
 'yourself',
 'from',
 'thi',
 'mail',
 'list',
 'send',
 'an',
 'email',
 'to',
 'emailaddr']

In [33]:
vocab_path = './spam/vocab.txt'


def build_word_list(vocab_path):
    """
    从单词表文件构建单词列表

    参数:
    vocab (str): 单词表文件路径

    返回:
    list: 单词列表，包含单词表中的所有单词
    """
    vocab = []
    with open(vocab_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) >= 2:
                word = parts[1]
                vocab.append(word)
    return vocab

def words_to_indexes(email, vocab):
    email_preprocessed = preprocess2(email)
    """
    将英文文本中的单词转换为单词表中的数字序号

    参数:
    text (str): 包含英文文本的字符串
    word_list (list): 单词表列表，包含单词表中的所有单词

    返回:
    list: 包含单词在单词表中数字序号的列表
    """
    indexes = []# 初始化空列表
    for line in email_preprocessed:
        parts = line.strip().split()
        i=0
        while i<len(vocab):
            if(vocab[i]==parts[0]):
                indexes.append(i+1)
                break
            else:
                i=i+1
        
    
    return indexes

# 示例用法
# 假设text是包含英文文本的字符串，vocab是单词表文件路径

vocab = build_word_list(vocab_path)
indexes = words_to_indexes(email, vocab)
print(len(vocab))
print("单词数字序号列表:", indexes)




1899
单词数字序号列表: [86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 1831, 883, 431, 1171, 794, 1002, 1893, 1364, 592, 1676, 238, 162, 89, 688, 945, 1663, 1120, 1062, 1699, 375, 1162, 477, 1120, 1893, 1510, 799, 1182, 1237, 810, 1895, 1440, 1547, 181, 1699, 1758, 1896, 688, 1676, 992, 961, 1477, 71, 530, 1699, 531]


In [23]:
email

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [34]:
# 对照词汇表进行特征提取
def FeatureVector(email):
    vocab_path = './spam/vocab.txt'
    vocab = build_word_list(vocab_path)
    indexes = words_to_indexes(email, vocab)
    vector=np.zeros((1,len(vocab)))
    vector=vector.flatten()
    for number in indexes:
        vector[number-1]=1
    return vector

vector = FeatureVector(email)
print(vector)
print('length of vector = {}\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))


[0. 0. 0. ... 0. 0. 0.]
length of vector = 1899
num of non-zero = 45


In [35]:
# 读取训练集与测试集

mat1 = loadmat('./spam/spamTrain.mat')
X, y = mat1['X'], mat1['y']

mat2 = loadmat('./spam/spamTest.mat')
Xtest, ytest = mat2['Xtest'], mat2['ytest']

In [36]:
clf = svm.SVC(C=0.1, kernel='linear')
clf.fit(X, y)

  y = column_or_1d(y, warn=True)


SVC(C=0.1, kernel='linear')

In [37]:
# 预测
predTrain = clf.score(X, y)
predTest = clf.score(Xtest, ytest)
print(predTrain, predTest)


0.99825 0.989
