In [15]:
import os
from collections import Counter
import numpy as np

# 数据目录(如要运行代码请自行修改!!!)
DATA_DIR = 'D:\MY\ml-data\Enron-Spam\pre-processed'
# 选取频率最高的 N 个词作为特征(由于运行速度问题,这里并不设特别大)
FEAT_NUM = 3000
# 划分训练集(可自由组合 1~6 六个目录)(测试集见最后一个代码单元)
TRAIN = [1, 2, 3, 4, 5]
TRAIN_DIRS = []
for i in TRAIN:
    TRAIN_DIRS.append(os.path.join(DATA_DIR, 'enron' + str(i)))

# 将出现次数最多的前 FEAT_NUM 个词构造为词典
def make_dict(train_dirs):
    all_words = []
    for train_dir in train_dirs:
        dirs = []
        dirs.append(os.path.join(train_dir, 'ham'))
        dirs.append(os.path.join(train_dir, 'spam'))
        for d in dirs:
            for f in os.listdir(d):
                fp = os.path.join(d, f)
                with open(fp, encoding='utf-8', errors='ignore') as email:
                    for i, line in enumerate(email):
                        if i != 1:
                            words = line.split()
                            all_words += words
    res = Counter(all_words)
    # 去除非单词和单个字母(认为其对于判断垃圾邮件没有太大帮助)
    for k in res.keys():
        if k.isalpha() == False:
            res[k] = 0
        elif len(k) == 1:
            res[k] = 0
    res = res.most_common(FEAT_NUM)
    return res

wv = make_dict(TRAIN_DIRS)
print(wv)



In [16]:
def extract_feature(data_dirs, wv):
    feature_matrix = []
    labels = []
    for data_dir in data_dirs:
        docID = 0
        # 将 ham 和 span 分开遍历, 同时填充 labels
        ham_dir = os.path.join(data_dir, 'ham')
        for f in os.listdir(ham_dir):
            feature_matrix.append(np.zeros(FEAT_NUM))
            # 0 代表 ham, 1 代表 spam
            labels.append(0)
            fp = os.path.join(ham_dir, f)
            with open(fp, encoding='utf-8', errors='ignore') as email:
                for i, line in enumerate(email):
                    if i != 1:
                        words = line.split()
                        for w in words:
                            for idx, d in enumerate(wv):
                                if d[0] == w:
                                    feature_matrix[docID][idx] = words.count(w)
                docID = docID + 1
        spam_dir = os.path.join(data_dir, 'spam')
        for f in os.listdir(spam_dir):
            feature_matrix.append(np.zeros(FEAT_NUM))
            labels.append(1)
            fp = os.path.join(spam_dir, f)
            with open(fp, encoding='utf-8', errors='ignore') as email:
                for i, line in enumerate(email):
                    if i != 1:
                        words = line.split()
                        for w in words:
                            for idx, d in enumerate(wv):
                                if d[0] == w:
                                    feature_matrix[docID][idx] = words.count(w)
                docID = docID + 1
    return feature_matrix, labels

train_matrix, train_labels = extract_feature(TRAIN_DIRS, wv)
# print(train_labels)
print(train_matrix[1])

[2. 1. 1. ... 0. 0. 0.]


In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import pandas as pd

# 划分测试集
TEST = [6]
TEST_DIRS = []
for i in TEST:
    TEST_DIRS.append(os.path.join(DATA_DIR, 'enron' + str(i)))
test_matrix, test_labels = extract_feature(TEST_DIRS, wv)

model = LinearSVC()
model.fit(train_matrix, train_labels)

result = model.predict(test_matrix)

# 展示预测结果与实际情况对比, 精确率, 召回率
rm = pd.DataFrame(
    confusion_matrix(test_labels, result),
    index=['ham', 'spam'],
    columns=['ham', 'spam']
)
print(rm)
print('precision score: ', precision_score(test_labels, result))
print('recall score: ', recall_score(test_labels, result))

ham  spam
ham   1341   159
spam  2792  1708
precision score:  0.9148366363149437
recall score:  0.37955555555555553
