# 垃圾短信分类

核心思想是提取文档的特征，根据特征进行分类。

In [1]:
import numpy as np
import pandas as pd
from thulac import thulac
from collections import defaultdict

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# 可以直接读取上次处理结束后的清洗完成的joblib文件
from joblib import load
df = load('message_cleaned.joblib')

In [3]:
df = pd.read_csv('data/message80W.csv', header=None, index_col=0)
df.columns = ['is_junk', 'contents']
thu = thulac(user_dict='data/userdict.txt', seg_only=True)

Model loaded succeed


In [12]:
df.head()

Unnamed: 0_level_0,is_junk,contents
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一
2,1,南口阿玛施新春第一批限量春装到店啦   春暖花开淑女裙、冰蓝色公主衫 ...
3,0,带给我们大常州一场壮观的视觉盛宴
4,0,有原因不明的泌尿系统结石等
5,0,23年从盐城拉回来的麻麻的嫁妆


对dataframe做出以下内容：
* 删去脱敏信息（在其中表示为'x'）
* 分词
* 删去只出现一次的词或停用词

In [4]:
# 读取stopwords
stopwords = None
with open('baidu_stopwords.txt', 'r') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')
stopwords[:5]

['--', '?', '“', '”', '》']

In [5]:
df['contents'] = df['contents'].str.replace('x', '', regex=False)
df['contents'] = df['contents'].apply(lambda x: thu.cut(x, text=True).split())
freq = defaultdict(int)
def countWord(row):
    global freq
    for x in row:
        freq[x] += 1
df['contents'].apply(countWord)
def filterWord(row) -> list:
    global stopwords, freq
    l = list()
    for x in row:
        if x not in stopwords and freq[x] > 1:
            l.append(x)
    return l
df['contents'] = df['contents'].apply(filterWord)
df[:5]

Unnamed: 0_level_0,is_junk,contents
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,"[商业, 秘密, 维系, 商业, 价, 值, 垄断, 地位, 前提, 条件]"
2,1,"[南口, 玛施, 新春, 第一, 批, 限量, 春装, 店, , , , 春暖花开, ..."
3,0,"[带, 大, 常州, 场, 壮观, 视觉, 盛宴]"
4,0,"[原因, 不, 明, 泌尿, 系统, 结石]"
5,0,"[23年, 盐城, 拉, 回, 麻麻, 嫁妆]"


In [6]:
# 对文件内容进行储存
from joblib import dump
dump(df, 'message_cleaned.joblib')

['message_cleaned.joblib']

创建词典并且将其转换为词向量，使用sklearn中的`CountVectorizer`转换为稀疏矩阵进行特征提取

In [7]:
cv = CountVectorizer(lowercase=False)
X = cv.fit_transform(df['contents'].apply(lambda x: ' '.join(x)))
y = df['is_junk']
del df

我们对制作好的数据集创建训练和测试集，进行模型训练

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb.score(X_test, y_test)

0.97935

In [10]:
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

0.987845