In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import codecs
import string
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes as bayes
from sklearn.model_selection import train_test_split

### 文件读入

In [2]:
data = pd.read_excel('chinesespam.xlsx')
data

Unnamed: 0,type,text
0,ham,1506讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老...
1,ham,那他为什么不愿意起诉，既然这样了！起诉后也有充分的理由！MM莫不是还生活在电影中，个人认为这...
2,ham,我觉得，负债不要紧，最重要的是能负得起这个责任来，\n欠了那么多钱，至少对当初拿出爱心来的网...
3,ham,"公司现在有内部推荐机会,2-3人主要从事视频编解码器在pc/dsp/arm上的优化工作.(h..."
4,ham,鼓励一下！\n还是让姐姐们给你解答更好吧。\n 赫赫，很少有女生追男生的例子。不过还...
...,...,...
145,spam,"EMC整改与对策\n\n\n 课程收益：\n 系统学习了解EMC的测试方法,对策理论..."
146,spam,您的IP：211.68.236.105 在发送大量垃圾邮件，请检查。\n\n\n信件已收到！...
147,spam,您好\n\n 商务邮件网为满足客户的需求，现特价118元提供如下产品信息：\n\n ...
148,spam,\n\n尊敬的负责人（经理／财务）您好！\n 我是新永友广深（深圳）实业有限公司；在我公...


In [3]:
data.shape

(150, 2)

In [4]:
data.loc[data['type']=='spam'].shape[0]

50

### 加载停用词

In [5]:
stopwords = codecs.open(os.path.join'stopwords.txt','r','UTF8').read().split('\r\n')

###  分词

In [10]:
processed_texts = []
for text in data['text']:
    words = []
    seg_list = jieba.cut(text)
    for seg in seg_list:
        if (seg.isalpha()) & (seg not in stopwords):
            words.append(seg)
    sentence = " ".join(words)
    processed_texts.append(sentence)
data['text'] = processed_texts
data.head()

Unnamed: 0,type,text
0,ham,讲 的 是 孔子 后人 的 故事 一个 老 领导 回到 家乡 跟 儿子 感情 不 和 跟 贪...
1,ham,那 他 为什么 不 愿意 起诉 既然 这样 了 起诉 后 也 有 充分 的 理由 MM 莫不...
2,ham,我 觉得 负债 不要紧 最 重要 的 是 能 负得起 这个 责任 来 欠 了 那么 多钱 至...
3,ham,公司 现在 有 内部 推荐 机会 人 主要 从事 视频 编解码器 在 pcdsparm 上 ...
4,ham,鼓励 一下 还是 让 姐姐 们 给 你 解答 更好 吧 赫赫 很少 有 女生 追 男生 的 ...


### 训练词向量

In [11]:
vectorizer = CountVectorizer(binary=False)
vectorizer.fit(data['text'])

vocabulary = vectorizer.vocabulary_
print('There are ', len(vocabulary),' word features.')

vector = vectorizer.transform(data['text'])
textmatrix = pd.DataFrame(vector.toarray())
textmatrix.head()

There are  6567  word features.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6557,6558,6559,6560,6561,6562,6563,6564,6565,6566
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
keys = []
values = []
for key,value in vectorizer.vocabulary_.items():
    keys.append(key)
    values.append(value)
df = pd.DataFrame(data = {'key': keys,'value': values})
colnames = df.sort_values('value')['key'].values
textmatrix.columns = colnames
textmatrix.head()

Unnamed: 0,aac,aav,abc,aemysonyamericansinglescupidjunction,age,annualhouseholdincome,areyouupforthechallengeofentrepreneurshipeverwantedtobepartofahighpotentialstartupcompanyhardatplaynothardatworksoundsappealingjoinadynamicinternetstartuplocatedinzhongguancunbeijingourlivelymembersarefromamericachinahongkongandenglandwefosteramultilingualmulticulturalworkenviornmentwearelookingforenergeticandcreativepeoplewhohavethespirittobuildacompanyfromthegroundupwithusyourresponsibilitieswillextendfarbeyondthetraditionaljobdescriptionboundariesyouwillbeinvolvedineveryaspectofthestartupprocessinterested,asq,a型,babyface,...,黑龙江省,默认,默默,默默地,黯然,鼎力支持,鼎韵,鼓励,鼠标,齐全
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
vocabulary

{'孔子': 2601,
 '后人': 2038,
 '故事': 3711,
 '一个': 332,
 '领导': 6447,
 '回到': 2180,
 '家乡': 2709,
 '儿子': 1297,
 '感情': 3296,
 '贪财': 5854,
 '孙子': 2611,
 '孔为': 2600,
 '和睦': 2089,
 '弟弟': 3046,
 '魏宗万': 6532,
 '马车': 6497,
 '洋妞': 4381,
 '大概': 2433,
 '考察': 5369,
 '民俗': 4289,
 '他们': 1008,
 '过年': 6013,
 '总想': 3211,
 '出国': 1499,
 '爷爷': 4575,
 '教育': 3729,
 '最后': 3951,
 '一家人': 373,
 '基本': 2292,
 '和解': 2090,
 '顺便': 6431,
 '另一类': 1908,
 '电影': 4735,
 '北京': 1684,
 '青年电影制片厂': 6398,
 '越战': 5924,
 '背景': 5413,
 '军人': 1428,
 '介绍': 991,
 '对象': 2750,
 '相亲': 4847,
 '女方': 2517,
 '军队': 1429,
 '医院': 1699,
 '护士': 3484,
 '犹豫不决': 4613,
 '总是': 3213,
 '回忆': 2187,
 '战场': 3365,
 '负伤': 5822,
 '男友': 4763,
 '好像': 2526,
 '男方': 4768,
 '表示': 5605,
 '理解': 4674,
 '归队': 3055,
 '为什么': 769,
 '愿意': 3305,
 '起诉': 5912,
 '既然': 3847,
 '这样': 6056,
 '充分': 1305,
 '理由': 4673,
 'mm': 187,
 '莫不是': 5540,
 '生活': 4696,
 '个人': 717,
 '认为': 5692,
 '这么': 6045,
 '结婚': 5255,
 '恰恰': 3236,
 '认真': 5697,
 '没有': 4346,
 '何来': 1165,
 '传统': 1119,
 '家庭': 2712,
 '责任感'

### 特征选择

In [15]:
features = pd.DataFrame(textmatrix.apply(sum,axis=0))
features

Unnamed: 0,0
aac,1
aav,1
abc,1
aemysonyamericansinglescupidjunction,1
age,1
...,...
鼎力支持,1
鼎韵,2
鼓励,1
鼠标,12


In [19]:
extractedfeatures = [features.index[i] for i in range(features.shape[0]) if features.iloc[i,0] > 5]
textmatrix = textmatrix[extractedfeatures]
textmatrix.shape[1]

778

### 划分训练集测试集

In [20]:
X_train,X_test,y_train,y_test = train_test_split(textmatrix,data['type'],test_size=0.2)

In [21]:
bys = bayes.BernoulliNB(alpha=1,binarize=True)
model = bys.fit(X_train,y_train)

In [22]:
model.predict(X_test)

array(['ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham'], dtype='<U4')

In [None]:
model.score