## 读取 csv 数据文件

In [1]:
import pandas as pd

# Location of the raw SMS spam dataset, relative to the notebook.
SPAM_CSV = 'data_spam/spam.csv'

# The file is decoded with the 'latin' codec (presumably because it is not
# valid UTF-8 -- TODO confirm against the data source).
df = pd.read_csv(SPAM_CSV, encoding='latin')

# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## 重命名数据中的列，提高可读性

In [2]:
# Give the two meaningful columns descriptive names: v1 -> Label, v2 -> Text.
# rename() returns a new frame, so `df` is rebound to the result instead of
# mutating the existing object with inplace=True; the cell stays a plain
# expression that can be chained and re-run.
df = df.rename(columns={'v1': 'Label', 'v2': 'Text'})
df.head()

Unnamed: 0,Label,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## 把 'ham'、'spam' 标签映射为 0 和 1

In [3]:
# Encode the class label numerically: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
df['numLabel'] = df['Label'].map(label_to_id)

# Show the frame with the new numLabel column appended.
df.head()

Unnamed: 0,Label,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4,numLabel
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


## 统计ham个数，spam个数

In [4]:
# Count the messages in each class using the numeric label column
# (filtering on df.Label == 'ham' / 'spam' would select the same rows).
ham_rows = df[df['numLabel'] == 0]
spam_rows = df[df['numLabel'] == 1]

print("ham ", len(ham_rows))
print("spam", len(spam_rows))
print("total", len(df))

ham  4825
spam 747
total 5572


## 统计文本的长度信息
`df.loc[行标签, 列名]` 按行标签和列名定位单元格，可用于取值或赋值。

In [5]:
# Character length of every message, as a plain Python list.
# Series.str.len() is vectorised and yields the same values as calling
# len(df.loc[i, 'Text']) row by row, but it does not rely on the index
# being exactly 0..len(df)-1 and avoids a Python-level loop.
text_len = df['Text'].str.len().tolist()

## 直方图显示长度
### plt.hist(x, bins=None, range=None, density=None, weights=None, cumulative=False, bottom=None, histtype='bar', align='mid', orientation='vertical', rwidth=None, log=False, color=None, label=None, stacked=False, normed=None)

- x：指定要绘制直方图的数据；输入值，这需要一个数组或者一个序列，不需要长度相同的数组。
- bins：指定直方图条形的个数；
- range：指定直方图数据的上下界，默认包含绘图数据的最大值和最小值；
- density：布尔值，可选。若为 True，返回元组的第一个元素是归一化后的概率密度，即直方图下的面积（积分）为 1；其做法是将每个箱子的计数除以（观测总数 × 箱宽），而不是只除以观测总数。若 stacked 也为 True，则各直方图之和被归一化为 1。（替代 normed）
- weights：该参数可为每一个数据点设置权重；
- cumulative：是否需要计算累计频数或频率；
- bottom：可以为直方图的每个条形添加基准线，默认为0；
- histtype：指定直方图的类型，默认为bar，除此还有’barstacked’, ‘step’, ‘stepfilled’；
- align：设置条形边界值的对齐方式，默认为mid，除此还有’left’和’right’；
- orientation：设置直方图的摆放方向，默认为垂直方向；
- rwidth：设置直方图条形宽度的百分比；
- log：是否需要对绘图数据进行log变换；
- color：设置直方图的填充色；
- label：设置直方图的标签，可通过legend展示其图例；
- stacked：当有多个数据时，是否需要将直方图呈堆叠摆放，默认水平摆放；
- normed：是否将直方图的频数转换成频率；(弃用，被density替代)
- alpha：透明度，浮点数。

In [6]:
# np and mlab are not used in this cell; the imports are kept in case other
# cells rely on them.
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Histogram of message lengths: 100 bins over the full range of the data,
# with the visible x-axis cropped to 0-200 characters.
fig, ax = plt.subplots()
ax.hist(text_len, bins=100, facecolor='red', alpha=0.5)
ax.set_xlim([0, 200])

# Title and axis labels so the figure can be read on its own.
ax.set_title('Distribution of SMS text length')
ax.set_xlabel('Text length (characters)')
ax.set_ylabel('Number of messages')
plt.show()

<Figure size 640x480 with 1 Axes>

## 导入 NLTK 英文停用词库
you,or,we,this,..等等

In [7]:
import nltk
from nltk.corpus import stopwords

# Build the set of English stop words. If the corpus has not been installed
# yet, NLTK raises LookupError; download it once and retry so the cell also
# works on a fresh environment without manually un-commenting a download call.
try:
    stopset = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopset = set(stopwords.words('english'))
print(stopset)

{'mightn', 'hers', 'aren', 'hadn', 're', 'couldn', "hasn't", 'wasn', 'me', 'for', 'over', 'why', 'because', "don't", 'their', 'how', 'needn', 'once', 'should', 'ourselves', 'm', 'or', 'hasn', 'off', 'mustn', 'had', 'at', 'ain', 'into', 'against', 'to', 'her', 'with', "didn't", 'he', 'am', "that'll", 'not', 'themselves', "weren't", 'up', 'yourself', 'your', 'where', 'doesn', 'you', 'through', 'too', 'only', 's', 'below', 'which', 'its', 'that', 'if', 'these', 'is', 'them', 'of', 'by', 'down', 't', "haven't", 'ma', 'll', 'did', 'in', 'a', 'than', 'after', 'isn', "should've", 'from', 'ours', "shouldn't", 'don', 'each', "you're", 'herself', 'myself', 'shan', 'yours', 'before', 'under', 'just', 'both', 'few', "isn't", 'o', 'some', 'most', 'yourselves', 'and', "shan't", "doesn't", 'any', "you'd", 'my', 'now', 'between', 'very', "wouldn't", "aren't", 'won', 'an', 'out', 'those', 'i', 'other', 'whom', 'are', 'further', 'such', 'd', 'own', "mustn't", 'shouldn', 'didn', 'when', 'so', "wasn't", '

## 构建文本向量
基于词频的表示

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Term-count representation of the messages.
# NOTE(review): CountVectorizer() is built with default arguments, so the
# `stopset` created earlier is not applied here -- confirm whether stop-word
# removal was intended.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Text'])

# Target vector taken from the numeric label column (0 = ham, 1 = spam).
y = df['numLabel']

# Print the (row, column) -> count entries of the fitted matrix.
print(X)

  (0, 8267)	1
  (0, 1069)	1
  (0, 3594)	1
  (0, 7645)	1
  (0, 2048)	1
  (0, 1749)	1
  (0, 4476)	1
  (0, 8489)	1
  (0, 3634)	1
  (0, 1751)	1
  (0, 4087)	1
  (0, 5537)	1
  (0, 1303)	1
  (0, 2327)	1
  (0, 5920)	1
  (0, 4350)	1
  (0, 8030)	1
  (0, 3550)	1
  (1, 5533)	1
  (1, 8392)	1
  (1, 4318)	1
  (1, 4512)	1
  (1, 5504)	1
  (2, 77)	1
  (2, 1156)	1
  :	:
  (5570, 1786)	1
  (5570, 3470)	1
  (5570, 2892)	1
  (5570, 7049)	1
  (5570, 1778)	1
  (5570, 8065)	1
  (5570, 2592)	1
  (5570, 5334)	1
  (5570, 1438)	1
  (5570, 7627)	1
  (5570, 3308)	1
  (5570, 7039)	1
  (5570, 4615)	1
  (5570, 1084)	1
  (5570, 8313)	1
  (5570, 4218)	1
  (5570, 3781)	1
  (5570, 7756)	1
  (5570, 3358)	1
  (5570, 4087)	1
  (5571, 6505)	1
  (5571, 7885)	1
  (5571, 4225)	2
  (5571, 5244)	1
  (5571, 7756)	1


## 拆分训练数据和测试数据集

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
split = train_test_split(X, y, test_size=0.2, random_state=100)
X_train, X_test, y_train, y_test = split

# Report how many samples ended up in each partition.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print("训练数据中的样本个数: ", n_train, "测试数据中的样本个数: ", n_test)

训练数据中的样本个数:  4457 测试数据中的样本个数:  1115


## 贝叶斯做训练

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Multinomial naive Bayes on the count features; fit() returns the estimator
# itself, so construction and training can be chained.
clf = MultinomialNB(alpha=1.0, fit_prior=True).fit(X_train, y_train)

# Predict the held-out messages and score them against the true labels.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("accuracy on test data: ", test_accuracy)

accuracy on test data:  0.97847533632287


## 打印混淆矩阵

True Positive(真正，TP): 将正类预测为正类

True Negative(真负，TN): 将负类预测为负类

False Positive(假正，FP): 将负类预测为正类　

False Negative(假负，FN): 将正类预测为负类

输出对应 [[TN, FP], [FN, TP]]：行表示真实标签，列表示预测标签，标签顺序为 labels=[0, 1]（0 = ham 为负类，1 = spam 为正类）

In [16]:
from sklearn.metrics import confusion_matrix

# With labels=[0, 1], index 0 is ham and index 1 is spam on both axes.
# scikit-learn puts the true labels on the rows and the predicted labels on
# the columns.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
cm

array([[956,  14],
       [ 10, 135]])