In [1]:
import jieba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
filepath = "data/traindata.csv"
# 加载全部数据
alldata = pd.read_csv(filepath,header=None)
alldata.columns=["label","content"]
alldata.shape

(800000, 2)

In [168]:
# 随机抽取n条数据
# data = alldata.sample(n=500)
data = alldata.iloc[:500]

In [169]:
data["label"].value_counts()

0    451
1     49
Name: label, dtype: int64

In [330]:
def getStopWords(filepath):
    """
    函数说明：获取停用词，包括数字，标点符号，常见的中文词汇“的、地、得”等
    Parameter:
        filepath - 停用词文件的路径
    Return:
        stopwords - 以集合形式返回的停用词列表
    Modify:
        2017-12-02
    """
    stopwordfile = open(filepath,"r",encoding="utf-8").readlines()
    stopwords = {line.strip() for line in stopwordfile}
#     stopwords = set()
    return stopwords

In [331]:
stopwordsfile = "data/stopwords.txt"
stopwords = getStopWords(stopwordsfile)

In [332]:
print(len(stopwords))

9


In [333]:
def getVocalist(contents,stopwords):
    """
    函数说明：获取去除停用词后的词汇表，来减小向量维度的大小
    Parameter:
        data - 待处理的文本数据
        stopwords - 停用词列表
    Return:
        list(vocalist)- 文本词汇表
    Modify:
        2017-12-02
    """
    vocalist = set([])
    for i in range(len(data)):
        content = contents.iloc[i]
        segresult = set(jieba.cut(content))
        # 两个集合求并集
        vocalist = vocalist | segresult
    vocalist = vocalist - stopwords
    # print(vocalist)
    print("词汇表长度为：",len(vocalist))
    return list(vocalist)

In [334]:
contents = data["content"]
vocalist=getVocalist(contents,stopwords)   

词汇表长度为： 3296


In [335]:
# 文本信息矢量化
def getVect(content,vocalist,stopwords):
    """
    函数说明：得到每个短信的0-1向量表示，如[0,0,0,1,0,1,0,0,0,0....]，向量长度与词汇表长度相同
    Paremeter:
        content -  每条短信的内容
        vocalist - 词汇表
        stopwords - 停用词列表
    Return:
        vector - 每条短信的向量表示
    Modify:
        2017-12-02
    """
    vector= [0]*len(vocalist)
    segresult = set(jieba.cut(content))-stopwords
    for word in segresult:
        vector[vocalist.index(word)] = 1
    return vector

In [336]:
# 得到每个文本的矢量化结果
data["vector"] = data.apply(lambda row:getVect(row["content"],vocalist,stopwords),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [337]:
# 随机划分训练数据和测试数据 8:2
# traindata,testdata = train_test_split(data,test_size=0.2)
traindata = data.iloc[:int(0.8*len(data))]
testdata = data.iloc[int(0.8*len(data)):]

In [338]:
print("训练数据的标签分布：")
print(traindata["label"].value_counts())
print("测试数据的标签分布：")
print(testdata["label"].value_counts())

训练数据的标签分布：
0    357
1     43
Name: label, dtype: int64
测试数据的标签分布：
0    94
1     6
Name: label, dtype: int64


In [339]:
# 构建分类器
def trainclassifier(traindata):
    """
    函数说明：构建不同种类的分类器
    Parameter:
        traindata - 训练数据
    Return:
        classifier.fit(vector,label) - 训练好的分类器
    Modify:
        2017-12-02
    """
    classifier = svm.SVC(C=50,kernel='linear')
    # 需转换为list,不然报错
    vector = list(traindata["vector"])
    label = list(traindata["label"])
    return classifier.fit(vector,label)

In [340]:
classificer = trainclassifier(traindata)

In [341]:
# 应用分类器
pred = classificer.predict(list(testdata["vector"]))

In [342]:
def elevate_result(label,pred):
    """
    函数说明: 对分类器预测的结果进行评估，包括accurancy,precision,recall,F-score
    Parameter:
        label - 真实值
        pred - 预测值
    Return:
        None
    Modify:
        2017-12-02
    """
    con_mat = metrics.confusion_matrix(label,pred)
    TP = con_mat[1,1]
    TN = con_mat[0,0]
    FP = con_mat[0,1]
    FN = con_mat[1,0]
    
    accurancy = (TP+TN)/(TP+TN+FN+FP)
    precison = TP/(TP+FP)
    recall = TP/(TP+FN)
    beta = 1
    F_score = (1+pow(beta,2))*precison*recall/(pow(beta,2)*precison+recall)
    
    print("TP:",TP)
    print("TN:",TN)
    print("FP:",FP)
    print("FN:",FN)
    print("accurancy: %s \nprecison: %s \nrecall: %s \nF-score: %s" % (accurancy,precison,recall,F_score))

In [343]:
label = testdata["label"]
elevate_result(label,pred)

TP: 5
TN: 94
FP: 0
FN: 1
accurancy: 0.99 
precison: 1.0 
recall: 0.833333333333 
F-score: 0.909090909091


In [344]:
getstopwordlist = "data/data50.txt"
partdata = pd.read_csv(getstopwordlist,header=None)
partdata.columns=["label","content"]

In [349]:
contents = partdata["content"]
stop={}

In [356]:
for i in range(len(contents)):
    content = contents.iloc[i]
    segresult = list(jieba.cut(content))
    for seg in segresult:
        if seg in stop.keys():
            stop[seg]+=1
        else:
            stop[seg]=1
stop = sorted(stop.iteritems(),key=lambda item:item[1],reverse = True)
print(stop)

AttributeError: 'dict' object has no attribute 'iteritems'