In [479]:
import jieba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [480]:
filepath = "data/traindata.csv"
# 加载全部数据
alldata = pd.read_csv(filepath,header=None)
alldata.columns=["label","content"]
alldata.shape

(800000, 2)

In [481]:
# 随机抽取n条数据
data = alldata.sample(n=500)

In [482]:
data["label"].value_counts()

0    433
1     67
Name: label, dtype: int64

In [483]:
def getVocalist(data):
    vocalist = set([])
    for i in range(len(data)):
        content = data.iloc[i]["content"]
        segresult = set(jieba.cut(content))
        # 两个集合求并集
        vocalist = vocalist | segresult
    # print(vocalist)
    print("词汇表长度为：",len(vocalist))
    return list(vocalist)

In [484]:
vocalist=getVocalist(data)   

词汇表长度为： 3594


In [485]:
# 文本信息矢量化
def getVect(content,vocalist):
    vector= [0]*len(vocalist)
    segresult = list(jieba.cut(content))
    for word in segresult:
        vector[vocalist.index(word)] = 1
    return vector

In [486]:
# 得到每个文本的矢量化结果
data["vector"] = data.apply(lambda row:getVect(row["content"],vocalist),axis=1)

In [487]:
# 随机划分训练数据和测试数据 8:2
traindata,testdata = train_test_split(data,test_size=0.2)

In [488]:
print("训练数据的标签分布：")
print(traindata["label"].value_counts())
print("测试数据的标签分布：")
print(testdata["label"].value_counts())

训练数据的标签分布：
0    346
1     54
Name: label, dtype: int64
测试数据的标签分布：
0    87
1    13
Name: label, dtype: int64


In [489]:
# 构建分类器
def trainclassifier(traindata):
    classifier = svm.SVC(C=50,kernel='linear')
    # 需转换为list,不然报错
    vector = list(traindata["vector"])
    label = list(traindata["label"])
    return classifier.fit(vector,label)

In [490]:
classificer = trainclassifier(traindata)

In [491]:
# 应用分类器
testdata["pred"] = classificer.predict(list(testdata["vector"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [492]:
def elevate_result(data):
    label = data["label"]
    pred = data["pred"]
    con_mat = metrics.confusion_matrix(label,pred)
    TP = con_mat[1,1]
    TN = con_mat[0,0]
    FP = con_mat[0,1]
    FN = conf_mat[1,0]
    
    accurancy = (TP+TN)/(TP+TN+FN+FP)
    precison = TP/(TP+FP)
    recall = TP/(TP+FN)
    beta = 1
    F_score = (1+pow(beta,2))*precison*recall/(pow(beta,2)*precison+recall)
    
    print("TP:",TP)
    print("TN:",TN)
    print("FP:",FP)
    print("FN:",FN)
    print("accurancy: %s \nprecison: %s \nrecall: %s \nF-score: %s" % (accurancy,precison,recall,F_score))

In [493]:
elevate_result(testdata)

TP: 12
TN: 87
FP: 0
FN: 1
accurancy: 0.99 
precison: 1.0 
recall: 0.923076923077 
F-score: 0.96
