In [81]:
import jieba
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy import io
import json

In [8]:
def load_data(rawdata,n):
    """
    Purpose:加载原始数据，处理并输出
    
    """
    alldata = pd.read_csv(rawdata,header=None)
    alldata.columns = ["label","content"]
    data = alldata.sample(n)
    content = data["content"]
    label=data["label"]
    return content,label

In [73]:
rawdata_path = "rawdata/traindata.csv"
content ,label = load_data(rawdata_path,100)

In [64]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = jieba.cut(doc)
            return words
        return analyzer

In [76]:
def vect_data(content,label):
    """
    函数说明：得到每个短信的内容和标签的向量表示
    Return:
        vect_result - 短信的向量表示
    Modify:
        2017-12-22
    
    """
    vect = 	MessageCountVectorizer(max_df=0.9,min_df=2)
    vect_result=vect.fit_transform(content)

    words = vect.get_feature_names()
    io.mmwrite("data/content_vector.mtx",vect_result)
    print(label)
    io.mmwrite("data/label_vector",label)
    print(words)
    print(len(words))
    print(vect_result.toarray())

In [77]:
vect_data(content,label)

597363    1
389380    0
644816    0
220541    0
213792    0
669048    0
348089    1
223072    0
609600    0
728001    0
76693     0
782039    0
437495    0
173458    0
433323    0
679704    0
656113    0
477913    1
507999    0
325064    0
765361    1
771041    0
501300    0
31986     0
70581     1
556656    0
179101    0
240064    0
253517    0
794936    0
         ..
298879    0
615504    0
198046    0
576436    0
470677    0
625754    0
297488    0
297477    0
29789     0
598238    0
713878    0
421113    0
81207     0
720113    0
319801    0
4185      0
312801    0
438160    0
302887    0
616915    0
86411     0
608579    0
433299    0
404771    0
642562    0
285661    0
610332    0
360420    0
449980    0
303384    0
Name: label, Length: 100, dtype: int64


ValueError: Expected 2 dimensional array

In [19]:
set(jieba.cut(s))

{'为了', '互联网', '扩展', '数字', '服务', '汽车', '的'}

In [85]:
# io.mmwrite("data/label_vector",label.as_matrix())
with open('data/label_vector.json', 'w') as f:
        json.dump(label.as_matrix(), f)

TypeError: Object of type 'ndarray' is not JSON serializable

In [24]:
set(jieba.cut(s)) - stopwords

{'为了', '互联网', '扩展', '数字', '服务', '汽车'}

In [41]:
words=(pseg.cut(s))
# new_doc=''.join(w.word for w in words if w.flag != 'x')


In [36]:
new_doc

'为了扩展互联网汽车的数字服务'

In [45]:

s= "nihaom xxxx 你好吗"
words=(pseg.cut(s))
for w in words:
    print(w.flag,w.word)

eng nihaom
x  
eng xxxx
x  
l 你好
y 吗


In [3]:
filepath = "data/traindata.csv"
# 加载全部数据
alldata = pd.read_csv(filepath,header=None)
alldata.columns=["label","content"]
alldata.shape

(800000, 2)

In [7]:
# 随机抽取n条数据
data = alldata.sample(n=1000)
# data = alldata.iloc[:1000]

In [8]:
data["label"].value_counts()

0    902
1     98
Name: label, dtype: int64

In [9]:
def logtime(func):
    """
    函数目的：测量函数运行时间
    Parameter:
        func - 被测量的函数
    Return:
        wrapper - 被装饰之后的函数
    """
    def wrapper(*args,**kwargs):
        start = time.time()
        result = func(*args,**kwargs)
        end = time.time()
        print("完成函数{name}, 运行时间 {totaltime:.3f}s".format(name=func.__name__,totaltime=end-start))
        start = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start))
        end = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end))
        print("开始时间 : %s \n结束时间 : %s "%(start,end))
        return result
    return wrapper

In [10]:
@logtime
def loadStopWords(filepath):
    """
    函数说明：加载获取停用词列表
    Parameter:
        filepath - 停用词文件的路径
    Return:
        stopwords - 以集合形式返回的停用词列表
    Modify:
        2017-12-02
    """
    stopwordfile = open(filepath,"r",encoding="utf-8").readlines()
    stopwords = {line.strip() for line in stopwordfile}
    return stopwords

In [11]:
stopwordsfile = "data/stopwords.txt"
stopwords = loadStopWords(stopwordsfile)

完成函数loadStopWords, 运行时间 0.000s
开始时间 : 2017-12-04 22:14:11 
结束时间 : 2017-12-04 22:14:11 


In [12]:
print(len(stopwords))

66


In [13]:
@logtime
def getVocalist(contents,stopwords):
    """
    函数说明：获取去除停用词后的词汇表，来减小向量维度的大小
    Parameter:
        data - 待处理的文本数据
        stopwords - 停用词列表
    Return:
        list(vocalist)- 文本词汇表
    Modify:
        2017-12-02
    """
    vocalist = set([])
    for i in range(len(data)):
        content = contents.iloc[i]
        segresult = set(jieba.cut(content))
        # 两个集合求并集
        vocalist = vocalist | segresult
    vocalist = vocalist - stopwords
    # print(vocalist)
    print("词汇表长度为：",len(vocalist))
    return list(vocalist)

In [14]:
contents = data["content"]
vocalist=getVocalist(contents,stopwords)   

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CC\AppData\Local\Temp\jieba.cache
Loading model cost 0.825 seconds.
Prefix dict has been built succesfully.


词汇表长度为： 5778
完成函数getVocalist, 运行时间 1.121s
开始时间 : 2017-12-04 22:14:17 
结束时间 : 2017-12-04 22:14:18 


In [19]:
# 文本信息矢量化
def Vectorization(content,vocalist,stopwords):
    """
    函数说明：得到每个短信的0-1向量表示，如[0,0,0,1,0,1,0,0,0,0....]，向量长度与词汇表长度相同
    Paremeter:
        content -  每条短信的内容
        vocalist - 词汇表
        stopwords - 停用词列表
    Return:
        vector - 每条短信的向量表示
    Modify:
        2017-12-02
    """
    vector= [0]*len(vocalist)
    segresult = set(jieba.cut(content))-stopwords
    for word in segresult:
        vector[vocalist.index(word)] = 1
    return vector
@logtime
def vectall(data,vocalist,stopwords):
    data["vector"] = data.apply(lambda row:Vectorization(row["content"],vocalist,stopwords),axis=1)

In [20]:
# 得到每个文本的矢量化结果
vectall(data,vocalist,stopwords)

完成函数vectall, 运行时间 0.912s
开始时间 : 2017-12-04 22:14:46 
结束时间 : 2017-12-04 22:14:47 


In [21]:
# 随机划分训练数据和测试数据 8:2
traindata,testdata = train_test_split(data,test_size=0.2)

In [22]:
print("训练数据的标签分布：")
print(traindata["label"].value_counts())
print("测试数据的标签分布：")
print(testdata["label"].value_counts())

训练数据的标签分布：
0    721
1     79
Name: label, dtype: int64
测试数据的标签分布：
0    181
1     19
Name: label, dtype: int64


In [23]:
# 构建分类器
@logtime
def trainclassifier(traindata):
    """
    函数说明：构建不同种类的分类器
    Parameter:
        traindata - 训练数据
    Return:
        classifier.fit(vector,label) - 训练好的分类器
    Modify:
        2017-12-02
    """
    classifier = svm.SVC(C=50,kernel='linear')
    # 需转换为list,不然报错
    vector = list(traindata["vector"])
    label = list(traindata["label"])
    return classifier.fit(vector,label)

In [24]:
classificer = trainclassifier(traindata)

完成函数trainclassifier, 运行时间 5.286s
开始时间 : 2017-12-04 22:14:57 
结束时间 : 2017-12-04 22:15:02 


In [25]:
# 应用分类器
@logtime
def applyClassificer(classificer,testdata):
    return classificer.predict(list(testdata["vector"]))
pred = applyClassificer(classificer,testdata)

完成函数applyClassificer, 运行时间 1.009s
开始时间 : 2017-12-04 22:15:05 
结束时间 : 2017-12-04 22:15:06 


In [26]:
def elevate_result(label,pred):
    """
    函数说明: 对分类器预测的结果进行评估，包括accurancy,precision,recall,F-score
    Parameter:
        label - 真实值
        pred - 预测值
    Return:
        None
    Modify:
        2017-12-02
    """
    con_mat = metrics.confusion_matrix(label,pred)
    TP = con_mat[1,1]
    TN = con_mat[0,0]
    FP = con_mat[0,1]
    FN = con_mat[1,0]
    
    accurancy = (TP+TN)/(TP+TN+FN+FP)
    precison = TP/(TP+FP)
    recall = TP/(TP+FN)
    beta = 1
    F_score = (1+pow(beta,2))*precison*recall/(pow(beta,2)*precison+recall)
    
    print("TP:",TP)
    print("TN:",TN)
    print("FP:",FP)
    print("FN:",FN)
    print("accurancy: %s \nprecison: %s \nrecall: %s \nF-score: %s" % (accurancy,precison,recall,F_score))

In [27]:
label = testdata["label"]
elevate_result(label,pred)

TP: 16
TN: 181
FP: 0
FN: 3
accurancy: 0.985 
precison: 1.0 
recall: 0.842105263158 
F-score: 0.914285714286
