In [90]:
# %load model1.py
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from data_process1 import data_process
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
'''
#朴素贝叶斯
adata, data_after_stop, labels = data_process()
data_tr, data_te, labels_tr, labels_te = train_test_split(adata, labels, test_size=0.2)

countVectorizer = CountVectorizer()
data_tr = countVectorizer.fit_transform(data_tr)
X_tr = TfidfTransformer().fit_transform(data_tr.toarray()).toarray()

data_te = CountVectorizer(vocabulary=countVectorizer.vocabulary_).fit_transform(data_te)
X_te = TfidfTransformer().fit_transform(data_te.toarray()).toarray()
'''


#--------------------------------------------------


#其他四种模型
adata, data_after_stop, labels = data_process()
adata_key = data_after_stop['key']
adata_set = adata_key.apply(lambda x: ' '.join(x))

tfidfVectorizer = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
features = tfidfVectorizer.fit_transform(adata_set)



models = [
   RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
   LinearSVC(),
   MultinomialNB(),
   LogisticRegression(random_state=0),
]

#5折交叉验证
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])


#-------------------------------------
#线性SVC模型调用
model = LinearSVC(random_state=0,dual =False)
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, data_after_stop.index, 
                                                                                 test_size=0.3, stratify=labels, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#labelss = ['城乡建设','环境保护','交通运输','教育文体','劳动和社会保障','商贸旅游','卫生计生']
print('accuracy %s' % accuracy_score(y_pred, y_test))
#print(classification_report(y_test, y_pred,target_names=labelss))


  stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)


accuracy 0.8662175168431184


In [None]:
data_after_stop['key'].shape

In [None]:
print('X_test为：',X_test.shape)
print('X_train为：',X_train.shape)
print('features为：',features.shape)

In [58]:
#引入sklearn中自带的保存模型
from sklearn.externals import joblib
#保存模型
joblib.dump(model, 'SVC模型.pkl')

['SVC模型.pkl']

In [59]:
#预处理测试数据
import pandas as pd
import re
import jieba.analyse

def process(file='附件2（测试数据）.xlsx'):
    data = pd.read_excel(file,  index_col=0,encoding = 'GB18030')
    
    
    jieba.load_userdict('newdic1.txt')
    data_cut = data['留言详情'].apply(lambda x: jieba.lcut(x))

    stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)
    stopWords = [' ', '\n', '\t', '\r\n', '\u3000', '＂', '–'] + list(stopWords.iloc[:, 0])
    after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])
    #labels = data_new.loc[data_after_stop.index, '一级标签']
    adata = after_stop.apply(lambda x: ' '.join(x))
    after_stop = after_stop.to_frame()
    
	#提取关键词
    key=[]
    for i in adata:
        keywords=jieba.analyse.extract_tags(i,topK=20)
        key.append(keywords)
    after_stop['key']=key
    
	#去除城市乡镇以外的字母和0
    key=[]
    pattern = re.compile('[0-9]+')
    for x in after_stop['key']:
        temp=[]
        for i in x:
            match = pattern.findall(i)
            if match:
                pass
            else:
                temp.append(i)
        key.append(temp)
    after_stop['key']=key
    #data_after_stop['labels']=labels
    return after_stop




In [61]:
after_stop = process()
key = after_stop['key']
join_key = key.apply(lambda x: ' '.join(x))


my_features = TfidfVectorizer(vocabulary=tfidfVectorizer.vocabulary_).fit_transform(join_key)

  del sys.path[0]


In [62]:
print(my_features.shape)

(2801, 73876)


In [63]:
my_model = joblib.load('SVC模型.pkl')
first_labels = []
#得到预测的目标值
to_list = my_model.predict(my_features)
for i in to_list:
    first_labels.append(i)

#目标值写进测试数据文件
data1 = pd.read_excel('附件2（测试数据）.xlsx',  index_col=0,encoding = 'GB18030')
data1 = data1.drop(['一级分类'], axis = 1)
data1['一级分类'] = first_labels
data1

Unnamed: 0_level_0,留言用户,留言主题,留言时间,留言详情,一级分类
留言编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,U0001196,投诉A市A1区苑物业违规收停车费,2019/12/30 17:06:14,\n\t\t\t\t\t\n\t\t\t\t\t尊敬的领导：A1区苑小区位于A1区火炬路，小...,商贸旅游
23,U0002738,A4区五一大道一酒吧噪音严重影响居民休息,2020/1/6 13:15:56,\n\t\t\t\t\t\n\t\t\t\t\t位于五一大道202的“李四的LIST-PUB...,环境保护
26,U0003729,A市地铁6号线桐梓坡地铁站交叉口低频率噪音严重扰民,2020/1/6 11:29:11,\n\t\t\t\t\t\n\t\t\t\t\t近2个月来，地铁6号线桐梓坡地铁站交叉口有设...,环境保护
39,U0007638,A市地铁8号线工作现在就要开始准备了,2020/1/3 22:26:19,\n\t\t\t\t\t\n\t\t\t\t\t今年地铁3号线和5号线通车后，正线就只剩下6...,城乡建设
40,U0005855,A2区福满新城二期施工噪音扰民谁能管,2020/1/3 20:08:05,\n\t\t\t\t\t\n\t\t\t\t\t今天是投诉A2区福满新城二期噪音扰民问题第8...,环境保护
...,...,...,...,...,...
6056990,U0004035,为何在B市乡镇卫生院的高级职工所在的点不能一视同仁还保留？,2012/12/15 18:39:45,\n \n 我想请问一下领导，自从乡村医生实行基药开始，是不是所有的乡镇...,卫生计生
6665290,U0008214,C3县居民这样的情况要不要罚款？,2011/12/13 14:05:17,"\n \n 你好,我是C3县居民,在家中是独女,父亲是老师,母亲是农民,...",卫生计生
16704000,U0007689,请求市长查处步步高E12市店长期拖欠员工加班工资的违法行为,2011/11/16 1:35:27,\n \n 尊敬的王书记： 您好！步步高E12市店无视劳动法有关规定，国家...,劳动和社会保障
33681565,U0006759,B市滨江花园二期容积率达到4.11之高,2018/12/24 19:19:26,\n\t\t\t\t\t\n\t\t\t\t\t西地省格尚置业有限公司于2004年取得滨江花...,城乡建设


In [71]:
#把目标值写进测试结果文件
data2 = pd.read_excel('附件2（测试结果）.xlsx',  index_col=0,encoding = 'GB18030')
data2 = data2.drop(['一级分类'], axis = 1)
data_new = pd.merge(data1, data2, on='留言编号')
data_temp1 = data_new.drop(['留言用户'], axis = 1)
data_temp2 = data_temp1.drop(['留言主题'], axis = 1)
data_temp3 = data_temp2.drop(['留言时间'], axis = 1)
data_final = data_temp3.drop(['留言详情'], axis = 1)
data_final.to_excel('结果（待重命名）.xlsx',encoding = 'GB18030')