## 加载数据集

In [85]:
import pandas as pd

train_labled_path = './data/nCoV_100k_train.labled.csv'
test_path='./data/nCov_10k_test.csv'
df = pd.read_csv(train_labled_path, encoding='utf-8', usecols=[3,6])
df2 = pd.read_csv(test_path, encoding='utf-8', usecols=[0,3])
df = df[df['情感倾向'].isin(['0','-1','1'])]
print(df['情感倾向'].value_counts())

0     57619
1     25392
-1    16902
Name: 情感倾向, dtype: int64


## 中文分词

In [86]:
import jieba
df['微博中文内容']=df['微博中文内容'].map(str)
df['cuted']=df['微博中文内容'].map(lambda x:' '.join(jieba.cut(x)))
df2['微博中文内容']=df2['微博中文内容'].map(str)
df2['cuted']=df2['微博中文内容'].map(lambda x:' '.join(jieba.cut(x)))

In [87]:
print(df['cuted'][1])

开年 大 模型 … 累到 以为 自己 发烧 了 腰疼 膝盖 疼 腿疼 胳膊 疼 脖子 疼 # Luna 的 Krystallife # ?


## 训练集和测试集

In [88]:
# 输入和输出
X = df['cuted']
y = df['情感倾向']
x_ans = df2['cuted']

from sklearn.model_selection import train_test_split

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

# 查看训练集
X_train.shape

(98913,)

In [89]:
from sklearn.feature_extraction.text import CountVectorizer

# 变换器
vect = CountVectorizer()

vect.fit(X_train)

# 词表数量
print(len(vect.vocabulary_))
# 打印词表
#print(vect.vocabulary_)

143036


In [90]:
#words_matrix = pd.DataFrame(vect.transform(X).toarray(),columns=vect.get_feature_names())

#words_matrix.head()

## 构建模型

In [91]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
'''
# 交叉验证评估模型
scores = cross_val_score(LogisticRegression(),
                         vect.transform(X_train), y_train, cv=5)
print('平均交叉验证准确率：{:.3f}'.format(np.mean(scores)))
'''

"\n# 交叉验证评估模型\nscores = cross_val_score(LogisticRegression(),\n                         vect.transform(X_train), y_train, cv=5)\nprint('平均交叉验证准确率：{:.3f}'.format(np.mean(scores)))\n"

## 去除停用词

In [92]:
def stopwords_list(d):
    with open('./data/'+d,'rb') as f:
        lines = f.readlines()
        result = [i.decode().strip('\n') for i in lines]
    return result

stopwords = stopwords_list('hit_stopwords.txt')
stopwords.extend(stopwords_list('cn_stopwords.txt'))
stopwords.extend(stopwords_list('baidu_stopwords.txt'))
stopwords.extend(stopwords_list('scu_stopwords.txt'))

In [93]:
print(stopwords[10:100])

['*', '一一', '~~~~', '’', '. ', '『', '.一', './', '-- ', '』', '＝″', '【', '［＊］', '｝＞', '［⑤］］', '［①Ｄ］', 'ｃ］', 'ｎｇ昉', '＊', '//', '［', '］', '［②ｅ］', '［②ｇ］', '＝｛', '}', '，也 ', '‘', 'Ａ', '［①⑥］', '［②Ｂ］ ', '［①ａ］', '［④ａ］', '［①③］', '［③ｈ］', '③］', '１． ', '－－ ', '［②ｂ］', '’‘ ', '××× ', '［①⑧］', '０：２ ', '＝［', '［⑤ｂ］', '［②ｃ］ ', '［④ｂ］', '［②③］', '［③ａ］', '［④ｃ］', '［①⑤］', '［①⑦］', '［①ｇ］', '∈［ ', '［①⑨］', '［①④］', '［①ｃ］', '［②ｆ］', '［②⑧］', '［②①］', '［①Ｃ］', '［③ｃ］', '［③ｇ］', '［②⑤］', '［②②］', '一.', '［①ｈ］', '.数', '［］', '［①Ｂ］', '数/', '［①ｉ］', '［③ｅ］', '［①①］', '［④ｄ］', '［④ｅ］', '［③ｂ］', '［⑤ａ］', '［①Ａ］', '［②⑧］', '［②⑦］', '［①ｄ］', '［②ｊ］', '〕〔', '］［', '://', '′∈', '［②④', '［⑤ｅ］', '１２％']


In [94]:
vect = CountVectorizer(max_df=0.8, min_df=2, stop_words=stopwords,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b')

vect.fit(X_train)

  'stop_words.' % sorted(inconsistent))


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=2,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['———', '》），', '）÷（１－', '”，', '）、', '＝（', ':', '→',
                            '℃ ', '&', '*', '一一', '~~~~', '’', '. ', '『', '.一',
                            './', '-- ', '』', '＝″', '【', '［＊］', '｝＞', '［⑤］］',
                            '［①Ｄ］', 'ｃ］', 'ｎｇ昉', '＊', '//', ...],
                strip_accents=None, token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                tokenizer=None, vocabulary=None)

In [95]:
print(vect.get_feature_names())

['__', '___', '____', '_____', '______', '________', '__________', 'a095', 'a1', 'a24', 'a2n', 'a3', 'a3g389', 'a4', 'a50', 'a6', 'a6g126', 'a6h', 'a6hzppdi', 'a6hzqlzo', 'a6p4t3h7', 'a6p5zgae', 'a6pfl4v7', 'a6pfslok', 'a6pgwesw', 'a6pinnpb', 'a6piph89', 'a6pk09lq', 'a6pnneiy', 'a6ptqcod', 'a6px4ivp', 'a6pxmou3', 'a6pz9kzz', 'a6vy92wl', 'aa', 'aaa', 'ab', 'ababebaci', 'abc', 'abc2017', 'abcd', 'abner', 'abo', 'aboutduringthewinter', 'abs', 'absolutecb', 'abyss', 'ac', 'acca', 'ace', 'ace2', 'acfc', 'acg', 'achyuta', 'ad', 'adam0616', 'aed', 'aeolus', 'aerosol', 'aesopbach', 'aesopbach75', 'af', 'afc', 'afrabot', 'ag', 'ageha', 'agent', 'ago', 'ahc', 'ai', 'aia', 'aiba', 'aibofold', 'aidqw916', 'aids', 'aiec', 'aifxd1ng', 'aih', 'aih00qg5', 'aikkkkkkkkkkkkk', 'ailurus', 'aioros', 'air', 'airpods', 'airpods1', 'ais5wmdv', 'ait9zhe9', 'aj', 'ak', 'ak20190515', 'aka', 'akalui', 'akb48teamsh', 'aki', 'akiko', 'akira', 'akiramiya', 'alain', 'alan', 'alan8616', 'albus', 'alex', 'alexie', 'ali

In [96]:
#words_matrix = pd.DataFrame(vect.transform(X_train).toarray(),columns=vect.get_feature_names())

In [97]:
# 训练模型
lr=LogisticRegression()
lr.fit(vect.transform(X_train), y_train)

print('测试集准确率：{:.3f}'.format(lr.score(vect.transform(X_test), y_test)))

测试集准确率：0.715


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## tf-idf+GBDT

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('tiv', TfidfVectorizer(max_df=0.8,min_df=3)),
                      ('gbc', GradientBoostingClassifier(learning_rate=0.05,
                                                         n_estimators=540,
                                                         min_samples_split=1200,
                                                         min_samples_leaf=10,
                                                         max_depth=17,
                                                         #max_features='sqrt',
                                                         subsample=1,
                                                         random_state=10))])
#param_test5= {'gbc__subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
#gsearch1=GridSearchCV(estimator=pipe, param_grid=param_test5, scoring='f1_macro',iid=False,cv=5)
#gsearch1.fit(X_train, y_train)
pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1_macro')
print('平均交叉验证Macro-F1：{:.3f}'.format(np.mean(scores)))

KeyboardInterrupt: 

In [99]:
#print('{}:{}'.format(gsearch1.cv_results_['params'],gsearch1.cv_results_['rank_test_score']),'\n',gsearch1.best_params_, gsearch1.best_score_)

In [None]:
vectorizer = pipe.named_steps['tiv']
# 找到每个特征中最大值
max_value = vectorizer.transform(X_train).max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# 获取特征名称
feature_names = np.array(vectorizer.get_feature_names())

print("tfidf较低的特征：\n{}".format(feature_names[sorted_by_tfidf[:20]]))
print()
print("tfidf较高的特征：\n{}".format( feature_names[sorted_by_tfidf[-20:]]))

In [None]:
from sklearn import metrics

# 预测值
y_pred = pipe.predict(X_test)

print('测试集准确率：{:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('测试集准确率：{:.3f}'.format(pipe.score(X_test, y_test)))

metrics.confusion_matrix(y_test, y_pred)

In [None]:
print(y_pred)

In [None]:
print(pd.value_counts(y_pred))

## 输出

In [None]:
y_ans = pipe.predict(x_ans)

In [None]:
y_ans = pipe.predict(x_ans)
dict={"测试数据id":df2['微博id'].values.tolist(),'情感极性':y_ans.tolist()}
output_list = [df2['微博id'].values.tolist(),y_ans.tolist()]
output = pd.DataFrame(dict)
output.to_csv("./submit7.csv",sep=',',index=False)

In [None]:
print(output_list)