## 加载数据集

In [None]:
import pandas as pd

# Locations of the labelled training set and the unlabeled test set.
train_labled_path = './data/nCoV_100k_train.labled.csv'
test_path = './data/nCov_10k_test.csv'

# Training set: only columns 3 and 6 are loaded (used below as '微博中文内容' / '情感倾向').
# dtype=str keeps every label as text, so the string-based filter below cannot
# silently drop rows whose label was inferred as an integer.
df = pd.read_csv(train_labled_path, encoding='utf-8', usecols=[3, 6], dtype=str)
# Test set: only columns 0 and 3 are loaded (used below as '微博id' / '微博中文内容').
df2 = pd.read_csv(test_path, encoding='utf-8', usecols=[0, 3])

# Keep only rows carrying one of the three valid labels. .copy() makes `df` an
# independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
df = df[df['情感倾向'].isin(['0', '-1', '1'])].copy()
print(df['情感倾向'].value_counts())

## 中文分词

In [None]:
import jieba
def _add_cut_column(frame):
    """Return a copy of ``frame`` with string-cast text and a jieba-segmented 'cuted' column.

    The '微博中文内容' column is converted with ``str`` (so a missing value becomes
    the literal text 'nan'), then each entry is segmented with ``jieba.cut`` and
    the tokens are joined with single spaces so scikit-learn vectorizers can
    split on whitespace.
    """
    text = frame['微博中文内容'].map(str)
    cuted = text.map(lambda sentence: ' '.join(jieba.cut(sentence)))
    # assign() builds a new frame instead of writing into a possibly-sliced one,
    # which avoids SettingWithCopyWarning and makes the cell safe to re-run.
    return frame.assign(**{'微博中文内容': text, 'cuted': cuted})


df = _add_cut_column(df)
df2 = _add_cut_column(df2)

In [None]:
# Show the second segmented sample. Positional access is used because rows were
# filtered by label earlier, so index label 1 is not guaranteed to exist.
print(df['cuted'].iloc[1])

## 训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs (segmented text) and targets (sentiment label) from the labelled data,
# plus the segmented text of the unlabeled set to predict later.
X, y = df['cuted'], df['情感倾向']
x_ans = df2['cuted']

# Hold out 1% of the labelled rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.01,
)

# Size of the training portion
X_train.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words transformer with default settings, fitted on the training text
# (fit() returns the vectorizer itself).
vect = CountVectorizer().fit(X_train)

# Number of distinct terms in the learned vocabulary
vocabulary_size = len(vect.vocabulary_)
print(vocabulary_size)
# Uncomment to inspect the full term -> column-index mapping:
#print(vect.vocabulary_)

In [None]:
#words_matrix = pd.DataFrame(vect.transform(X).toarray(),columns=vect.get_feature_names())

#words_matrix.head()

## 构建模型

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a default logistic regression on the bag-of-words features
X_train_counts = vect.transform(X_train)
scores = cross_val_score(LogisticRegression(), X_train_counts, y_train, cv=5)
print('平均交叉验证准确率：{:.3f}'.format(scores.mean()))

## 去除停用词

In [None]:
def stopwords_list(d):
    """Load one stop-word file from ``./data/`` and return its words as a list.

    Parameters
    ----------
    d : str
        File name (relative to ``./data/``) of a UTF-8 text file holding one
        stop word per line.

    Returns
    -------
    list of str
        The stop words in file order, with surrounding whitespace removed and
        blank lines skipped.
    """
    # 'utf-8-sig' drops a leading byte-order mark if present, so the first word
    # is not polluted by '\ufeff'.
    with open('./data/' + d, encoding='utf-8-sig') as f:
        # strip() also removes the '\r' left by Windows line endings.
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]

# Concatenate the four stop-word lists in a fixed order (duplicates are kept).
stopwords = [
    word
    for file_name in (
        'hit_stopwords.txt',
        'cn_stopwords.txt',
        'baidu_stopwords.txt',
        'scu_stopwords.txt',
    )
    for word in stopwords_list(file_name)
]

In [None]:
# Tokens must start with a non-digit word character and be at least two
# characters long.
token_pattern = u'(?u)\\b[^\\d\\W]\\w+\\b'

# Rebuild the bag-of-words transformer: drop stop words, terms seen in fewer
# than 3 documents, and terms seen in more than 80% of documents.
vect = CountVectorizer(
    stop_words=stopwords,
    token_pattern=token_pattern,
    min_df=3,
    max_df=0.8,
)

vect.fit(X_train)

In [None]:
# List the terms kept by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide it.
if hasattr(vect, 'get_feature_names_out'):
    vocab_terms = vect.get_feature_names_out()
else:
    vocab_terms = vect.get_feature_names()
print(vocab_terms)

In [None]:
#words_matrix = pd.DataFrame(vect.transform(X_train).toarray(),columns=vect.get_feature_names())

In [None]:
# Fit a default logistic regression on the filtered bag-of-words features
# (fit() returns the estimator itself).
lr = LogisticRegression().fit(vect.transform(X_train), y_train)

# Accuracy on the held-out split
test_accuracy = lr.score(vect.transform(X_test), y_test)
print('测试集准确率：{:.3f}'.format(test_accuracy))

## 用tf-idf缩放数据

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# tf-idf features (terms must occur in at least 3 documents) feeding a default
# logistic regression, chained so the vectorizer is re-fitted inside each CV fold.
tfidf_step = TfidfVectorizer(min_df=3)
pipe = make_pipeline(tfidf_step, LogisticRegression())
pipe.fit(X_train, y_train)

# 5-fold cross-validated accuracy of the whole pipeline on the training split
scores = cross_val_score(pipe, X_train, y_train, cv=5)
mean_cv_accuracy = np.mean(scores)
print('平均交叉验证准确率：{:.3f}'.format(mean_cv_accuracy))

In [None]:
# Fitted tf-idf step of the pipeline
vectorizer = pipe.named_steps['tfidfvectorizer']
# Largest tf-idf weight each feature reaches over the training documents
max_value = vectorizer.transform(X_train).max(axis=0).toarray().ravel()
# Feature indices ordered from smallest to largest maximum weight
sorted_by_tfidf = max_value.argsort()
# Feature names as an array so they can be indexed with the sorted indices.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

print("tfidf较低的特征：\n{}".format(feature_names[sorted_by_tfidf[:20]]))
print()
print("tfidf较高的特征：\n{}".format(feature_names[sorted_by_tfidf[-20:]]))

In [None]:
from sklearn import metrics

# Labels predicted by the tf-idf pipeline for the held-out split
y_pred = pipe.predict(X_test)

# Accuracy computed two ways: from the predictions, and via the pipeline's own score()
accuracy_from_predictions = metrics.accuracy_score(y_test, y_pred)
accuracy_from_pipeline = pipe.score(X_test, y_test)
print('测试集准确率：{:.3f}'.format(accuracy_from_predictions))
print('测试集准确率：{:.3f}'.format(accuracy_from_pipeline))

# Confusion matrix of true vs. predicted labels (displayed as the cell output)
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Show the labels the pipeline predicted for the held-out split (X_test).
print(y_pred)

In [None]:
# Count how many held-out samples were assigned to each label. The top-level
# pd.value_counts() helper is deprecated, so go through Series.value_counts().
print(pd.Series(y_pred).value_counts())

## 输出

In [3]:
# Predict labels for the unlabeled set. Requires `pipe` to have been created and
# fitted already; on a kernel where it has not, this raises NameError.
y_ans = pipe.predict(x_ans)

NameError: name 'pipe' is not defined

In [None]:
# Predict a label for every row of the unlabeled set.
y_ans = pipe.predict(x_ans)

# Plain Python lists of ids and predicted labels, computed once and reused.
test_ids = df2['微博id'].values.tolist()
predicted_labels = y_ans.tolist()

# Column name -> values for the submission file. The mapping is deliberately not
# called `dict`, which would shadow the builtin for the rest of the session.
submission = {"测试数据id": test_ids, '情感极性': predicted_labels}
output_list = [test_ids, predicted_labels]
output = pd.DataFrame(submission)
# Write the submission as comma-separated values without the index column.
output.to_csv("./submit1.csv", sep=',', index=False)

In [None]:
# Dump the [ids, labels] pair built alongside the submission; this prints every entry.
print(output_list)