In [1]:
import pandas as pd 

# Read the ISEAR CSV; header=None makes pandas treat the first line as data
# and assign integer column names instead of reading a header row.
data = pd.read_csv(filepath_or_buffer='./data/ISEAR.csv', header=None)
# Show the first few rows as a quick look at what was loaded.
data.head()

Unnamed: 0,0,1,2
0,joy,On days when I feel close to my partner and ot...,
1,fear,Every time I imagine that someone I love or I ...,
2,anger,When I had been obviously unjustly treated and...,
3,sadness,When I think about the short time that we live...,
4,disgust,At a gathering I found myself involuntarily si...,


In [4]:
from sklearn.model_selection import train_test_split

# Column 1 is used as the input sentences and column 0 as the target labels.
sentences = data[1].tolist()
labels = data[0].tolist()
# Randomly split into training and test sets at a 4:1 ratio (20% held out);
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=3,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Only TF-IDF features are extracted here.
# The right-hand side is evaluated left to right, so the vectorizer is fitted on
# the training sentences before it transforms the test sentences.
# NOTE: X_train / X_test are rebound from sentence lists to feature matrices, so
# re-running this cell on its own would feed the matrices back into the vectorizer.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values for the logistic-regression parameter C.
params = {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

# Classify with logistic regression.
# NOTE(review): `multi_class` is reportedly deprecated in newer scikit-learn
# releases; confirm against the installed version before upgrading.
lr = LogisticRegression(multi_class='auto')

# Tune C with a grid search using 5-fold cross-validation on the training data.
# fit() returns the fitted search object, so the call can be chained.
clf = GridSearchCV(estimator=lr, param_grid=params, cv=5).fit(X_train, y_train)
print(clf.best_params_)



{'C': 5}


In [17]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict labels for the held-out test features with the fitted grid search.
y_pred = clf.predict(X_test)
# Print the confusion matrix first, then the per-class classification report;
# both compare the true test labels against the predictions.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

[[ 99  17  10  30   6  18  20]
 [ 32 123  14  13   7   6  20]
 [ 15  11 140   5   4  18  10]
 [ 31  14  13 103   7  10  34]
 [ 12   8   5   7 172  15  17]
 [ 24   6   7  15  28 142  12]
 [ 23  10  10  30   9  11 111]]
              precision    recall  f1-score   support

       anger       0.42      0.49      0.45       200
     disgust       0.65      0.57      0.61       215
        fear       0.70      0.69      0.70       203
       guilt       0.51      0.49      0.50       212
         joy       0.74      0.73      0.73       236
     sadness       0.65      0.61      0.63       234
       shame       0.50      0.54      0.52       204

   micro avg       0.59      0.59      0.59      1504
   macro avg       0.59      0.59      0.59      1504
weighted avg       0.60      0.59      0.59      1504



从分类报告可以看出，“fear”和“joy”两类的 F1 值最高（分别为 0.70 和 0.73），相对更容易识别；当然，这只是一个很简单的情感分类器。