In [None]:
"""
requirements:
    tensorflow: 2.4.1
    numpy: 1.19.5
    keras: 2.4.3
    pandas: 1.1.5
"""
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import tensorflow as tf

In [None]:
# Mount Google Drive so files under MyDrive can be read from this Colab runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the training corpus and pull out the two columns used below,
# coercing both to str so every entry is a string.
corpus_path = '/content/drive/MyDrive/SentimentAnalysis/corpus/5moods/train/usual_trainChinese.csv'
df = pd.read_csv(corpus_path)
Labels = df['label'].astype('str')
Reviews = df['review'].astype('str')

In [None]:
# Shuffle the samples and split them into train / test partitions.
from sklearn.model_selection import train_test_split

# random_state is the seed for the shuffle; test_size=0.3 holds out 30% of the rows.
# train_test_split returns (train, test) pairs in the order the arrays were passed.
split_parts = train_test_split(Labels, Reviews, random_state=0, test_size=0.3)
Labels_train, Labels_test, Reviews_train, Reviews_test = split_parts

In [None]:
"""
Tokenize words
String to INT
"""

# Import from tensorflow.keras, the same namespace the rest of the notebook
# uses (`import tensorflow.keras as keras`, `from tensorflow.keras import utils`),
# instead of the separately installed standalone `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length of the longest review, counted in whitespace-separated tokens.
# It is measured over the full corpus (train and test) and used as the padded length.
max_len = max(len(review.split()) for review in Reviews)

# Fit the vocabulary on the training reviews only, then map both splits to
# integer id sequences with that vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Reviews_train)
reviews_train = tokenizer.texts_to_sequences(Reviews_train)
reviews_test = tokenizer.texts_to_sequences(Reviews_test)

# Show the first three training reviews next to their id sequences.
for text, vector in zip(Reviews_train[:3], reviews_train[:3]):
    print(text)
    print(vector)

# Pad every sequence to the same fixed length so the splits become
# rectangular arrays.
reviews_train = pad_sequences(reviews_train, maxlen=max_len)
reviews_test = pad_sequences(reviews_test, maxlen=max_len)

累 死 啦 活 动 结 束 哈 哈 洗 澡 睡 觉 咯 另 外 玩 命 速 递 好 好 看 奥 迪 好 帅 晚 安 啦
[353, 82, 299, 161, 128, 163, 859, 130, 130, 417, 1197, 87, 64, 1328, 928, 214, 266, 347, 664, 614, 11, 11, 29, 1404, 1861, 11, 837, 99, 127, 299]
在 安 静 的 环 境 下 闭 眼 感 受 一 种 灯 光 从 亮 到 暗 到 黑 一 种 奇 怪 的 感 觉
[12, 127, 468, 1, 760, 854, 41, 975, 185, 76, 193, 6, 113, 793, 291, 164, 777, 20, 934, 20, 442, 6, 113, 515, 581, 1, 76, 64]
接 上 读 书 分 享 会 中 班 级 热 爱 读 书 的 同 学 分 享 了 最 近 读 书 的 心 得 并 为 班 级 同 学 推 荐 有 意 义 的 书 籍 在 写 作 方 面 已 小 有 成 就 的 胡 仕 林 同 学 为 同 学 们 介 绍 了 自 己 的 写 作 经 验 并 为 同 学 们 推 荐 张 德 芬 的 都 市 心 灵 三 部 曲
[224, 23, 737, 311, 116, 935, 28, 53, 223, 431, 418, 100, 737, 311, 1, 184, 70, 116, 935, 2, 67, 238, 737, 311, 1, 22, 40, 362, 32, 223, 431, 184, 70, 716, 1740, 7, 143, 633, 1, 311, 2059, 12, 371, 111, 153, 115, 160, 51, 7, 84, 15, 1, 1237, 2640, 750, 184, 70, 32, 184, 70, 50, 1098, 1649, 2, 30, 52, 1, 371, 111, 107, 717, 362, 32, 184, 70, 50, 716, 1740, 319, 686, 3066, 1, 18, 351, 22, 936, 162, 213, 1047]


In [None]:
# mood to int and int to mood dict
# Label name -> class id. The ids follow the insertion order of this dict.
mood_to_int = dict(
    sad=0,
    angry=1,
    fear=2,
    neutral=3,
    surprise=4,
    happy=5,
)
# Class id -> label name: enumerate() yields (position, key) pairs, and the
# positions match the ids assigned above.
int_to_mood = dict(enumerate(mood_to_int))

In [None]:
"""
one-hot编码label
"""
from tensorflow.keras import utils

# Fix the one-hot width to the number of known moods. Without num_classes,
# to_categorical infers the width from the largest id present in the array,
# so a split that happens to lack the highest class would produce a matrix
# narrower than the model's output layer.
num_classes = len(mood_to_int)

# Map label names to class ids, then expand each id to a one-hot row.
labels_train = np.array([mood_to_int[x] for x in Labels_train], dtype=int)
labels_train = utils.to_categorical(labels_train, num_classes=num_classes)
labels_test = np.array([mood_to_int[x] for x in Labels_test], dtype=int)
labels_test = utils.to_categorical(labels_test, num_classes=num_classes)

In [None]:
"""
构建模型
"""

# Import from tensorflow.keras so the layers come from the same namespace as
# the rest of the notebook rather than the standalone `keras` package.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

# Embedding input size: one slot per id in tokenizer.word_index plus one,
# because word_index ids start at 1 and pad_sequences pads with 0 by default.
vocabulary_size = len(tokenizer.word_index) + 1
print("汉字数量：", vocabulary_size)

# Embedding -> single LSTM -> softmax over the mood classes.
model = Sequential()

model.add(Embedding(input_dim=vocabulary_size, output_dim=32))
model.add(LSTM(12, return_sequences=False, dropout=0.5))
# Output width is derived from the label mapping instead of a hard-coded 6,
# so the layer stays in sync with mood_to_int if classes are added or removed.
model.add(Dense(len(mood_to_int), activation='softmax'))

# categorical_crossentropy expects one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.summary()

汉字数量： 4899
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          156768    
_________________________________________________________________
lstm (LSTM)                  (None, 12)                2160      
_________________________________________________________________
dense (Dense)                (None, 6)                 78        
Total params: 159,006
Trainable params: 159,006
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 20 epochs in mini-batches of 32, scoring the held-out split
# after every epoch. The returned History object feeds the plots below.
history = model.fit(
    x=reviews_train,
    y=labels_train,
    validation_data=(reviews_test, labels_test),
    epochs=20,
    batch_size=32,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
 43/608 [=>............................] - ETA: 35s - loss: 0.7972 - accuracy: 0.6997

In [None]:
from matplotlib import pyplot as plt

# One figure per tracked metric (accuracy first, then loss), each showing the
# training curve and the validation curve from the fit history.
for metric, short in (('accuracy', 'acc'), ('loss', 'loss')):
    plt.plot(history.history[metric], marker='.', label=short)
    plt.plot(history.history['val_' + metric], marker='.', label='val_' + short)
    plt.title('model ' + metric)
    plt.grid()
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend(loc='best')
    plt.show()

In [None]:
# Manual spot check: encode one text, pad it to the training length, and
# print the predicted class id with its mood name.
input_text = pad_sequences(tokenizer.texts_to_sequences(["愤"]), maxlen=max_len)
pre = model.predict(input_text)
# Only one sample is predicted, so a flat argmax over the output gives its class id.
pre_ans = pre.argmax()
print(pre_ans, int_to_mood[pre_ans])

In [None]:
# Manual spot check on several short texts at once.
input_texts = pad_sequences(
    tokenizer.texts_to_sequences(['开 心', '伤 心', '开', '伤', '心']),
    maxlen=max_len,
)
preds = model.predict(input_texts)
# Row-wise argmax gives one class id per input text.
for pred_ans in np.argmax(preds, axis=1):
    print(pred_ans, int_to_mood[pred_ans])

In [None]:
"""
对模型进行测试
"""
# Evaluate on the held-out split rather than re-reading the training CSV.
# The original cell loaded the very same file used for training, so most of
# the "test" rows had been seen by the model and the reported metrics were
# inflated. If a dedicated evaluation CSV exists, load it here instead.
Labels_val, Reviews_val = Labels_test, Reviews_test

# Encode and pad with the tokenizer and length used for training, and map the
# label names to integer class ids for the sklearn metrics.
reviews_val = tokenizer.texts_to_sequences(Reviews_val)
reviews_val = pad_sequences(reviews_val, maxlen=max_len)
labels_val = [mood_to_int[x] for x in Labels_val]

In [None]:
# Predict class probabilities for the evaluation set and keep, for each row,
# the index of the most probable class.
preds = np.argmax(model.predict(reviews_val), axis=1)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 table, followed by the overall accuracy.
report = metrics.classification_report(labels_val, preds)
accuracy = metrics.accuracy_score(labels_val, preds)
print(report)
print("准确率:", accuracy)