In [1]:
from gensim.models import FastText
import time
import numpy as np
import pandas as pd
import torch
from torch import nn
import time


In [2]:
vector_size = 200
# train fasttext model
time_start = time.time()

filePath = '../corpus/6moods/train/usual_trainTrimed.csv'
df = pd.read_csv(filePath)
labels, reviews = df['label'].astype('str'), df['review'].astype('str')
reviews = [str(review).split() for review in reviews]

print(f'Loading reviews Coast {time.time()-time_start:.4f} Sec')
time_start = time.time()

ft = FastText(reviews, vector_size=vector_size, epochs=20, window=2, min_count=5, min_n=2, max_n=4, word_ngrams=1, workers=8)

print(f'Train Model Coast {time.time()-time_start:.4f} Sec')

Loading reviews Coast 0.1089 Sec
Train Model Coast 7.2250 Sec


In [3]:
# convert label from str to int
# mood dict
m2i = {
    'sad':0,
    'angry': 1,
    'fear': 2,
    'neutral': 3,
    'surprise': 4,
    'happy': 5,
}
i2m = {k:i for k, i in enumerate(m2i)}
labels = [m2i[label] for label in labels]


In [4]:
# word2vec and padding
vectors = [[ft.wv[word] for word in review] for review in reviews]
max_len = max([len(vector) for vector in vectors])
zeros = [0 for i in range(vector_size)]
for i in range(len(vectors)):
    while len(vectors[i]) < max_len:
        vectors[i].insert(0, zeros)

In [5]:
# reformat data
def to_categorical(labels):
    n = len(labels)
    num_type = 6
    res = [[0] * num_type for i in range(n)]
    for i in range(n):
        res[i][labels[i]] = 1
    return res

vectors = torch.tensor(vectors)
# labels = to_categorical(labels)
labels = torch.tensor(labels, dtype=int)
vectors.shape, labels.shape

(torch.Size([27768, 80, 200]), torch.Size([27768]))

In [6]:
# 打乱并分割数据集
from sklearn.model_selection import train_test_split

# random_state表示随机数种子
labels_train, labels_test, vectors_train, vectors_test = train_test_split(
    labels, vectors, 
    test_size=0.3, random_state=0
)
len(vectors_train), len(vectors_test)

(19437, 8331)

In [7]:
class SA(nn.Module):
    def __init__(self):
        super(SA, self).__init__()
        self.input_size = 200
        self.hidden_size = 256
        self.num_layers = 2
        self.fch1_size = 64
        self.fch2_size = 16
        self.output_size = 6
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers)
        self.fc = nn.Sequential(
            nn.Linear(self.hidden_size, self.fch1_size),
            nn.ReLU(),
            nn.Linear(self.fch1_size, self.fch2_size),
            nn.ReLU(),
            nn.Linear(self.fch2_size, self.output_size),
        )
        self.fc = nn.Linear(self.hidden_size, self.output_size)
    def forward(self, input):
        out, hidden = self.lstm(input)
        # 只取最后一个输出
        out = torch.squeeze(out[:,-1:])
        out = self.fc(out)
        return out

In [8]:
model = SA()
print(model)

SA(
  (lstm): LSTM(200, 256, num_layers=2)
  (fc): Linear(in_features=256, out_features=6, bias=True)
)


In [9]:
def train(X, y, model, loss_fn, optimizer):
    n = len(X)
    start = 0
    while start < n:
        X_sample = X[start:min(start+batch_size, n)]
        y_sample = y[start:min(start+batch_size, n)]
        pred = model(X_sample)
        # print(pred.shape, y_sample.shape)
        loss = loss_fn(pred, y_sample)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        start += batch_size
        if start % 100 == 0:
            loss, current = loss.item(), start
            print(f"loss: {loss:>7f}  [{current:>5d}/{n:>5d}]")

In [10]:
def test(X, y, model):
    n = len(X)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        pred = model(X)
        test_loss = loss_fn(pred, y).sum().item()
        correct = (pred.argmax(1) == y).sum().item()
    # test_loss /= n
    correct /= n
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, loss: {test_loss:>8f} \n")

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
batch_size = 64
epochs = 5
for t in range(epochs):
    start = time.time()
    print(f"Epoch {t+1}\n-------------------------------")
    train(vectors_train, labels_train, model, loss_fn, optimizer)
    test(vectors_test, labels_test, model)
    print(f'Cost time: {time.time() - start:.4f}')
print("Done!")


Epoch 1
-------------------------------
loss: 1.690273  [ 1600/19437]
loss: 1.560850  [ 3200/19437]
loss: 1.603476  [ 4800/19437]
loss: 1.444831  [ 6400/19437]
loss: 1.587663  [ 8000/19437]
loss: 1.372534  [ 9600/19437]
loss: 1.561523  [11200/19437]
loss: 1.387775  [12800/19437]
loss: 1.425275  [14400/19437]
loss: 1.407422  [16000/19437]
loss: 1.577097  [17600/19437]
loss: 1.287098  [19200/19437]
Test Error: 
 Accuracy: 46.1%, loss: 1.398483 

Cost time: 183.9810
Epoch 2
-------------------------------
loss: 1.510795  [ 1600/19437]
loss: 1.278479  [ 3200/19437]
loss: 1.438341  [ 4800/19437]
loss: 1.294572  [ 6400/19437]
loss: 1.573930  [ 8000/19437]
loss: 1.293692  [ 9600/19437]
loss: 1.547958  [11200/19437]
loss: 1.365297  [12800/19437]
loss: 1.423805  [14400/19437]
loss: 1.390561  [16000/19437]
loss: 1.529966  [17600/19437]
loss: 1.230670  [19200/19437]
Test Error: 
 Accuracy: 47.3%, loss: 1.382396 

Cost time: 228.4507
Epoch 3
-------------------------------
loss: 1.483703  [ 1600/1

In [14]:
pred = model(vectors_train[:2])
print(pred, pred.shape, labels_train[:2].shape)
loss = loss_fn(pred, labels_train[:2])
loss

tensor([[-0.0317, -0.2670, -1.2810, -0.7244, -1.2785,  2.6586],
        [ 1.1599,  0.3993, -2.3770, -0.5475,  0.6775,  0.0266]],
       grad_fn=<AddmmBackward>) torch.Size([2, 6]) torch.Size([2])


tensor(0.8111, grad_fn=<NllLossBackward>)