In [1]:
import numpy as np
import pandas as pd
from gensim import corpora
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

In [2]:
# 数据的导入与预处理
train_data = pd.read_csv("sentiment-analysis-on-movie-reviews/train.tsv",
                         sep='\t', encoding='ISO-8859-1')
phrase_id = train_data.loc[:, 'PhraseId'].values
phrase = train_data.loc[:, 'Phrase'].values
sentiment = train_data.loc[:, 'Sentiment'].values

In [3]:
# 由原始表格数据生成词典与词向量
train_texts = [[word for word in sentence.lower().split()] for sentence in
               phrase]  # lower()使大写变小写
dictionary = corpora.Dictionary(train_texts)  # 词典的生成,可以根据列表或列表的列表
# gensim中的dictionary实际上是一个单词到id的唯一映射,是一种词典,id从0开始计算
corpus = [dictionary.doc2bow(text) for text in train_texts]  # 稀疏one-hot向量形式
dict_len = len(dictionary)  # 词典中词的总数

# 由稀疏的bow向量生成稠密的文本特征向量
word_feature = np.zeros((dict_len, len(phrase)), dtype='uint8')  # unit8降低内存
for i in range(len(corpus)):
    for bow in corpus[i]:
        word_feature[bow[0], i] = bow[1]

# 情感标签特征向量的生成(one-hot形式)
sentiment_vec = np.zeros((5, len(sentiment)))
for i in range(len(sentiment)):
    sentiment_vec[sentiment[i], i] = 1

In [4]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # kernel
        self.conv1 = nn.Conv1d(1, 1, 5)
        self.conv2 = nn.Conv1d(1, 1, 5)
        self.conv3 = nn.Conv1d(1, 1, 5)
        self.conv4 = nn.Conv1d(1, 1, 5)
        self.fc1 = nn.Linear(1030, 100)
        self.fc2 = nn.Linear(100, 5)

    def forward(self, x):
        # Max pooling
        x = F.max_pool1d(F.relu(self.conv1(x)), 2)
        x = F.max_pool1d(F.relu(self.conv2(x)), 2)
        x = F.max_pool1d(F.relu(self.conv3(x)), 2)
        x = F.max_pool1d(F.relu(self.conv4(x)), 2)
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x))
        return x


net = Net()
print(net)

Net(
  (conv1): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv2): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv3): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv4): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (fc1): Linear(in_features=1030, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=5, bias=True)
)


In [5]:
params = list(net.parameters())
print(len(params))
for i in range(12):
    print(params[i].size())

12
torch.Size([1, 1, 5])
torch.Size([1])
torch.Size([1, 1, 5])
torch.Size([1])
torch.Size([1, 1, 5])
torch.Size([1])
torch.Size([1, 1, 5])
torch.Size([1])
torch.Size([100, 1030])
torch.Size([100])
torch.Size([5, 100])
torch.Size([5])


In [6]:
input = torch.from_numpy(word_feature[:, 0].T,).reshape(1, 16540).float()
print(input)
out = net(input)
print(out)

tensor([[1., 1., 2.,  ..., 0., 0., 0.]])
tensor([[0.2091, 0.1890, 0.2037, 0.2065, 0.1917]], grad_fn=<SoftmaxBackward0>)


  x = F.softmax(self.fc2(x))


In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
net.to(device)

cuda:0


Net(
  (conv1): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv2): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv3): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (conv4): Conv1d(1, 1, kernel_size=(5,), stride=(1,))
  (fc1): Linear(in_features=1030, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=5, bias=True)
)

In [12]:
start = time.time()
for epoch in range(3):
    
    running_loss = 0.0
    for i in range(136060):
        # get the inputs
        inputs = torch.from_numpy(word_feature[:, i].T,).reshape(1, 16540).float()
        labels = torch.from_numpy(sentiment_vec[:, i].T,).reshape(1, 5).float()
        optimizer.zero_grad()
        inputs, labels = inputs.to(device), labels.to(device)
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10000 == 9999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 10000))
            running_loss = 0.0

print('Finished Training')
end = time.time()
print(f'cost time: {end-start} seconds')

  x = F.softmax(self.fc2(x))


 2000 loss: 1.283
 4000 loss: 1.339
 6000 loss: 1.378
 8000 loss: 1.366
10000 loss: 1.353
12000 loss: 1.376
14000 loss: 1.359
16000 loss: 1.360
18000 loss: 1.341
20000 loss: 1.358
22000 loss: 1.368
24000 loss: 1.369
26000 loss: 1.356
28000 loss: 1.354
30000 loss: 1.375
32000 loss: 1.407
34000 loss: 1.422
36000 loss: 1.403
38000 loss: 1.429
40000 loss: 1.405
42000 loss: 1.382
44000 loss: 1.391
46000 loss: 1.414
48000 loss: 1.423
50000 loss: 1.371
52000 loss: 1.409
54000 loss: 1.402
56000 loss: 1.387
58000 loss: 1.389
60000 loss: 1.421
62000 loss: 1.383
64000 loss: 1.417
66000 loss: 1.384
68000 loss: 1.382
70000 loss: 1.387
72000 loss: 1.405
74000 loss: 1.454
76000 loss: 1.409
78000 loss: 1.415
80000 loss: 1.398
82000 loss: 1.373
84000 loss: 1.411
86000 loss: 1.377
88000 loss: 1.394
90000 loss: 1.421
92000 loss: 1.415
94000 loss: 1.402
96000 loss: 1.430
98000 loss: 1.398
100000 loss: 1.389
102000 loss: 1.416
104000 loss: 1.389
106000 loss: 1.367
108000 loss: 1.422
110000 loss: 1.406
1120