In [None]:
import numpy as np
import pandas as pd
from gensim import corpora
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

In [None]:
# Load the training table and pull out the three columns used below
train_data = pd.read_csv(
    "sentiment-analysis-on-movie-reviews/train.tsv",
    sep='\t',
    encoding='ISO-8859-1',
)
# Each column is extracted as a plain NumPy array
phrase_id, phrase, sentiment = (
    train_data[column].values
    for column in ('PhraseId', 'Phrase', 'Sentiment')
)

In [None]:
# Build the vocabulary and the bag-of-words features from the raw phrases.
# Every phrase is lower-cased and split on whitespace into a token list.
train_texts = [sentence.lower().split() for sentence in phrase]
# gensim's Dictionary is a unique word -> id mapping (ids start at 0); it can
# be built from a list of token lists.
dictionary = corpora.Dictionary(train_texts)
# Sparse representation: one list of (token_id, count) pairs per phrase
corpus = [dictionary.doc2bow(tokens) for tokens in train_texts]
dict_len = len(dictionary)  # total number of distinct words

# Expand the sparse bag-of-words pairs into a dense (vocabulary, phrase) count
# matrix; uint8 keeps the memory footprint down.
word_feature = np.zeros((dict_len, len(phrase)), dtype='uint8')
for column, doc_bow in enumerate(corpus):
    for token_id, count in doc_bow:
        word_feature[token_id, column] = count

# One-hot sentiment labels: column j holds a single 1 in row sentiment[j]
sentiment_vec = np.zeros((5, len(sentiment)))
sentiment_vec[sentiment, np.arange(len(sentiment))] = 1

In [None]:
# Token count of every phrase; print the length of the longest one
sentence_len = [len(tokens) for tokens in train_texts]
print(max(sentence_len))

In [2]:
from torchtext.vocab import GloVe

# Pretrained GloVe word vectors: the '6B' set with 100 dimensions per token
glove = GloVe(dim=100, name='6B')

In [None]:
class Net(nn.Module):
    """Convolutional text classifier over 100-dimensional word embeddings.

    Three parallel 1-D convolutions (kernel widths 2, 3 and 4, 20 filters
    each) slide along the token axis. Every feature map is max-pooled over
    its whole length, the resulting 60 features are concatenated, and a
    linear layer maps them to 5 class scores that are returned as softmax
    probabilities.
    """

    def __init__(self):
        super(Net, self).__init__()
        # One convolution per n-gram width: Conv1d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv1d(100, 20, 2)
        self.conv2 = nn.Conv1d(100, 20, 3)
        self.conv3 = nn.Conv1d(100, 20, 4)
        self.fc1 = nn.Linear(60, 5)

    def forward(self, x):
        """Return class probabilities for one sequence or a batch of sequences.

        Args:
            x: float tensor of shape (100, L) or (N, 100, L) with L >= 4
                (embedding dimension first, token positions last).

        Returns:
            Tensor of shape (5,) for unbatched input or (N, 5) for batched
            input; the values along the last axis sum to 1.
        """
        pooled = []
        # Each kernel width gets its own convolution (the width-4 branch
        # previously reused conv2, which left conv3 unused and untrained).
        for conv in (self.conv1, self.conv2, self.conv3):
            feature_map = F.relu(conv(x))
            # Global max pooling: the window spans the whole feature map, so
            # the sequence length is not hard-coded.
            pooled.append(F.max_pool1d(feature_map, feature_map.size(-1)))
        # Concatenate along the channel axis and drop the length-1 token axis:
        # (60,) for unbatched input, (N, 60) for batched input.
        features = torch.cat(pooled, dim=-2).squeeze(-1)
        return F.softmax(self.fc1(features), dim=-1)


net = Net()
print(net)

In [None]:
# Inspect the learnable tensors: how many there are, then the shape of the
# first eight
params = list(net.parameters())
print(len(params))
for weight in params[:8]:
    print(weight.size())

In [None]:
# Smoke-test the untrained network on the first training phrase.
# The convolutions in `net` take 100 input channels, so the bag-of-words
# column (length 16540) cannot be fed to it. Build the input the same way the
# training loop does instead: GloVe embeddings of the phrase, zero-padded
# along the token axis to 52 positions and transposed to (100, 52).
# The tensor is named `sample_input` so the builtin `input` is not shadowed.
sample_tokens = train_texts[0]
if sample_tokens:
    sample_embed = glove.get_vecs_by_tokens(sample_tokens)[:52]
else:
    # No tokens to look up: the input is padding only
    sample_embed = torch.zeros(0, 100)
sample_pad = torch.zeros(52 - sample_embed.size(0), 100)
sample_input = torch.cat((sample_embed, sample_pad), 0).T
# Place the input on whatever device the network currently lives on, so the
# cell also works when re-run after the network has been moved.
sample_input = sample_input.to(next(net.parameters()).device)
print(sample_input)
with torch.no_grad():  # inference only, no gradients needed
    out = net(sample_input)
print(out)

In [None]:
# Training objective and optimiser: cross-entropy loss, and plain SGD over all
# network parameters with a learning rate of 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=1e-3)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Show which device was selected, then move the network onto it
print(device)
net.to(device)

In [None]:
# Train the network one phrase at a time for 3 epochs and report the wall time.
start = time.time()
max_len, embed_dim = 52, 100  # token positions and embedding size fed to net
n_train = min(136060, len(train_texts))  # phrases visited per epoch
log_every = 10  # report the mean loss every `log_every` phrases
for epoch in range(3):

    running_loss = 0.0
    for i in range(n_train):
        # get the inputs: GloVe embeddings, zero-padded to (embed_dim, max_len)
        tokens = train_texts[i]
        if tokens:
            # Truncate so the padding size below can never become negative
            word_embed = glove.get_vecs_by_tokens(tokens)[:max_len]
        else:
            # A whitespace-only phrase yields no tokens; use padding only.
            # NOTE(review): the lookup presumably fails on an empty list - confirm.
            word_embed = torch.zeros(0, embed_dim)
        zero_embed = torch.zeros(max_len - word_embed.size(0), embed_dim)
        inputs = torch.cat((word_embed, zero_embed), 0).T
        labels = torch.from_numpy(sentiment_vec[:, i]).float()  # one-hot target
        inputs, labels = inputs.to(device), labels.to(device)

        # forward + backward + optimize
        optimizer.zero_grad()
        outputs = net(inputs)
        # net already ends in a softmax, while CrossEntropyLoss applies
        # log-softmax to its input. Passing log-probabilities makes that
        # internal log-softmax a no-op, so the loss is the true cross-entropy
        # of the predicted probabilities. The clamp avoids log(0).
        loss = criterion(torch.log(outputs.clamp_min(1e-12)), labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % log_every == log_every - 1:    # print every 10 phrases
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / log_every))
            running_loss = 0.0

print('Finished Training')
end = time.time()
print(f'cost time: {end-start} seconds')

In [15]:
# Probe the GloVe lookup with an empty string, numbers, punctuation and a
# common word (lower-case fallback enabled), then show the result's shape
probe_tokens = ['', '1998', '199999998', ',', 'cat']
tensor = glove.get_vecs_by_tokens(probe_tokens, lower_case_backup=True)
print(tensor.size())

torch.Size([5, 100])
