In [1]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import torch.nn.functional as F
import import_ipynb
from text_process_function import *

importing Jupyter notebook from text_process_function.ipynb


In [2]:
vocab_dim = 20 
maxlen = 40  # Maximum length of text retention
   
embedding_weights = np.load("embeddings.npy") 
# Set a zero vector for words that do not appear in the vocabulary
embedding_weights = np.r_[np.zeros((1, vocab_dim)),embedding_weights]

f = open("vocab.pkl", 'rb') 
index_dict = pickle.load(f)    # index dictionary {'word': idx}

# Index each word + 1 because of the zero vector
for key, value in index_dict.items():  
    index_dict[key] = value + 1 

with open("../twitter-datasets/train_neg.txt", "r", encoding='UTF-8') as f:
    neg_data = f.readlines()
with open("../twitter-datasets/train_pos.txt", "r", encoding='UTF-8') as f:
    pos_data = f.readlines()
    
data = neg_data + pos_data

In [5]:
label_list = ([0] * len(neg_data) + [1] * len(pos_data))

In [6]:
####LSTM####
train_x,val_x,train_y,val_y = train_test_split(data, label_list, test_size=0.2)
train_x = text_to_index_array(index_dict, train_x)
val_x = text_to_index_array(index_dict, val_x)
train_y = np.array(train_y) 
val_y = np.array(val_y)

In [7]:
from torch.nn.utils.rnn import pad_sequence

# Cut the data to the same specified length 
train_x = pad_sequence([torch.from_numpy(np.array(x)) for x in train_x],batch_first=True).float() 
val_x = pad_sequence([torch.from_numpy(np.array(x)) for x in val_x],batch_first=True).float()
train_x = text_cut_to_same_long(train_x)
val_x = text_cut_to_same_long(val_x)

# Index to vector
train_x = creat_wordvec_tensor(embedding_weights,train_x)
val_x = creat_wordvec_tensor(embedding_weights,val_x)

print("train shape： ", train_x.shape)
print("val shape： ", val_x.shape)

train shape：  (160000, 40, 20)
val shape：  (40000, 40, 20)


In [8]:
n_epoch = 60 
batch_size = 64 

from torch.utils.data import TensorDataset, DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    
class lstm(nn.Module):
    def __init__(self):
        super(lstm, self).__init__()
        self.lstm = nn.LSTM(
            input_size=vocab_dim,
            hidden_size=64,
            batch_first=True)   
                                  
        self.fc = nn.Linear(64, 2)
        
    def forward(self, x):
        out, (h_0, c_0) = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        out = torch.sigmoid(out)    
        return out, h_0

model = lstm()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())

In [9]:
####------train---------####
from sklearn.metrics import accuracy_score, classification_report
print ('————————train————————')
for epoch in range(n_epoch):
    correct = 0
    total = 0
    epoch_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):        

        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()   
        optimizer.zero_grad()
        # data,target = data.cuda(),target.cuda()  
        output, h_state = model(data)
        #labels = output.argmax(dim= 1)
        #acc = accuracy_score(target, labels)
        
        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        
        optimizer.zero_grad()
        loss = F.cross_entropy(output, target) 
        epoch_loss += loss.item()
        loss.backward() 
        optimizer.step()
    
    loss = epoch_loss / (batch_idx + 1)
    print ('epoch:%s'%epoch, 'accuracy：%.3f%%'%(correct *100 / total), 'loss = %s'%loss)
    

————————train————————
epoch:0 accuracy：57.522% loss = 0.6667917513370514
epoch:1 accuracy：62.572% loss = 0.6408646491765976
epoch:2 accuracy：63.430% loss = 0.6354870833158494
epoch:3 accuracy：63.774% loss = 0.6325310456991196
epoch:4 accuracy：63.113% loss = 0.6363121729612351
epoch:5 accuracy：64.098% loss = 0.6288930665016175
epoch:6 accuracy：64.226% loss = 0.6247829536676407
epoch:7 accuracy：64.694% loss = 0.6197969586849212
epoch:8 accuracy：65.730% loss = 0.6140653542876243
epoch:9 accuracy：66.809% loss = 0.6077569638848305
epoch:10 accuracy：67.877% loss = 0.6007283889532089
epoch:11 accuracy：68.668% loss = 0.5944177234292031
epoch:12 accuracy：69.793% loss = 0.5874420775175094
epoch:13 accuracy：70.544% loss = 0.5823871362805366
epoch:14 accuracy：71.093% loss = 0.5774488172292709
epoch:15 accuracy：71.772% loss = 0.5722881619691849
epoch:16 accuracy：72.332% loss = 0.5675608823418617
epoch:17 accuracy：72.967% loss = 0.5626050394058227
epoch:18 accuracy：73.412% loss = 0.5597102617263794


In [10]:
####------validation---------####
print ('————————validation————————')
for epoch in range(1):
    correct = 0
    total = 0
    epoch_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(test_loader):        
        #print (data.shape)
       
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()   
        optimizer.zero_grad()
        # data,target = data.cuda(),target.cuda() 
        output, h_state = model(data)
        #labels = output.argmax(dim= 1)
        #acc = accuracy_score(target, labels)
        
        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        
        optimizer.zero_grad()
        loss = F.cross_entropy(output, target)
        epoch_loss += loss.item()
        loss.backward() 
        optimizer.step()
    
    loss = epoch_loss / (batch_idx + 1)
    print ('epoch:%s'%epoch, 'accuracy：%.3f%%'%(correct *100 / total), 'loss = %s'%loss)

————————validation————————
epoch:0 accuracy：71.890% loss = 0.5757300402641297


In [12]:
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/Glove/Glove_LSTM.pt')