In [1]:
import torch.utils.data
import sklearn.metrics
import torch
import pandas
from torch.utils.data import Dataset
import tqdm
import spacy
import random
import numpy as np
import math
import os
import torchvision 
from torchvision import models, datasets, transforms
# Seed Python's `random` module and PyTorch with the same fixed value.
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)
# Alternative pipeline load, left disabled:
#nlp = spacy.load('en')
# Module-level spaCy pipeline; SentimentDataset.__getitem__ calls it on each
# review and reads `token.vector` from the resulting tokens.
nlp = spacy.load('en_core_web_lg')

In [2]:
class SentimentDataset(Dataset):
    """Reviews from a tab-separated file, served as per-token vector sequences.

    The file is read with pandas, grouped by its ``id`` column, and only the
    first row of each group is kept. Each item is tokenised on demand with
    the module-level spaCy pipeline ``nlp``.
    """

    def __init__(self, path='sentiment.tsv'):
        """Load the review table.

        Args:
            path: Tab-separated file with a header row. The code reads the
                ``id``, ``review`` and ``sentiment`` columns from it.
                Defaults to ``'sentiment.tsv'``.
        """
        self.data = (
            pandas.read_csv(path, sep='\t', header=0)
            .groupby('id')
            .first()
        )

    def __len__(self):
        """Return the number of rows kept after grouping by ``id``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_vectors, length, sentiment)`` for the row at ``idx``.

        Args:
            idx: Positional row index, either an int or a one-element tensor.

        Returns:
            A tuple of three tensors: the stacked token vectors of the
            lower-cased review (one row per token), the number of tokens as
            a 0-d tensor, and the row's ``sentiment`` value as a 0-d tensor.
        """
        if isinstance(idx, torch.Tensor):
            idx = idx.item()
        sample = self.data.iloc[idx]
        doc = nlp(sample.review.lower(), disable=['parser', 'tagger', 'ner'])
        # Stack into a single float32 array first: handing torch.tensor() a
        # Python list of numpy arrays goes through a slow element-wise path.
        token_vectors = np.array([token.vector for token in doc],
                                 dtype=np.float32)
        return (torch.from_numpy(token_vectors),
                torch.tensor(len(token_vectors)),
                torch.tensor(sample.sentiment))
    
def collate(batch):
    """Merge dataset samples into one padded, length-sorted batch.

    Args:
        batch: List of ``(sequence, length, sentiment)`` tuples, where
            ``length`` and ``sentiment`` are 0-d tensors and ``sequence`` is
            a tensor whose first dimension is the number of steps.

    Returns:
        ``(sequences, lengths, sentiments)``: the sequences zero-padded to the
        longest one with the batch dimension first, plus the stacked lengths
        and sentiments, all ordered from longest to shortest sequence.
    """
    # sorted() works on a copy, so the caller's list keeps its order; the
    # sort is stable, so equally long samples keep their relative order.
    ordered = sorted(batch, key=lambda item: int(item[1]), reverse=True)
    sequences, lengths, sentiments = zip(*ordered)

    sequences = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True)
    sentiments = torch.stack(sentiments)
    lengths = torch.stack(lengths)
    return sequences, lengths, sentiments

In [3]:
class ModelLSTM (torch.nn.Module):
    """LSTM encoder followed by two ReLU-activated linear layers and a 2-way output.

    Args:
        input_dimensions: Feature size of every step of the input sequences.
        size: Hidden size of the LSTM and width of the linear layers.
        layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dimensions,size=128,layers=1):
        super().__init__()
        self.seq = torch.nn.LSTM(input_dimensions,size,layers)
        # The classifier consumes the final hidden state of every LSTM layer,
        # concatenated, hence size*layers input features.
        self.layer_one = torch.nn.Linear(size*layers, size)
        self.activation_one = torch.nn.ReLU()
        self.layer_two = torch.nn.Linear(size, size)
        self.activation_two = torch.nn.ReLU()
        self.shape_outputs = torch.nn.Linear(size, 2)

    def forward(self, inputs,lengths):
        """Compute two logits per sequence.

        Args:
            inputs: Padded batch with the batch dimension first.
            lengths: 1-D tensor with the unpadded length of every sequence.
                The batch may be in any order.

        Returns:
            Tensor of shape ``(batch, 2)``.
        """
        batch_size = lengths.shape[0]
        # pack_padded_sequence needs the lengths on the CPU; enforce_sorted=False
        # lets it accept batches that are not sorted by decreasing length and
        # makes the LSTM return its states in the original batch order.
        packed_inputs = torch.nn.utils.rnn.pack_padded_sequence(
            inputs,
            lengths.cpu(),
            batch_first = True,
            enforce_sorted = False)
        _, (hidden, _) = self.seq(packed_inputs)
        # hidden is (layers, batch, size): move the batch dimension to the
        # front, then flatten the per-layer states into one vector per sample.
        # reshape() copies when needed, because permute() leaves the tensor
        # non-contiguous and a plain view() would be rejected.
        buffer = hidden.permute(1,0,2).reshape(batch_size,-1)

        buffer = self.layer_one(buffer)
        buffer = self.activation_one(buffer)
        buffer = self.layer_two(buffer)
        buffer = self.activation_two(buffer)
        buffer = self.shape_outputs(buffer)
        return buffer

In [4]:

def valid(model, validationloader, loss_function=None):
    """Return the mean per-batch loss of ``model`` over ``validationloader``.

    The model is switched to evaluation mode and no gradients are tracked.

    Args:
        model: Module called as ``model(sequences, lengths)``.
        validationloader: Iterable yielding ``(sequences, lengths, sentiments)``.
        loss_function: Callable applied to ``(results, sentiments)``. When
            omitted, ``torch.nn.CrossEntropyLoss()`` is used.

    Returns:
        The mean of the batch losses as a numpy float.
    """
    if loss_function is None:
        loss_function = torch.nn.CrossEntropyLoss()
    loss_all = []
    model.eval()
    with torch.no_grad():
        for sequences, lengths, sentiments in validationloader:
            results = model(sequences, lengths)
            loss = loss_function(results, sentiments)
            loss_all.append(loss.item())
    return np.mean(np.array(loss_all))

def train(model, trainloader, optimizer, loss_function):
    """Run one optimisation pass over ``trainloader``.

    The model is put into training mode; for every
    ``(sequences, lengths, sentiments)`` batch the gradients are cleared, the
    loss of ``model(sequences, lengths)`` against ``sentiments`` is computed
    and back-propagated, and the optimizer takes one step.

    Returns:
        The mean of the recorded batch losses as a numpy float.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_lengths, batch_labels in trainloader:
        optimizer.zero_grad()
        predictions = model(batch_inputs, batch_lengths)
        batch_loss = loss_function(predictions, batch_labels)
        # Record the scalar before the backward pass, as a plain Python float.
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()
    return np.mean(np.array(batch_losses))


In [5]:
# Hold out 10% of the samples for validation; the rest is used for training.
dataset = SentimentDataset()
number_for_validating = int(len(dataset)*0.1)
number_for_training = len(dataset) - number_for_validating
valid_dataset, train_dataset = torch.utils.data.random_split(
    dataset, [number_for_validating, number_for_training])
trainloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, shuffle=True, collate_fn=collate)
# Evaluation gains nothing from shuffling, and a fixed order keeps the
# batch composition identical from epoch to epoch.
validationloader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=32, shuffle=False, collate_fn=collate)
print(len(valid_dataset),len(train_dataset),len(trainloader),len(validationloader))

# The input width of the model is the vector size of the first sample's tokens.
model = ModelLSTM(dataset[0][0].shape[1])
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.CrossEntropyLoss()
best_loss = float('inf')
best_epoch = 0
for epoch in range(50):
    train_loss = train(model, trainloader, optimizer, loss_function)
    valid_loss = valid(model, validationloader)
    if valid_loss < best_loss:
        best_loss = valid_loss
        best_epoch = epoch
        # Checkpoint the weights of the model being trained whenever the
        # validation loss improves.
        torch.save(model.state_dict(), 'best_model.pt')
    print('Epoch:%d, Current_loss:%.4f Best_epoch:%d Best_loss:%.4f'%(epoch,valid_loss,best_epoch, best_loss))


2500 22500 704 79


ValueError: too many values to unpack (expected 2)