In [0]:
import os
from google.colab import drive
drive.mount('/content/gdrive')
!pwd
os.chdir('gdrive/My Drive/research/Datasets/Amazon_UCSD')
!pwd

In [0]:
import pandas as pd
import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext import data, datasets
from torchtext.data import TabularDataset, BucketIterator, Iterator

def split_save_data(path_to_csv, path_to_split, train_size=16000, valid_size=4000, test_size=5000):
    """Write small train/validation/test CSV files cut from the head of a corpus.

    The corpus is read with pandas and sliced by row position, in file order
    (no shuffling): the first ``train_size`` rows form the training split, the
    next ``valid_size`` rows the validation split, and the following
    ``test_size`` rows the test split. If the corpus has fewer rows than the
    three sizes add up to, the later splits are simply shorter (possibly empty).

    Args:
        path_to_csv: Path of the CSV file holding the full corpus.
        path_to_split: Directory that receives ``train_small.csv``,
            ``valid_small.csv`` and ``test_small.csv``. It must already exist.
        train_size: Number of leading rows for the training split.
        valid_size: Number of rows for the validation split.
        test_size: Number of rows for the test split.

    Returns:
        None. The three files are written without the DataFrame index.
    """
    corpus = pd.read_csv(path_to_csv)

    # Row boundaries of the three consecutive, non-overlapping slices.
    valid_start = train_size
    test_start = valid_start + valid_size
    test_end = test_start + test_size

    splits = (
        ('/train_small.csv', corpus.iloc[:valid_start]),
        ('/valid_small.csv', corpus.iloc[valid_start:test_start]),
        ('/test_small.csv', corpus.iloc[test_start:test_end]),
    )

    # File names are appended with a plain '/' so they match the paths the
    # reader side builds in the same way.
    for file_suffix, frame in splits:
        frame.to_csv(path_to_split + file_suffix, index=False)

class BatchWrapper:
    """Turn an iterable of batch objects into ``(x, y)`` pairs with binary labels.

    Each batch is expected to expose its input under the attribute named
    ``x_var`` and its label tensor under the attribute named ``y_var``.
    Labels are binarized on the fly: values below ``positive_threshold``
    become 0 and values at or above it become 1.

    Args:
        iterator: Iterable of batch objects; must support ``len()`` for
            ``__len__`` to work.
        x_var: Attribute name of the (single) input on each batch.
        y_var: Attribute name of the label tensor on each batch.
        positive_threshold: Smallest raw label value mapped to class 1.
    """

    def __init__(self, iterator, x_var, y_var, positive_threshold=4):
        self.iterator = iterator
        self.x_var = x_var
        self.y_var = y_var
        self.positive_threshold = positive_threshold
        # Echo the label attribute name at construction time (existing behavior).
        print (self.y_var)

    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)  # only one input attribute is supported
            raw_y = getattr(batch, self.y_var)
            # Build a NEW tensor instead of overwriting the batch's labels in
            # place: if the underlying iterator hands out the same batch object
            # again (e.g. cached batches), an in-place rewrite would see the
            # already-binarized 0/1 values and collapse every label to 0.
            y = (raw_y >= self.positive_threshold).to(raw_y.dtype)
            yield (x, y)

    def __len__(self):
        return len(self.iterator)

def read_test_valid(path_to_split, device = 'cpu', batch_size = 64, max_vocab_size = 25_000):
    """Load the small train/validation/test CSVs and wrap them in batch iterators.

    Reads ``train_small.csv``, ``valid_small.csv`` and ``test_small.csv`` from
    ``path_to_split``, treating the first CSV column as ``reviewText`` and the
    second as ``overall`` (the header row is skipped). Text is whitespace
    tokenized and lower-cased; the vocabulary is built from the training split
    only.

    Args:
        path_to_split: Directory containing the three CSV files.
        device: Device handed to the iterators for the produced batches.
        batch_size: Training batch size. Validation and test batches are four
            times larger.
        max_vocab_size: Cap passed to ``TEXT.build_vocab`` as ``max_size``.

    Returns:
        ``(train_iter, valid_iter, test_iter, vocab)`` where each iterator is a
        ``BatchWrapper`` yielding ``(reviewText, overall)`` pairs and ``vocab``
        is the vocabulary of the text field. Because the text field uses
        ``include_lengths=True``, the ``reviewText`` element is itself a
        ``(token_ids, lengths)`` pair.
    """
    # Text column: whitespace tokenizer, lower-cased, lengths returned with ids.
    TEXT = Field(sequential=True,
                 tokenize=str.split,
                 lower=True,
                 include_lengths=True)

    # Label column: already numeric, so no vocabulary is involved.
    LABEL = Field(sequential=False,
                  use_vocab=False,
                  dtype = torch.long)

    # Order matters: fields are matched to CSV columns by position.
    fields = [('reviewText', TEXT), ('overall', LABEL)]

    # Load the three splits in one call. `path` is empty because the file
    # arguments already carry the directory.
    train_data, valid_data, test_data = TabularDataset.splits(
        path='',
        train=path_to_split + '/train_small.csv',
        validation=path_to_split + '/valid_small.csv',
        test=path_to_split + '/test_small.csv',
        format='csv',
        skip_header=True,
        fields=fields)

    # The vocabulary may only see the training portion. LABEL needs no
    # build_vocab call: it is declared with use_vocab=False, so a vocabulary
    # built for it would never be consulted.
    TEXT.build_vocab(train_data, max_size = max_vocab_size)
    vocab = TEXT.vocab

    train_iter = BucketIterator(
        train_data,
        batch_size = batch_size,
        device = device,
        sort_key=lambda x: len(x.reviewText),
        sort_within_batch=False,
        repeat=False
    )

    # Evaluation splits use larger batches since no gradients are kept for them.
    valid_iter = BucketIterator(
        valid_data,
        batch_size = batch_size * 4,
        device = device,
        repeat = False
    )

    test_iter = BucketIterator(
        test_data,
        batch_size = batch_size * 4,
        device = device,
        repeat = False
    )

    # Expose plain (x, y) tuples with binarized labels to the training code.
    train_iter = BatchWrapper(train_iter, 'reviewText', 'overall')
    valid_iter = BatchWrapper(valid_iter, 'reviewText', 'overall')
    test_iter = BatchWrapper(test_iter, 'reviewText', 'overall')

    return train_iter, valid_iter, test_iter, vocab

In [0]:
import torch
from torch import nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Three-layer perceptron over bag-of-words count vectors.

    Layers: ``D_in -> D_hidden -> D_hidden // 2 -> D_out`` with ReLU after the
    first two layers and dropout after the first one only. The input batch of
    token indices is converted to a sparse ``vocab_size x batch_size`` count
    matrix before being fed to the first layer.

    Args:
        D_in: Input dimensionality; also used as the vocabulary size.
        D_hidden: Width of the first hidden layer.
        D_out: Number of output units (class logits).
        device: Device the sparse input is moved to in ``forward``. The module
            parameters themselves are not moved by this argument.
    """

    def __init__(self, D_in, D_hidden, D_out, device='cpu'):
        super(MLP, self).__init__()
        self.vocab_size = D_in
        self.D_in = D_in
        self.D_hidden = D_hidden
        self.D_out = D_out

        half_hidden = D_hidden // 2
        self.l1 = nn.Linear(D_in, D_hidden)
        self.l2 = nn.Linear(D_hidden, half_hidden)
        self.l3 = nn.Linear(half_hidden, D_out)

        # nn.Dropout (module) rather than F.dropout so that model.train() /
        # model.eval() switch it on and off. See
        # https://stackoverflow.com/questions/53419474/nn-dropout-vs-f-dropout-pytorch
        self.dropout = nn.Dropout()

        self.device = device

    def indexTensor2sparseTensor(self, text, text_length):
        """Convert a padded batch of token indices into a sparse count matrix.

        Args:
            text: Integer tensor of shape ``max_len x batch_size``; column ``c``
                holds the token indices of document ``c``, padded at the end.
            text_length: 1-D integer tensor of length ``batch_size`` giving the
                number of real (non-padding) tokens in each column.

        Returns:
            Sparse COO float tensor of shape ``vocab_size x batch_size``, on the
            same device as ``text``. Entry ``(v, c)`` is the number of times
            token ``v`` occurs among the first ``text_length[c]`` positions of
            column ``c`` (repeated indices are stored uncoalesced and add up).
        """
        batch_size = text_length.size(0)
        max_len = text.size(0)
        lengths = text_length.to(text.device)

        # is_token[i, c] is True when position i of document c is a real token,
        # i.e. i < text_length[c]; padding positions are masked out.
        positions = torch.arange(max_len, device=text.device).unsqueeze(1)
        is_token = positions < lengths.unsqueeze(0)

        token_ids = text[is_token].long()
        doc_ids = torch.arange(batch_size, device=text.device).expand(max_len, batch_size)[is_token]

        # Row 0: vocabulary index, row 1: document (column) index. Stacking keeps
        # the 2 x nnz layout even when nnz == 0 (e.g. all lengths are zero).
        indices = torch.stack((token_ids, doc_ids))
        values = torch.ones(token_ids.numel(), dtype=torch.float, device=text.device)

        return torch.sparse_coo_tensor(indices, values, (self.vocab_size, batch_size))

    def forward(self, text, text_length):
        """Compute class logits for a batch.

        Args:
            text: ``max_len x batch_size`` tensor of token indices.
            text_length: 1-D tensor with the true length of each column.

        Returns:
            Tensor of shape ``batch_size x D_out`` with unnormalized scores.
        """
        x = self.indexTensor2sparseTensor(text, text_length)
        x = x.to(self.device)

        # Transpose to batch_size x vocab_size so each row is one document.
        h = F.relu(self.l1(x.t()))
        h = self.dropout(h)

        # No dropout after the second layer.
        h = F.relu(self.l2(h))

        out = self.l3(h)
        return out

In [0]:
import pandas as pd
import torch
from torch import nn

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both the split CSVs and the model checkpoint live in the current working directory.
path_to_split = './'
model_path = './'

# Build the wrapped iterators and the training-set vocabulary.
train_iter, valid_iter, test_iter, vocab = read_test_valid(path_to_split, device = device)

HIDDEN_SIZE = 128
# Two output logits: BatchWrapper maps the raw labels to 0 / 1.
num_classes = 2
# The input width of the network equals the vocabulary size.
model = MLP(len(vocab), HIDDEN_SIZE, num_classes, device)

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2, weight_decay = 1e-5)
loss_func = nn.CrossEntropyLoss(reduction = 'mean')

# Move the parameters (and the loss module) to the selected device.
model.to(device)
loss_func.to(device)

# NOTE(review): this module-level name is not read by any code visible in this
# notebook -- train_validate and load_model take their own `save_path` argument
# and append '/best_model.pt' to it themselves.
save_path = './best_model.pt'

def train_validate(num_epoches = 2, save_path=model_path):
    """Train the global ``model`` and checkpoint the best-validating epoch.

    For every epoch the model is trained on ``train_iter`` with ``optimizer``
    and ``loss_func``, then scored on ``valid_iter``. The weights of the epoch
    with the highest validation accuracy are written to
    ``save_path + '/best_model.pt'`` once training has finished.

    Args:
        num_epoches: Number of passes over the training data.
        save_path: Directory that receives ``best_model.pt``.

    Returns:
        None. Progress is printed; the checkpoint is a dict with the keys
        ``'epoch'`` and ``'model_state_dict'``.
    """
    import copy

    # We don't know in advance how many epochs are enough, so track the best one.
    best_acc = 0
    best_epoch = -1
    to_save = {}

    for e in range(num_epoches):
        # Dropout must be active while training.
        model.train()
        epoch_loss = 0.0
        for x, y in train_iter:
            optimizer.zero_grad()

            text, text_lengths = x
            out = model(text, text_lengths)

            loss = loss_func(out, y)
            loss.backward()
            optimizer.step()
            # Accumulate a Python float, not the loss tensor, so the running sum
            # does not hold on to tensors (and their autograd history).
            epoch_loss += loss.item()

        # max(..., 1) keeps the report from dividing by zero on an empty iterator.
        print('epoch = {}, loss = {}'.format(e, epoch_loss / max(len(train_iter), 1)))

        correct, total = 0, 0
        # Dropout off and no gradient bookkeeping during evaluation.
        model.eval()
        with torch.no_grad():
            for val_x, val_y in valid_iter:
                text, text_lengths = val_x
                # out has shape batch_size x num_classes
                out = model(text, text_lengths)
                correct += torch.max(out, 1)[1].eq(val_y).sum().item()
                total += out.size()[0]
        cur_acc = float(correct) / total if total else 0.0
        print('Validation accuracy = {}'.format(cur_acc))

        if cur_acc > best_acc:
            best_acc = cur_acc
            best_epoch = e

            # state_dict() returns references to the live parameters, which
            # later epochs keep updating. Deep-copy it so the snapshot really
            # holds this epoch's weights.
            to_save = {
                'epoch': e,
                'model_state_dict': copy.deepcopy(model.state_dict())
            }

    # Report and save the best model.
    print(f'best epoch = {best_epoch} with best validation accuracy = {best_acc}')
    torch.save(to_save, save_path + '/best_model.pt')

def load_model(save_path):
    """Rebuild an ``MLP`` and restore the weights stored in ``best_model.pt``.

    Args:
        save_path: Directory containing ``best_model.pt``; the checkpoint must
            be a dict with a ``'model_state_dict'`` entry.

    Returns:
        The restored model, placed on the global ``device`` and switched to
        evaluation mode.
    """
    model = MLP(len(vocab), HIDDEN_SIZE, num_classes, device = device)

    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be opened on a CPU-only runtime.
    # NOTE: torch.load unpickles the file -- only open checkpoints you trust.
    checkpoint = torch.load(save_path + '/best_model.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Move the model to the GPU if there is one.
    model.to(device)

    # Evaluation mode disables dropout.
    model.eval()
    return model


def test_model(model):
    """Print the accuracy of ``model`` on the global ``test_iter``.

    The model is put in evaluation mode, the device of its first parameter is
    printed, and predictions (arg-max over the class dimension) are compared
    with the labels of every test batch.

    Args:
        model: Module called as ``model(text, text_lengths)`` and returning a
            ``batch_size x num_classes`` score tensor.

    Returns:
        None. The accuracy is reported with ``print``.
    """
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        # Evaluation mode: the dropout layer must be inactive.
        model.eval()
        print(next(model.parameters()).device)
        for features, labels in test_iter:
            tokens, lengths = features
            # logits has shape batch_size x num_classes
            logits = model(tokens, lengths)
            _, predicted = torch.max(logits, 1)
            n_correct += predicted.eq(labels).sum()
            n_seen += logits.size()[0]
        accuracy = float(n_correct) / n_seen
    print('Test accuracy = {}'.format(accuracy))

# Train with the default number of epochs; the checkpoint goes under model_path,
# which is train_validate's default save_path.
train_validate()
# Rebind `model` to a fresh MLP restored from that checkpoint, then score it on
# the test split.
model = load_model(model_path)
test_model(model)

overall
overall
overall
epoch = 0, loss = 0.4739110767841339
Validation accuracy = 0.80325
epoch = 1, loss = 0.3649899363517761
Validation accuracy = 0.82275
best epoch = 1 with best validation accuracy = 0.82275
cuda:0
Test accuracy = 0.8278
