In [1]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB




#### Loading Dataset - Preprocessing on the Tweets

In [3]:
datas_dir = r'/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/Datas/train.csv'
destination_folder = '/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/Datas'

In [4]:
df = pd.read_csv(datas_dir, engine="python", header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df[0]=df[0].replace(to_replace=4,value=1)
df[0].value_counts()

1    800000
0    800000
Name: 0, dtype: int64

In [6]:
df.sample(50000).to_csv("sentiment140-small.csv", header=None, index=None)

In [7]:
TEXT = torchtext.legacy.data.Field(tokenize='spacy', lower=True, include_lengths= True)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

fields = [('label', LABEL), ('id',None),('date',None),('query',None),
      ('name',None), ('text', TEXT),('category',None)]

dataset = torchtext.legacy.data.TabularDataset(
        path="sentiment140-small.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

(train_data, test_data, valid_data) = dataset.split(split_ratio=[0.8,0.1,0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))

Number of train data: 40000
Number of test data: 5000
Number of validation data: 5000


In [8]:
# An example from the training set
print(vars(train_data.examples[0]))

{'label': '1', 'text': ['getting', 'ready', 'for', 'church', '...', 'cooking', 'out', 'later']}


#### Bulding Vocabulary

In [77]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

TEXT.vocab.freqs.most_common(10)

[('i', 24910),
 ('!', 22553),
 ('.', 20134),
 (' ', 14450),
 ('to', 14175),
 ('the', 13027),
 (',', 12049),
 ('a', 9612),
 ('my', 7923),
 ('and', 7629)]

In [78]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device = device,
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True)

#### Model - LSTM

In [87]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                                                  bidirectional=bidirectional,
                                                  dropout=dropout)

        self.predictor = nn.Linear(hidden_dim*2, output_dim)

        self.dropout = nn.Dropout(dropout)
      
    def forward(self, text, text_lengths):

        embedded = self.dropout(self.embedding(text)) 
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)


        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))


        return self.predictor(hidden)

In [88]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 150
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model_1D = LSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

BIDIRECTIONAL = True
model_2D = LSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

In [90]:
pretrained_embeddings = TEXT.vocab.vectors
model_1D.embedding.weight.data.copy_(pretrained_embeddings)
model_2D.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 3.5870e-02,  1.7993e+00,  1.1746e+00,  ...,  6.0010e-01,
          1.9139e+00, -1.6888e+00],
        [-1.5553e+00,  1.3104e+00, -1.0326e+00,  ..., -1.1747e+00,
         -6.6917e-01,  1.1146e+00],
        [-4.6539e-02,  6.1966e-01,  5.6647e-01,  ..., -3.7616e-01,
         -3.2502e-02,  8.0620e-01],
        ...,
        [-9.9313e-02,  9.0826e-01,  2.6145e-04,  ..., -7.1313e-01,
          4.1456e-01, -7.9174e-01],
        [ 4.7870e-01,  2.7702e+00, -1.2370e-01,  ..., -1.3711e-01,
         -1.3272e-01, -2.6547e-01],
        [ 4.6528e-01,  7.9971e-01,  6.9831e-01,  ...,  1.2706e+00,
          4.5536e-01,  1.6046e+00]])

In [91]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model_1D.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_1D.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


model_2D.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_2D.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

#### Helper Functions

In [85]:
def batch_accuracy(predictions, label):

    preds = torch.round(torch.sigmoid(predictions))
    correct = (preds == label).float()
    accuracy = correct.sum() / len(correct)

    return accuracy

def timer(start_time, end_time):

    time = end_time - start_time
    mins = int(time / 60)
    secs = int(time - (mins * 60))

    return mins, secs
    
def train(model, iterator, optimizer, criterion):

    training_loss = 0.0
    training_acc = 0.0
    
    model.train()
     
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        text_lengths = text_lengths.cpu()
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)

        accuracy = batch_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        training_loss += loss.item()
        training_acc += accuracy.item()

    return training_loss / len(iterator), training_acc / len(iterator)

def evaluate(model, iterator, criterion):

    eval_loss = 0.0
    eval_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            text_lengths = text_lengths.cpu()
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            accuracy = batch_accuracy(predictions, batch.label)

            eval_loss += loss.item()
            eval_acc += accuracy.item()
        
    return eval_loss / len(iterator), eval_acc / len(iterator)

#### Training the Model - 1 Direction

In [92]:
optimizer = optim.Adam(model_1D.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()

model = model_1D.to(device)
criterion = criterion.to(device)

In [94]:
NUM_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model_1D, train_iterator, optimizer, criterion)
    
    valid_loss, valid_acc = evaluate(model_1D, valid_iterator, criterion)

    end_time = time.time()

    mins, secs = timer(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model-small.pt')

    print("Epoch {}:".format(epoch+1))
    print("\t Total Time: {}m {}s".format(mins, secs))
    print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
    print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

Epoch 1:
	 Total Time: 0m 5s
	 Train Loss 0.55 | Train Accuracy: 72.39%
	 Validation Loss 0.49 | Validation Accuracy: 75.92%
Epoch 2:
	 Total Time: 0m 4s
	 Train Loss 0.49 | Train Accuracy: 76.27%
	 Validation Loss 0.47 | Validation Accuracy: 77.97%
Epoch 3:
	 Total Time: 0m 4s
	 Train Loss 0.46 | Train Accuracy: 78.92%
	 Validation Loss 0.47 | Validation Accuracy: 77.52%
Epoch 4:
	 Total Time: 0m 5s
	 Train Loss 0.43 | Train Accuracy: 80.47%
	 Validation Loss 0.43 | Validation Accuracy: 80.92%
Epoch 5:
	 Total Time: 0m 5s
	 Train Loss 0.4 | Train Accuracy: 81.86%
	 Validation Loss 0.43 | Validation Accuracy: 80.62%
Epoch 6:
	 Total Time: 0m 4s
	 Train Loss 0.38 | Train Accuracy: 83.2%
	 Validation Loss 0.43 | Validation Accuracy: 80.25%
Epoch 7:
	 Total Time: 0m 5s
	 Train Loss 0.37 | Train Accuracy: 83.82%
	 Validation Loss 0.44 | Validation Accuracy: 80.27%
Epoch 8:
	 Total Time: 0m 5s
	 Train Loss 0.35 | Train Accuracy: 85.21%
	 Validation Loss 0.43 | Validation Accuracy: 81.07%
Ep

#### Training the Model - BiDirectional

In [95]:
optimizer = optim.Adam(model_2D.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()

model = model_2D.to(device)
criterion = criterion.to(device)

In [96]:
NUM_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model_2D, train_iterator, optimizer, criterion)
    
    valid_loss, valid_acc = evaluate(model_2D, valid_iterator, criterion)

    end_time = time.time()

    mins, secs = timer(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model-small.pt')

    print("Epoch {}:".format(epoch+1))
    print("\t Total Time: {}m {}s".format(mins, secs))
    print("\t Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 2), round(train_acc*100, 2)))
    print("\t Validation Loss {} | Validation Accuracy: {}%".format(round(valid_loss, 2), round(valid_acc*100, 2)))

Epoch 1:
	 Total Time: 0m 8s
	 Train Loss 0.59 | Train Accuracy: 67.76%
	 Validation Loss 0.5 | Validation Accuracy: 75.47%
Epoch 2:
	 Total Time: 0m 8s
	 Train Loss 0.5 | Train Accuracy: 75.54%
	 Validation Loss 0.47 | Validation Accuracy: 76.99%
Epoch 3:
	 Total Time: 0m 8s
	 Train Loss 0.46 | Train Accuracy: 78.22%
	 Validation Loss 0.47 | Validation Accuracy: 78.18%
Epoch 4:
	 Total Time: 0m 8s
	 Train Loss 0.44 | Train Accuracy: 80.01%
	 Validation Loss 0.43 | Validation Accuracy: 80.1%
Epoch 5:
	 Total Time: 0m 10s
	 Train Loss 0.41 | Train Accuracy: 81.49%
	 Validation Loss 0.44 | Validation Accuracy: 79.9%
Epoch 6:
	 Total Time: 0m 8s
	 Train Loss 0.39 | Train Accuracy: 82.64%
	 Validation Loss 0.44 | Validation Accuracy: 79.41%
Epoch 7:
	 Total Time: 0m 8s
	 Train Loss 0.37 | Train Accuracy: 83.73%
	 Validation Loss 0.44 | Validation Accuracy: 80.45%
Epoch 8:
	 Total Time: 0m 8s
	 Train Loss 0.35 | Train Accuracy: 84.83%
	 Validation Loss 0.44 | Validation Accuracy: 80.45%
Epo