<a href="https://colab.research.google.com/github/EmmaBenedetti/DeepLearningProjects/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import dependencies

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/DL_assignment2

/content/drive/MyDrive/DL_assignment2


In [3]:
import numpy as np
import random
import json
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import glob

from tqdm.notebook import tqdm

### Fix random seed

In [4]:
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic=True 

### Import meta info (tokens, number of users )

In [5]:
meta = json.load(open('./meta.json', 'r'))
tokens = meta['tokens']
num_token = len(tokens)
num_user = meta['num_user']

In [6]:
print('In dataset, there are {} number of tokens (words) and these tweets are from {} users'.format(num_token, num_user))

In dataset, there are 13369 number of tokens (words) and these tweets are from 8 users


### Load train and validation dataset

In [7]:
train_data = json.load(open('./train.json', 'r'))
valid_data = json.load(open('./valid.json', 'r'))
s_idx = 0

In [8]:
print('{} tweets in train dataset, {} tweets in valid dataset.'.format(len(train_data), len(valid_data)))
print('Each json file is a list of dictionary, and each dictionary has information of tweets')
print('[TWEET INFO]: user id, sentence, processed token id.')
print()

print('Sample train data: ', train_data[s_idx])
print()
print('Note that: tokens.index(word) = token_id')
print()
print('Example:')
print(train_data[0]['token_id'])
print([tokens.index(w) for w in train_data[s_idx]['sentence'].split()])

6400 tweets in train dataset, 356 tweets in valid dataset.
Each json file is a list of dictionary, and each dictionary has information of tweets
[TWEET INFO]: user id, sentence, processed token id.

Sample train data:  {'user_id': 0, 'sentence': 'i recently met lakeisha crum the first in her family to go to college loved her story', 'token_id': [5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]}

Note that: tokens.index(word) = token_id

Example:
[5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]
[5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]


### Define Dataset and DataLoader
- Note that below code for dataset and dataloader only supports `batch_size = 1`.
- Try to find out a way to batchfy the data.
- Even if you batchfy the data, put the `token_id` information into `sample['token_id']`

In [9]:
'''class tweetDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        sample['token_id'] = torch.Tensor(sample['token_id'])
        return sample'''

"class tweetDataset(Dataset):\n    def __init__(self, data):\n        self.data = data\n        \n    def __len__(self):\n        return len(self.data)\n    \n    def __getitem__(self, idx):\n        sample = self.data[idx]\n        sample['token_id'] = torch.Tensor(sample['token_id'])\n        return sample"

In [10]:
# find maximal token length for training and validation dataset
max_len_train = 0
for diz in train_data:
            max_len_train = max(max_len_train, len(diz['token_id']))

max_len_val = 0
for diz in valid_data:
            max_len_val = max(max_len_val, len(diz['token_id']))


print(f"max_len_train: {max_len_train}\nmax_len_val: {max_len_val}")
max_len = max(max_len_train, max_len_val)
print(f"Maximal token length: {max_len}")

max_len_train: 60
max_len_val: 47
Maximal token length: 60


In [11]:
class tweetDataset(Dataset):
    def __init__(self, data, max_len, num_token):
        self.data = data
        self.max_len = max_len
        self.num_token = num_token

        
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]

        #curr_user = sample['user_id']
        curr_token = sample['token_id']
        # padding
        padding_ls = [self.num_token for _ in range(self.max_len)]
        padding_ls[:len(curr_token)] = curr_token


        sample['token_id'] = torch.LongTensor(padding_ls[:-1])
        sample['output'] = torch.LongTensor(padding_ls[1:])
        #sample['user_id'] = torch.Tensor(curr_user)

        
        '''sample = self.data[idx]
        sample['token_id'] = torch.Tensor(sample['token_id'])'''
        return sample

In [12]:
batch_size = 64
train_dataset = tweetDataset(train_data, max_len, num_token)
valid_dataset = tweetDataset(valid_data, max_len,num_token)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [13]:
sample = next(iter(train_dataloader))
print(sample['token_id'].shape)

torch.Size([64, 59])


### Sample datapoint information

In [14]:
sample = next(iter(train_dataloader))

print('Sample from train dataloader: ')
print('USER ID: ', sample['user_id'])
print('TOKEN ID: ', sample['token_id'])
print('TOKEN ID shape should be BATCH by LENGTH: ', sample['token_id'].shape)

Sample from train dataloader: 
USER ID:  tensor([5, 0, 7, 6, 0, 2, 2, 6, 3, 2, 2, 0, 7, 0, 1, 2, 4, 7, 3, 2, 4, 7, 3, 7,
        4, 2, 3, 6, 3, 2, 5, 6, 2, 0, 1, 3, 6, 0, 2, 4, 2, 6, 2, 1, 0, 6, 3, 5,
        3, 6, 2, 2, 6, 6, 4, 2, 0, 4, 2, 4, 2, 0, 5, 7])
TOKEN ID:  tensor([[12949,  5721, 11659,  ..., 13369, 13369, 13369],
        [ 6191,  8326,  8241,  ..., 13369, 13369, 13369],
        [ 6679,  8012, 10740,  ..., 13369, 13369, 13369],
        ...,
        [11908,  5109,  1935,  ..., 13369, 13369, 13369],
        [ 6983,  1382, 11564,  ..., 13369, 13369, 13369],
        [ 5300,  1248, 12017,  ..., 13369, 13369, 13369]])
TOKEN ID shape should be BATCH by LENGTH:  torch.Size([64, 59])


### Define model based on LSTM
- Note that below code for model only supports `batch_size = 1`.
- Try to find out a way to use mini-batch.

```diff
- You must make your class name as "Model", as below.
- You must make your model work with the input of sample['token_id']
```

#### Other models

In [15]:
### OG MODEL
## funziona solo con batch_size = 1
'''
class Model(nn.Module):
    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):
        super(Model, self).__init__()
        self.num_token = num_token
        self.num_user = num_user
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(num_token, embed_dim)
        self.rnn = nn.LSTM(embed_dim, rnn_dim, num_layers=num_layers, batch_first=True)
        self.out_linear = nn.Linear(rnn_dim, num_user)
        
    def forward(self, token_id):
        embed = self.embedding(token_id)
        out, _ = self.rnn(embed)
        return self.out_linear(out[:, -1])
        '''
        

'\nclass Model(nn.Module):\n    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):\n        super(Model, self).__init__()\n        self.num_token = num_token\n        self.num_user = num_user\n        self.embed_dim = embed_dim\n        self.rnn_dim = rnn_dim\n        self.num_layers = num_layers\n        \n        self.embedding = nn.Embedding(num_token, embed_dim)\n        self.rnn = nn.LSTM(embed_dim, rnn_dim, num_layers=num_layers, batch_first=True)\n        self.out_linear = nn.Linear(rnn_dim, num_user)\n        \n    def forward(self, token_id):\n        embed = self.embedding(token_id)\n        out, _ = self.rnn(embed)\n        return self.out_linear(out[:, -1])\n        '

#### YOUR MODEL

In [16]:
class Model(nn.Module):
    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):
        super(Model, self).__init__()
        self.num_token = num_token +1    # added 1 unit to num_token because we added <EOS>
        self.num_user = num_user
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(self.num_token, self.embed_dim)
        self.rnn = nn.GRU( input_size = self.embed_dim,
                            hidden_size = self.rnn_dim,
                            num_layers=self.num_layers,
                            #nonlinearity = 'relu',
                            batch_first=True)

        self.out_linear = nn.Linear(self.rnn_dim, self.num_user)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, token_id):
        embed = self.embedding(token_id)
        embed = self.dropout(embed)
        out, _ = self.rnn(embed)
        return self.out_linear(out[:, -1])

### Make an instance of model and define optimizer

In [17]:
device = 'cuda' if torch.cuda.is_available() else "cpu"
print(f"device: {device}")

num_layers = 2
model = Model(num_token, num_user, embed_dim=512, rnn_dim=1024, num_layers=num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-9)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

device: cpu


In [18]:
# lr=1e-3 is TOO HIGH FOR GRU, accuracy will keep at 30.3%

In [19]:
print(model)

Model(
  (embedding): Embedding(13370, 512)
  (rnn): GRU(512, 1024, num_layers=2, batch_first=True)
  (out_linear): Linear(in_features=1024, out_features=8, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


### Number of parameter information
```diff
- The number of parameters should not exceed 20,000,000 !!
- DO NOT USE TRANSFORMER-BASED MODELS!!
- Transformer-based models will not be accepted as a submission.
```

In [20]:
num_param = sum(p.numel() for p in model.parameters())
print(f'Number of parameters: {num_param}')
#print('[NOTE] Number of parameters SHOULD NOT exceed 20,000,000 (20 million).')

assert num_param < 2*10e7
print('You can use this model')

Number of parameters: 17875976
You can use this model


### Test the model
```diff
- Test the model if it generates proper output, which shape is B by num_user
```

In [21]:
print(sample['token_id'].long().size())
#pred = model(sample['token_id'].long().to(device))
pred = model(sample['token_id'])

assert pred.shape == (batch_size,num_user)
print('Prediction shape would be BATCH X NUM_USER(OUTPUT) : ', pred.shape)

torch.Size([64, 59])
Prediction shape would be BATCH X NUM_USER(OUTPUT) :  torch.Size([64, 8])


### Run training for 100 epochs

In [None]:
criteria = nn.CrossEntropyLoss()
avg_loss = 0.0
best_valid_accu = 0.0
best_epoch = -1
best_model = None
num_epoch = 100


for epoch in tqdm(range(num_epoch)):
    # start training
    for sample in train_dataloader:
        model.train()
        optimizer.zero_grad()

        pred = model(sample['token_id'].long().to(device))

        loss = criteria(pred, sample['user_id'].long().to(device))

        loss.backward()
        optimizer.step()
        # ADDED SCHEDULER
        #scheduler.step()

        avg_loss += loss.item() / len(train_dataloader)

    # start validation
    correct_cnt = 0.0
    data_cnt = 0.0
    for sample in valid_dataloader:
        model.eval()

        with torch.no_grad():
            pred = model(sample['token_id'].long().to(device))

        pred_user_id = torch.argmax(pred, dim=-1)

        accu = pred_user_id.detach().cpu() == sample['user_id']

        correct_cnt += torch.sum(accu)
        data_cnt += sample['token_id'].shape[0]

    # calculate best valid accuracy, and save the best model. 
    curr_valid_accu = (correct_cnt / data_cnt).item()

    best_valid_accu = max(best_valid_accu, curr_valid_accu)
    if best_valid_accu == curr_valid_accu:
        best_epoch = epoch
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), f'best_baseline_GRU_{num_epoch}epochs_{num_layers}layers_{batch_size}batch.pth')
        print('[EPOCH {}] BEST VALID ACCURACY UPDATED: {}'.format(epoch+1, round(best_valid_accu,3)))

  0%|          | 0/100 [00:00<?, ?it/s]

[EPOCH 1] BEST VALID ACCURACY UPDATED: 0.486
[EPOCH 2] BEST VALID ACCURACY UPDATED: 0.649
[EPOCH 3] BEST VALID ACCURACY UPDATED: 0.666
[EPOCH 4] BEST VALID ACCURACY UPDATED: 0.733
[EPOCH 5] BEST VALID ACCURACY UPDATED: 0.747
[EPOCH 24] BEST VALID ACCURACY UPDATED: 0.761


In [None]:
print('FINISHED TRAINING : BEST MODEL AT EPOCH {} WITH ACCURACY {}'.format(best_epoch+1, best_valid_accu))