In [None]:
import functools
import random
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision as tv
from torch import optim

In [None]:
"""
Create dummy dataset
"""
# Hyper-parameters and sizes of the synthetic dataset.
BATCH_SIZE = 512
NUM_DATA = BATCH_SIZE * 10
NUM_CLASSES = 10
DIM = 100
LR = 1e-2
VOCAB_SIZE = 100     # index 0 for PAD
MIN_L = 10
MAX_L = 50

# Pick the GPU only when CUDA is really usable. `torch.cuda.is_available` must be
# *called*: the bare function object is always truthy, which would select
# "cuda:0" even on CPU-only machines.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"

# dummy sequence data generation
# data format = [
#  [5, 21, 5, 9, 478, 38, 1, 2],
#  [9, 5, 3, 1, 4, 6, 7, 24, 44, 12, 2, 1, 3],
#  [5, 1, 6, 4],
#  ...
# ]
# Token indices are drawn from [1, VOCAB_SIZE - 1] so that index 0 stays reserved
# for PAD; every sequence has a random length in [MIN_L, MAX_L].
dummy_data = [
    [random.randint(1, VOCAB_SIZE - 1) for _ in range(random.randint(MIN_L, MAX_L))]
    for _ in range(NUM_DATA)
]

# Integer class labels drawn uniformly from {0, ..., NUM_CLASSES - 1}
# (the upper bound of np.random.randint is exclusive).
dummy_label = np.random.randint(0, NUM_CLASSES, size=(NUM_DATA,))

print("max length: ", max(len(d) for d in dummy_data))
print("min length: ", min(len(d) for d in dummy_data))
print("max vocab index", max(max(d) for d in dummy_data))

print(len(dummy_data))
print(dummy_label.shape)


max length:  50
min length:  10
max vocab index 99
5120
(5120,)


In [None]:
"""
Model
"""

class BaseModel(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, mean-pool over the sequence, apply an MLP.

    Args:
        input_size: number of rows in the embedding table (the vocabulary size).
        num_classes: number of output logits.
        dim: embedding and hidden width (defaults to 32, the value originally hard-coded).
    """

    def __init__(self, input_size, num_classes, dim=32):
        super().__init__()
        self.input_size = input_size

        self.embedding = nn.Embedding(input_size, dim)
        self.model = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, num_classes),
        )

    def forward(self, x, mask=None):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: integer tensor of token indices; axis 1 is the one pooled over.
            mask: optional tensor with the same shape as ``x`` holding 1 for real
                tokens and 0 for padding. When given, padded positions are
                excluded from the average so the result does not depend on how
                much padding the batch needed. When omitted, a plain mean over
                axis 1 is used (the original behavior).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        embedded = self.embedding(x)

        if mask is None:
            pooled = embedded.mean(1)
        else:
            # Broadcast the mask over the embedding axis, then average only
            # over the positions that hold real tokens.
            weights = mask.to(dtype=embedded.dtype, device=embedded.device).unsqueeze(-1)
            # clamp avoids 0/0 for a row that is entirely padding.
            token_counts = weights.sum(1).clamp(min=1.0)
            pooled = (embedded * weights).sum(1) / token_counts

        return self.model(pooled)

# The first argument sizes the embedding table, so it must be the vocabulary
# size. DIM was passed here before, which only worked because DIM and
# VOCAB_SIZE are both 100.
model = BaseModel(VOCAB_SIZE, NUM_CLASSES).to(DEVICE)

print(model)
print(DEVICE)

BaseModel(
  (embedding): Embedding(100, 32)
  (model): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=10, bias=True)
  )
)
cuda:0


In [None]:
"""
Create the optimizer and objective function
"""
# Objective passed to train(), where it is called on the model output and the target labels.
loss_function = nn.CrossEntropyLoss()
# Adam over all parameters of `model`, with the learning rate taken from the LR constant.
optimizer = optim.Adam(model.parameters(), lr=LR)

"""
Evaluation metrics
"""
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

def genMetrics(trueY, predY):
    """
    Compute the classification accuracy of a set of predictions.

    Args:
        trueY: ground-truth labels.
        predY: predicted labels, aligned element-wise with trueY.

    Returns:
        The value of accuracy_score(trueY, predY) rounded to 4 decimal places.
        Only accuracy is returned; no AUC is computed here.
    """
    return round(accuracy_score(trueY, predY), 4)

In [None]:
"""
Data Module
""" 

class MyDataset(torch.utils.data.Dataset):
    """Dataset over several parallel collections, each with an optional per-item transform.

    ``data`` is a list of indexable collections (e.g. ``[xs, ys]``) and
    ``transformations`` is a list of the same length whose entries are either
    a callable applied to the matching item or a falsy value (e.g. ``None``)
    meaning "return the item unchanged".
    """

    def __init__(self, data, transformations):
        self.data = data
        self.transformations = transformations

        # One transformation slot is required per collection.
        assert len(self.data) == len(self.transformations)

    def __getitem__(self, index):
        # Build the sample field by field, in the order of the collections.
        sample = []
        for column, transform in zip(self.data, self.transformations):
            if transform:
                sample.append(transform(column[index]))
            else:
                sample.append(column[index])
        return tuple(sample)

    def __len__(self):
        # The dataset size is taken from the first collection.
        first_column = self.data[0]
        return len(first_column)

def getData_dataloader(x, y, batchSize, shuffle=True, numWorkers=2):
    """Build a DataLoader that yields padded mini-batches of variable-length sequences.

    Args:
        x: indexable collection of token-index sequences (lists of ints of varying length).
        y: indexable collection of integer class labels, aligned with ``x``.
        batchSize: number of samples per mini-batch.
        shuffle: whether the loader reshuffles the samples (default True, as before).
        numWorkers: number of loader worker processes (default 2, as before;
            pass 0 to load in the calling process).

    Returns:
        A torch.utils.data.DataLoader whose batches are 4-tuples
        ``(data_, label_, lengths_, mask_)``:
            data_    -- long tensor (batch, longest sequence in the batch), right-padded with 0
            label_   -- long tensor (batch,)
            lengths_ -- long tensor (batch,) with the unpadded sequence lengths
            mask_    -- float tensor shaped like data_, 1.0 on real tokens and 0.0 on padding
    """
    # Index 0 is reserved for padding; the real tokens of the dummy data start at 1.
    pad_index = 0

    # No per-item transformation is applied to either the sequences or the labels.
    x_transformation = None
    y_transformation = None

    d = MyDataset([x, y], [x_transformation, y_transformation])

    def custom_collate_func(batch):
        # batch = [data_tuple_1, ..., data_tuple_n], each data_tuple = (unpadded sequence, label)
        sequences = [list(sample[0]) for sample in batch]
        # int() turns numpy integer scalars into plain Python ints before tensor conversion.
        labels = [int(sample[1]) for sample in batch]
        lengths = [len(sequence) for sequence in sequences]

        # Pad every sequence on the right up to the longest one in this batch.
        longest = max(lengths, default=0)
        padded = [sequence + [pad_index] * (longest - len(sequence)) for sequence in sequences]

        # Mask example: unpadded [4, 1, 3, 1] padded to length 6 -> [1, 1, 1, 1, 0, 0].
        mask = [[1.0] * length + [0.0] * (longest - length) for length in lengths]

        data_ = torch.tensor(padded, dtype=torch.long)
        label_ = torch.tensor(labels, dtype=torch.long)
        lengths_ = torch.tensor(lengths, dtype=torch.long)
        mask_ = torch.tensor(mask, dtype=torch.float)

        return data_, label_, lengths_, mask_

    return torch.utils.data.DataLoader(
        d,
        batch_size=batchSize,
        shuffle=shuffle,
        num_workers=numWorkers,
        collate_fn=custom_collate_func,
    )

###########################################################################################
#### Start training
###########################################################################################

# Loader factory used by train(); rebind this alias to change how batches are produced.
# NOTE(review): the original note suggested using `getData_naive` to inspect the data, but no
# such function is defined in the visible part of this notebook; confirm it exists first.
getData = getData_dataloader

# Decorator for computing time
def print_time(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's arguments and return value are passed through
    unchanged. After a call returns, a line of the form ``spend 0.123 s`` is
    printed. If ``func`` raises, the exception propagates and nothing is printed.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same name, docstring and signature metadata as ``func``.
    """
    # functools.wraps keeps __name__/__doc__ of the decorated function intact.
    @functools.wraps(func)
    def decorated_func(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        s = time.perf_counter()
        ret = func(*args, **kwargs)
        e = time.perf_counter()

        print(f"spend {e - s:.3f} s")
        return ret

    return decorated_func

@print_time
def train(train_data, epoch, model, loss_function, optimizer):
    """Run one training epoch and print the average loss and accuracy.

    Args:
        train_data: pair ``[inputs, labels]``; both are handed to ``getData`` to
            build the batch loader with a batch size of BATCH_SIZE.
        epoch: epoch number, used only in the printed messages.
        model: module to train; it is switched to training mode and called on
            the first element of every batch.
        loss_function: callable invoked as ``loss_function(predictions, targets)``.
        optimizer: optimizer stepped once per batch.

    Returns:
        None. Progress and metrics are printed.
    """
    print("\n" + "="*30 + "TRAIN" + "="*30)
    y_p = []
    y_t = []
    model.train()
    total_loss = 0.0
    count = 0
    # Each batch carries the inputs at index 0 and the targets at index 1.
    for batch in getData(train_data[0], train_data[1], BATCH_SIZE):
        input_data = batch[0].to(DEVICE)
        target_data = batch[1].to(DEVICE)

        y_pred = model(input_data)
        loss = loss_function(y_pred, target_data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Predicted class = index of the largest score; detach() keeps the
        # bookkeeping out of the autograd graph.
        y_p += y_pred.detach().argmax(dim=1).cpu().tolist()
        y_t += target_data.cpu().tolist()
        count += 1
        if count % 100 == 0:
            print(f"epoch: {epoch}, iterations: {count}, loss: {loss.item():.3f}")

    if count == 0:
        # Nothing was iterated: averaging would divide by zero, so stop here.
        print("no batches were produced; skipping metrics")
        print("="*60)
        return

    print(f"the average loss after completion of {epoch} epochs is {total_loss/count:.3f}")
    acc = genMetrics(y_t, y_p)
    print(f"acc: {acc:.3f}")
    print("="*60)

# Number of passes over the dummy dataset.
epochs = 4
# Epochs are numbered from 1 so the printed messages read naturally.
for epoch in range(1, epochs+1):
    train([dummy_data, dummy_label], epoch, model, loss_function, optimizer)


NameError: ignored

In [None]:
""