In [51]:
import numpy as np
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchvision as tv
from torch.utils.data import DataLoader

In [71]:
"""
Create dummy dataset
"""
BATCH_SIZE = 512
NUM_DATA = BATCH_SIZE * 10
NUM_CLASSES = 10
DIM = 100
LR = 1e-2

DEVICE = "cpu"
#print(bool(torch.cuda.is_available()))
if torch.cuda.is_available():
    DEVICE = "cuda:0"

dummy_data = np.random.normal(size=(NUM_DATA, DIM)).astype(np.float32)
# (NUM_CLASS - 1e-10) is to avoid the random variable is 10.
dummy_label = np.random.uniform(0, NUM_CLASSES - 1e-10, size=(NUM_DATA,)).astype(int) # 

print(dummy_data.shape, dummy_label.shape)
# z = [dummy_data, dummy_label]
# print(z[0])

(5120, 100) (5120,)


In [72]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, transformations):
        # data: which is composed of [xs, ys]
        # where xs.shape = [size of total data, dimension]
        # where ys.shape = [size of total data]

        # transformations: [transformation] data augmentation for data

        self.data = data
        self.transformations = transformations

        assert len(self.data) == len(self.transformations)

    def __getitem__(self, index):
        ### TODO
        # Return the tuple of data given the index
        # That is, (x, y)
#         if self.transformations is not None:
#             data_ = transformations(data_)
        
        return (self.data[0][index,:], self.data[1][index])
        #raise NotImplementedError 

    def __len__(self):
        ### TODO
        # Return the size of the data
        #raise NotImplementedError
        return len(self.data[1][:])

In [73]:
"""
Dataset check
"""
def check_dataset():
    dset = MyDataset([dummy_data, dummy_label], [None, None])
    num_data = 0

    for data_ in dset:
        
        input_, label_ = data_
        assert input_.shape == (DIM, )

        num_data += 1

    assert num_data == dummy_data.shape[0], "Implementation of __len__ might be wrong"

    print("pass DataSet check")

check_dataset()

pass DataSet check


In [74]:
"""
DataLoader implementation
"""
def getData_dataloader(x, y, batchSize):
    # Return a dataloader object
    x, y = torch.FloatTensor(x), torch.LongTensor(y)
    
    x_transformation = None

    y_transformation = None

    d = MyDataset([x, y], [x_transformation, y_transformation])

    ### TODO 
    # Create a dataloader object and return it
    
    return DataLoader(d, batchSize, shuffle=True)

#     raise NotImplementedError

In [75]:
"""
DataLoader check
"""

def check_dataloader():
    dloader = getData_dataloader(dummy_data, dummy_label, 32)
    num_data = 0

    try:
        for data_ in dloader:
            
            input_, label_ = data_

            num_data += input_.shape[0]

        assert num_data == dummy_data.shape[0], "Implementation of __len__ might be wrong"

        print("pass DataLoader check")

    except IndexError:
        # out of index in data loader
        print("Implementation of __len__ might be wrong")


check_dataloader()

pass DataLoader check


In [76]:
"""
Model
"""

class BaseModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(BaseModel, self).__init__()
        self.input_size = input_size

        dim = 32
        self.model = nn.Sequential(
            nn.Linear(input_size, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, num_classes),
        )

    def forward(self, x):
        return self.model(x)

model = BaseModel(DIM, NUM_CLASSES).to(DEVICE)
#model = BaseModel(DIM, NUM_CLASSES)

print(model)
print(DEVICE)

BaseModel(
  (model): Sequential(
    (0): Linear(in_features=100, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=10, bias=True)
  )
)
cpu


In [77]:
"""
Create the optimizer and objective function
"""
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

"""
Evaluation metrics
"""
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

def genMetrics(trueY, predY):
    """
    Return acc and auc
    """
    accuracy = accuracy_score(trueY, predY)
    
    return round(accuracy, 4)

In [78]:
"""
Data Module
""" 

# Naive method
def getData_naive(x, y, batchSize):
    """
    Create a generator for loading data
    """

    # shuffle the data
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]

    numBatches = len(x) // batchSize

    x, y = torch.FloatTensor(x), torch.LongTensor(y)

    for i in range(numBatches):
        start = i * batchSize
        end = start + batchSize
        batchX = x[start: end]
        batchY = y[start: end]

        yield batchX, batchY

###########################################################################################
#### Start training
###########################################################################################

# You can use getData_naive to see the output of data
getData = getData_dataloader

# Decorator for computing time
def print_time(func):
    def decorated_func(*args, **kwargs):
        s = time.time()
        ret = func(*args, **kwargs)
        e = time.time()

        print(f"spend {e - s:.3f} s")
        return ret

    return decorated_func

@print_time
def train(train_data, epoch, model, loss_function, optimizer):
    print("\n" + "="*30 + "TRAIN" + "="*30)
    y_p = []
    y_t = []
    model.train()
    avg_loss = 0.0
    count = 0
    # need a batch dataloader
    for batch in getData(train_data[0], train_data[1], BATCH_SIZE):
        input_data = batch[0].to(DEVICE)
        target_data = batch[1].to(DEVICE)

        y_pred = model(input_data)
        print(y_pred.shape)
        print(target_data.shape)
        loss = loss_function(y_pred, target_data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        #a better metric here
        predictions = y_pred.data.max(1)[1].cpu().tolist()
        y_p += predictions
        y_t += target_data.tolist()
        #a better metric here 
        count += 1
        if count % 100 == 0:
            print(f"epoch: {epoch}, iterations: {count}, loss: {loss.item():.3f}")

    print(f"the average loss after completion of {epoch} epochs is {avg_loss/count:.3f}")
    acc = genMetrics(y_t, y_p)
    print(f"acc: {acc:.3f}")
    print("="*60)

epochs = 4
for epoch in range(1, epochs+1):
    train([dummy_data, dummy_label], epoch, model, loss_function, optimizer)



torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
the average loss after completion of 1 epochs is 2.309
acc: 0.100
spend 0.330 s

torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
torch.Size([512, 10])
torch.Size([512])
the average loss after completion of 2 epochs is 2.278
acc: 0.152
spend 0.359 s

torch.Size([512, 10])
torch.Size([512

In [68]:
cuda = torch.cuda.is_available() 

In [69]:
print(cuda)

False
