# Corner_Timing_Prediction

## Download data


In [55]:
# Download the dataset file from Google Drive (by file id) and store it as cell_corner.data.csv.
!gdown --id '1L8Yo7n6z1QCysCq5WczCZjmGmF9DdChO' --output cell_corner.data.csv  

Downloading...
From: https://drive.google.com/uc?id=1L8Yo7n6z1QCysCq5WczCZjmGmF9DdChO
To: /content/cell_corner.data.csv
100% 471k/471k [00:00<00:00, 152MB/s]


## Import packages

In [56]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# For plotting truth v.s. prediction graphs
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

## Some Utility Functions

You do not need to modify this part.

In [57]:
def same_seed(seed):
    '''Seed NumPy and PyTorch and switch cuDNN to deterministic behaviour.'''
    # Seed every random number generator this notebook draws from.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Covers all visible GPUs, not just the current one.
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def data_split(data_set, valid_ratio, test_ratio, corners, seed):
    '''Split the data corner-by-corner into training, validation and test sets.

    The rows of `data_set` are treated as `corners` contiguous groups of equal
    size. Whole groups are shuffled and then assigned to exactly one split, so
    the rows of a given corner never end up in two different splits.

    Args:
        data_set: 2-D array of shape (n_rows, n_columns). `n_rows` must be a
            multiple of `corners`. The array is not modified.
        valid_ratio: Fraction of the corners used for validation
            (`int(corners * valid_ratio)` corners).
        test_ratio: Fraction of the corners used for testing; the test split
            ends at corner `int(corners * (valid_ratio + test_ratio))`.
        corners: Number of corner groups in `data_set`.
        seed: Seed of the private random generator that shuffles the corners,
            making the split reproducible regardless of the global RNG state.

    Returns:
        Tuple `(train_set, valid_set, test_set)` of 2-D numpy arrays.

    Raises:
        ValueError: If `corners` is not positive or does not evenly divide the
            number of rows.
    '''
    data_set = np.asarray(data_set)
    shape = data_set.shape
    if corners <= 0 or shape[0] % corners != 0:
        raise ValueError(
            f'cannot split {shape[0]} rows into {corners} equally sized corners')
    data_per_corner = shape[0] // corners
    print(shape)
    data_set = np.reshape(data_set, (corners, data_per_corner, shape[1]))
    print(data_set.shape)

    # Shuffle whole corners with a private, seeded generator. Indexing with the
    # permutation creates a copy, so the caller's array is left untouched
    # (an in-place shuffle of the reshaped view would reorder it).
    rng = np.random.RandomState(seed)
    data_set = data_set[rng.permutation(corners)]

    n_valid = int(corners * valid_ratio)
    n_held_out = int(corners * (valid_ratio + test_ratio))
    valid_set = data_set[:n_valid]
    test_set = data_set[n_valid:n_held_out]
    train_set = data_set[n_held_out:]
    print(valid_set.shape, test_set.shape, train_set.shape)

    # Flatten (corners, rows_per_corner, columns) back to (rows, columns).
    valid_set = np.reshape(valid_set, (valid_set.shape[0]*valid_set.shape[1], valid_set.shape[2]))
    test_set = np.reshape(test_set, (test_set.shape[0]*test_set.shape[1], test_set.shape[2]))
    train_set = np.reshape(train_set, (train_set.shape[0]*train_set.shape[1], train_set.shape[2]))
    print(valid_set.shape, test_set.shape, train_set.shape)

    return np.array(train_set), np.array(valid_set), np.array(test_set)

## Dataset

In [58]:
class My_Dataset(Dataset):
    '''
    Dataset of per-feature standardized inputs with optional targets.

    x: Features, 2-D array-like of shape (n_samples, n_features).
    y: Targets, if none, do prediction.
    mean: Optional per-feature means used for standardization. When omitted
          they are computed from `x`.
    std: Optional per-feature standard deviations used for standardization.
         When omitted they are computed from `x`.

    The statistics actually used are kept in `self.mean` / `self.std` so that
    another split can be standardized with the same values, e.g.
    `My_Dataset(x_valid, y_valid, mean=train_set.mean, std=train_set.std)`.
    '''
    def __init__(self, x, y=None, mean=None, std=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)

        x = torch.FloatTensor(x)
        if mean is None:
            mean = x.mean(0, keepdim=True)
        if std is None:
            std = x.std(0, keepdim=True)
        mean = torch.as_tensor(mean, dtype=torch.float32)
        std = torch.as_tensor(std, dtype=torch.float32)

        # A constant column has std 0 and a single-row input has std NaN; both
        # would turn the whole column into NaN/inf. Use 1 instead so such
        # columns are only mean-centred (NaN > 0 is False, so NaN is replaced too).
        std = torch.where(std > 0, std, torch.ones_like(std))

        self.mean, self.std = mean, std
        self.x = (x - mean) / std

    def __getitem__(self, idx):
        '''Return the features at `idx`, together with the target when targets exist.'''
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        '''Number of samples.'''
        return len(self.x)

## Neural Network Model
Try out different model architectures by modifying the class below.

In [59]:
class My_Model(nn.Module):
    '''Fully connected regressor: input_dim -> 256 -> 64 -> 16 -> 8 -> 4 -> 1, ReLU between layers.'''
    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 256, 64, 16, 8, 4, 1]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # The output layer is not followed by an activation.
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        '''Map a (B, input_dim) batch to a (B,) vector of predictions.'''
        return self.layers(x).squeeze(1)  # (B, 1) -> (B)

## Feature Selection
Choose features you deem useful by modifying the function below.

In [60]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Separate features from targets and keep only the chosen feature columns.

    The last column of every matrix is the target; all preceding columns are
    candidate features. Returns
    (x_train, x_valid, x_test, y_train, y_valid, y_test).
    '''
    splits = (train_data, valid_data, test_data)
    targets = [split[:, -1] for split in splits]
    features = [split[:, :-1] for split in splits]

    # Either every feature column or the hand-picked subset.
    if select_all:
        feat_idx = list(range(features[0].shape[1]))
    else:
        feat_idx = [0,1,2,3,4] # Select suitable feature columns.

    picked = [feat[:, feat_idx] for feat in features]
    return (*picked, *targets)

## Training Loop

In [61]:
def trainer(train_loader, valid_loader, model, config, device):
    '''Train `model` with MSE loss and AdamW, checkpointing the best validation epoch.

    Args:
        train_loader: Iterable of (x, y) batches used for the parameter updates.
        valid_loader: Iterable of (x, y) batches evaluated after every epoch.
        model: Network to optimise; its parameters are updated in place.
        config: Dict providing 'learning_rate', 'n_epochs', 'early_stop' and
            'save_path'.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. Whenever the mean validation loss improves, the model's state
        dict is written to config['save_path']. Training stops early once the
        loss has not improved for config['early_stop'] consecutive epochs.
    '''

    criterion = nn.MSELoss(reduction='mean') 
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate']) 

    # Create the directory the checkpoint is actually written to, instead of a
    # hard-coded './models' that may not match config['save_path'].
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    writer = SummaryWriter() # Writer of tensorboard.
    try:
        for epoch in range(n_epochs):
            model.train() # Set your model to train mode.
            loss_record = []

            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()               # Set gradient to zero.
                x, y = x.to(device), y.to(device)   # Move your data to device. 
                pred = model(x)             
                loss = criterion(pred, y)
                loss.backward()                     # Compute gradient(backpropagation).
                optimizer.step()                    # Update parameters.
                step += 1
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record)/len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval() # Set your model to evaluation mode.
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record)/len(loss_record)
            # Losses are printed scaled by 100 for readability only.
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {(mean_train_loss*100):.4f}, Valid loss: {(mean_valid_loss*100):.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path']) # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss*100))
                early_stop_count = 0
            else: 
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the event file on every exit path (normal end,
        # early stop, or an exception / keyboard interrupt).
        writer.close()

## Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [62]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hyper-parameters and paths read by the data-loading and training cells.
config = {
    'seed': 96524,      # Seed handed to same_seed() and data_split(); pick your lucky number. :)
    'select_all': True,   # Whether to use all features.
    'valid_ratio': 0.2,   # Fraction of the corners held out for validation: int(corners * valid_ratio) corners.
    'test_ratio': 0.05,   # Fraction of the corners held out for testing; test corners end at int(corners * (valid_ratio + test_ratio)).
    'n_epochs': 3000,     # Maximum number of epochs.
    'batch_size': 64,     # Batch size used by the train, validation and test DataLoaders.
    "corners": 27,        # Number of equally sized corner groups data_split() divides the rows into.
    'learning_rate': 0.0005,  # Learning rate passed to the AdamW optimizer.
    'early_stop': 400,    # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt'  # Your model will be saved here.
}

## Dataloader
Read data from files and set up training, validation, and testing sets. You do not need to modify this part.

In [63]:
# Fix all random seeds first so the split and the training run are repeatable.
same_seed(config['seed'])

# Load the raw table and split it corner-wise into train / validation / test.
my_data = pd.read_csv('./cell_corner.data.csv').values
train_data, valid_data, test_data = data_split(
    my_data,
    config['valid_ratio'],
    config['test_ratio'],
    config['corners'],
    config['seed'],
)

# Report the size of every split.
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} 
test_data size: {test_data.shape}""")

# Separate the feature columns from the target column.
x_train, x_valid, x_test, y_train, y_valid, y_test = select_feat(
    train_data, valid_data, test_data, config['select_all']
)

# Report how many features the model will receive.
print(f'number of features: {x_train.shape[1]}')

# Wrap each split in a dataset.
train_dataset = My_Dataset(x_train, y_train)
valid_dataset = My_Dataset(x_valid, y_valid)
test_dataset = My_Dataset(x_test, y_test)

# Batch the datasets; only the test loader keeps the original sample order.
train_loader = DataLoader(
    train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True
)
valid_loader = DataLoader(
    valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True
)
test_loader = DataLoader(
    test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True
)

(10584, 12)
(27, 392, 12)
(5, 392, 12) (1, 392, 12) (21, 392, 12)
(1960, 12) (392, 12) (8232, 12)
train_data size: (8232, 12) 
valid_data size: (1960, 12) 
test_data size: (392, 12)
number of features: 11


## Start training!

In [None]:
# Build the network for the selected number of features, then move it to the
# same computation device the batches will be sent to.
model = My_Model(input_dim=x_train.shape[1])
model = model.to(device)

# Run the training loop; the best checkpoint is written to config['save_path'].
trainer(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    config=config,
    device=device,
)

Epoch [1/3000]: 100%|██████████| 129/129 [00:01<00:00, 99.49it/s, loss=0.00359]


Epoch [1/3000]: Train loss: 1.0684, Valid loss: 0.2042
Saving model with loss 0.204...


Epoch [2/3000]: 100%|██████████| 129/129 [00:01<00:00, 98.37it/s, loss=0.000726]


Epoch [2/3000]: Train loss: 0.1720, Valid loss: 0.1193
Saving model with loss 0.119...


Epoch [3/3000]: 100%|██████████| 129/129 [00:01<00:00, 100.25it/s, loss=0.000442]


Epoch [3/3000]: Train loss: 0.0764, Valid loss: 0.1267


Epoch [4/3000]: 100%|██████████| 129/129 [00:01<00:00, 101.34it/s, loss=0.000172]


Epoch [4/3000]: Train loss: 0.0273, Valid loss: 0.1385


Epoch [5/3000]: 100%|██████████| 129/129 [00:01<00:00, 99.42it/s, loss=0.000134]


Epoch [5/3000]: Train loss: 0.0172, Valid loss: 0.1087
Saving model with loss 0.109...


Epoch [6/3000]: 100%|██████████| 129/129 [00:01<00:00, 98.67it/s, loss=0.00013]


Epoch [6/3000]: Train loss: 0.0127, Valid loss: 0.1377


Epoch [7/3000]: 100%|██████████| 129/129 [00:01<00:00, 99.73it/s, loss=5.25e-5]


Epoch [7/3000]: Train loss: 0.0108, Valid loss: 0.0982
Saving model with loss 0.098...


Epoch [8/3000]: 100%|██████████| 129/129 [00:01<00:00, 90.65it/s, loss=4.68e-5]


Epoch [8/3000]: Train loss: 0.0084, Valid loss: 0.1352


Epoch [9/3000]: 100%|██████████| 129/129 [00:01<00:00, 101.51it/s, loss=4.74e-5]


Epoch [9/3000]: Train loss: 0.0073, Valid loss: 0.0897
Saving model with loss 0.090...


Epoch [10/3000]:  17%|█▋        | 22/129 [00:00<00:01, 93.08it/s, loss=3.99e-5]

## Plot learning curves with `tensorboard` (optional)

`tensorboard` is a tool that allows you to visualize your training progress.

If this block does not display your learning curve, please wait a few minutes and re-run it. It may take some time for your logging information to load.

In [None]:
# (Re)load the TensorBoard notebook extension, then open the dashboard on the ./runs/ log directory.
%reload_ext tensorboard
%tensorboard --logdir=./runs/

## Functions for Testing and Plotting

In [None]:
def plot_pred(dv_set, model, device, xlabel, ylabel, lim=1., preds=None, targets=None):
    ''' Scatter-plot predictions against ground truth and return the predictions.

    If `preds` or `targets` is missing, both are recomputed by running `model`
    over every batch of `dv_set`, and the mean relative error is printed.
    The plot spans [-0.2, lim] on both axes with the identity line in blue.
    '''
    needs_inference = preds is None or targets is None
    if needs_inference:
        model.eval()
        pred_batches, target_batches = [], []
        with torch.no_grad():
            for features, labels in dv_set:
                features, labels = features.to(device), labels.to(device)
                pred_batches.append(model(features).detach().cpu())
                target_batches.append(labels.detach().cpu())
        preds = torch.cat(pred_batches, dim=0).numpy()
        targets = torch.cat(target_batches, dim=0).numpy()

        # Accumulate |pred - truth| / |truth| sample by sample.
        total_rel_err = sum(abs((p - t) / t) for p, t in zip(preds, targets))
        print('error:', total_rel_err/len(preds))
        print('(error = mean of |(pred_val - true_val)/true_val|)')

    lower = -0.2
    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Identity line: points on it are perfect predictions.
    plt.plot([lower, lim], [lower, lim], c='b')
    plt.xlim(lower, lim)
    plt.ylim(lower, lim)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(xlabel+' v.s. '+ylabel)
    plt.show()
    return preds

def save_pred(preds, file, targets=None):
    ''' Save predictions next to their ground-truth targets in a CSV file.

    Args:
        preds: Sequence of predicted values.
        file: Path of the CSV file to write (overwritten if it exists).
        targets: Ground-truth values aligned with `preds`. Defaults to the
            notebook-level `y_test`, which keeps `save_pred(preds, file)` working.

    Raises:
        ValueError: If `preds` and `targets` differ in length.

    The file has one header row `id,prediction,target` followed by one row
    per sample.
    '''
    if targets is None:
        targets = y_test  # Notebook global; the previous implementation always used it.
    if len(preds) != len(targets):
        raise ValueError(
            f'got {len(preds)} predictions but {len(targets)} targets')

    # newline='' is what the csv module expects; without it extra blank lines
    # can appear between rows on platforms that translate line endings.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        # Header now matches the three columns written for every row.
        writer.writerow(['id', 'prediction', 'target'])
        for i, (p, t) in enumerate(zip(preds, targets)):
            writer.writerow([i, p, t])


## Compare Dataset and Prediction
The model's predictions on the test set are saved to `pred.csv`.


In [None]:
# Validation split: plot predictions against the true values.
plot_pred(
    dv_set=valid_loader,
    model=model,
    device=device,
    xlabel='valid data truth value',
    ylabel='predicted value',
)
print()

# Test split: plot, keep the predictions and write them to disk.
preds = plot_pred(
    dv_set=test_loader,
    model=model,
    device=device,
    xlabel='test data truth value',
    ylabel='predicted value',
)
save_pred(preds=preds, file='pred.csv')

## Saving Model
During training, the best model is saved to the path given by `config['save_path']`. The cell below rebuilds the network and reloads those saved weights.

In [None]:
# Rebuild the architecture and restore the weights stored at config['save_path'].
model = My_Model(input_dim=x_train.shape[1]).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(config['save_path'], map_location=device))

## Reference
This notebook is adapted from code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)