## Download Data

In [4]:
# Paths to the training and test CSV files, relative to the working directory.
tr_path = "./data/covid.train.csv"
tt_path = "./data/covid.test.csv"

## Import Some Packages

In [None]:
import os

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Sklearn
import sklearn
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import preprocessing

# Pandas
import pandas as pd

# Random
import random

In [8]:
def set_seed(seed):
    """Seed every common source of randomness so that runs are repeatable.

    Seeds NumPy, Python's built-in ``random`` module and PyTorch (the CPU
    generator, plus the CUDA generators when CUDA is available), forces
    cuDNN into deterministic mode with benchmarking disabled, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed handed to each library.
    """
    # NumPy, Python's `random`, then the PyTorch CPU generator.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        # Seed the current GPU first, then every GPU (multi-GPU setups).
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # Some cuDNN benchmark-driven optimisations are non-deterministic;
    # pin both flags so repeated runs behave the same way.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Record the seed for Python's hash randomisation setting.
    # NOTE(review): presumably only interpreters started after this point
    # pick the variable up -- confirm if hash stability matters here.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Fix the seed once, up front, so that later cells run with seeded generators.
myseed = 42069
set_seed(myseed)

# Report how many CUDA devices PyTorch can see.
device_count = torch.cuda.device_count()
print(f"Number of CUDA devices: {device_count}")

Number of CUDA devices: 0


## Some Utilities

In [None]:
def get_device():
    ''' Return the device name to use: 'cuda' when a GPU is available, else 'cpu' '''
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss)

    Args:
        loss_record: mapping with a 'train' and a 'dev' sequence of loss
            values. The train sequence is drawn at x = 0, 1, 2, ...; the
            dev sequence is spread over the same axis.
        title: text appended to 'Learning curve of ' in the figure title.
    '''
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']
    total_steps = len(train_loss)
    n_dev = len(dev_loss)

    x_1 = range(total_steps)
    # Emit exactly one x position per dev record. Slicing x_1 with the
    # floor-divided stride produced too many positions whenever the train
    # length was not a multiple of the dev length, and failed outright on
    # an empty dev record (division by zero) or on more dev than train
    # records (slice step of 0). When the lengths divide evenly the
    # positions are the same as before.
    stride = max(total_steps // n_dev, 1) if n_dev else 1
    x_2 = [i * stride for i in range(n_dev)]

    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    plt.plot(x_2, dev_loss, c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Scatter-plot predictions against ground truth

    When either `preds` or `targets` is missing, both are recomputed by
    running `model` in eval mode over every (x, y) batch of `dv_set` on
    `device`. The plot is square, spans [-0.2, lim] on both axes and
    includes the diagonal as a reference line.
    '''
    must_run_model = preds is None or targets is None
    if must_run_model:
        model.eval()
        pred_batches = []
        target_batches = []
        for features, labels in dv_set:
            features = features.to(device)
            labels = labels.to(device)
            # Inference only: no gradients are needed.
            with torch.no_grad():
                batch_pred = model(features)
                pred_batches.append(batch_pred.detach().cpu())
                target_batches.append(labels.detach().cpu())
        preds = torch.cat(pred_batches, dim=0).numpy()
        targets = torch.cat(target_batches, dim=0).numpy()

    lower = -0.2
    diagonal = [lower, lim]

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Points on this line are perfect predictions.
    plt.plot(diagonal, diagonal, c='b')
    plt.xlim(lower, lim)
    plt.ylim(lower, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()

## Preprocess

We have three kinds of dataset:
- `train`: for training
- `dev`: for validation
- `test`: for testing (w/o target value)

## Dataset

The `COVID19Dataset` below does:

- read `.csv` files
- extract features
- split `covid.train.csv` into train/dev sets
- normalize features

Finishing the `TODO` below might help you pass the medium baseline.

In [12]:
class COVID19Dataset(Dataset):
    """Dataset for loading and preprocessing the COVID19 dataset.

    Reads a CSV file, drops its header row and first column, selects the
    feature columns, splits the training file into train/dev subsets and
    z-score normalizes every selected column from index 40 onwards.

    Args:
        path: Path to the CSV file to read.
        mode: ``'train'``, ``'dev'`` or ``'test'``. Train and dev are both
            taken from the same file (every 10th row goes to dev); test
            files carry no target column.
        target_only: When False, use the first 93 columns as features.
            When True, use only the 40 state columns plus the two
            tested_positive columns at indices 57 and 75.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    _MODES = ('train', 'dev', 'test')

    def __init__(self, path: str, mode='train', target_only=False):
        # Fail early with a clear message; an unknown mode used to surface
        # later as a NameError on the never-assigned `indices`.
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode))
        self.mode = mode

        # Read data into a numpy array (skip the header row and first column)
        with open(path, 'r') as fp:
            rows = list(csv.reader(fp))
        data = np.array(rows[1:])[:, 1:].astype(float)

        if not target_only:
            feats = list(range(93))
        else:
            # 40 states & 2 tested_positive features (indices = 57 & 75).
            # Previously left unassigned, which made target_only=True crash.
            feats = list(range(40)) + [57, 75]

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2(18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2(18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # Splitting training data into train & dev sets:
            # every 10th row is held out for dev, the rest is train.
            if mode == 'train':
                indices = [i for i in range(len(data)) if i % 10 != 0]
            else:
                indices = [i for i in range(len(data)) if i % 10 == 0]

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen)
        # NOTE(review): mean/std are computed from this split alone, so the
        # train, dev and test sets are each scaled with their own statistics.
        self.data[:, 40:] = \
            (self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
            / self.data[:, 40:].std(dim=0, keepdim=True)
        self.dim = self.data.shape[1]
        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training / validation: (features, target)
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        # Return the size of the dataset
        return len(self.data)

## DataLoader

A `DataLoader` loads data from a given Dataset into batches.

In [None]:
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Build a COVID19Dataset from `path` and wrap it in a DataLoader.

    Batches are shuffled only when `mode` is 'train'; the last, possibly
    smaller, batch is kept. `n_jobs` is passed on as the number of loader
    workers.
    '''
    covid_set = COVID19Dataset(path, mode=mode, target_only=target_only)
    is_training = mode == 'train'
    return DataLoader(
        covid_set,
        batch_size,
        shuffle=is_training,
        drop_last=False,
        num_workers=n_jobs,
        pin_memory=True,
    )

## Deep Neural Network

`NeuralNet` is an `nn.Module` designed for regression.
The DNN consists of 2 fully-connected layers with ReLU activation.
This module also includes a method `cal_loss` for calculating the loss.

In [None]:
class NeuralNet(nn.Module):
    """Fully-connected regression network with one hidden ReLU layer."""

    def __init__(self, input_dim):
        super().__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        hidden_dim = 64
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

        # Training objective: mean squared error, averaged over the batch.
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        """Map a (batch_size x input_dim) input to one prediction per sample."""
        raw_output = self.net(x)
        # Drop the trailing size-1 dimension: (batch_size, 1) -> (batch_size,)
        return raw_output.squeeze(1)

    def cal_loss(self, pred, target):
        """Calculate the loss between predictions and targets."""
        # TODO: you may implement L1/L2 regularization here
        loss = self.criterion(pred, target)
        return loss

## Train/Dev/Test

### Training

### Validation

### Testing