In [1]:
#pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

#dataPreprocess
import csv
import numpy as np
import os

#plottingTools
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Fix the seed of every random number generator in use (NumPy, PyTorch on CPU
# and, when a GPU is present, PyTorch on all CUDA devices) so runs are repeatable.
myseed = 420613
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Ask cuDNN for deterministic behaviour and turn its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Some Utilities

In [4]:
def get_device():
    '''Return the device name to use: 'cuda' if CUDA is available, otherwise 'cpu'.'''
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def polt_learning_curve(loss_record, title=''):
    '''Plot the learning curve (train and dev loss) of a DNN.

    Args:
        loss_record: mapping with the keys 'train' and 'dev', each holding a
            sequence of loss values.
        title: text inserted into the figure title.

    The dev curve is skipped when no dev loss has been recorded.
    '''
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']
    total_steps = len(train_loss)
    x_1 = range(total_steps)

    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    if len(dev_loss) > 0:
        # Assumes dev loss is recorded less often than train loss, so its points
        # are spread evenly over the training steps. The stride is clamped to 1
        # so the slice stays valid even if there are more dev than train records.
        stride = max(1, total_steps // len(dev_loss))
        # Trim both sides to a common length so x and y always match.
        x_2 = x_1[::stride][:len(dev_loss)]
        plt.plot(x_2, dev_loss[:len(x_2)], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('TrainSteps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
plot_learning_curve = polt_learning_curve
    
def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    '''Scatter-plot the predictions of a DNN against the ground-truth values.

    When either `preds` or `targets` is missing, both are recomputed by running
    `model` over every batch of `dv_set`.

    Args:
        dv_set: iterable yielding (x, y) tensor batches
            (presumably the dev DataLoader; confirm with the caller).
        model: network called as `model(x)`; switched to eval mode before use.
        device: device the batches are moved to before the forward pass.
        lim: upper bound of both axes and of the y = x reference line.
        preds: optional precomputed predictions.
        targets: optional precomputed ground-truth values.
    '''
    if preds is None or targets is None:
        model.eval()  # set model to eval mode
        preds, targets = [], []
        # Inference only: no gradients are needed for any batch.
        with torch.no_grad():
            for x, y in dv_set:
                x, y = x.to(device), y.to(device)
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Reference line y = x: a perfect prediction lies exactly on it.
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth with Prediction')
    plt.show()

# **Preprocess**
We have three kinds of datasets:
* `train`: for training
* `dev`: for validation
* `test`: for testing

# **Dataset**
The `COVID2019Dataset` class below does the following:
*  reads `csv` files
*  extracts features
*  splits the `covid.train.csv` dataset into train/dev sets
*  normalizes features

Finishing the TODOs might let you pass the baseline.

**数据第0行为列名，数据从第1行开始；第0列为ID，第1到93列为特征，第94列（即最后一列）为标签，即 `tested_positive`。**

![image.png](datasetStructure.png)

In [None]:
'''
## 数据处理经验总结
* `1.`使用list来存储特征列index，使用list来存储split的行号
* `2.`在选择行和列结束之后对self.data进行赋值时候numpy转化成tensor
* `3.`对数据的标准化处理等修改数据的行为都直接对self.data进行
* `4.`不要在data转换为self.data之前对data进行修改
'''
class COVID2019Dataset(Dataset):
    '''dataset for loading and preprocessing the Covid2019Dataset'''
    __init__(self,
             path,
             mode = 'train',
             target_only = 'False'):
        self.mode = mode
        
        #read the data into numpy arrays
        #使用with关键字使得异常处理自动进行，with经常在处理文件时使用,并且其会自动关闭文件
        with open(path,'r') as fp:
            data = list(csv.reader(fp))
            data = np.array(data[1:])[:,1:].astype(float)
        
        if not target_only:
            feats = list(range(94))
            
        else:
            feats = list(range(40)) + [57,75]
            pass
        
        if mode == 'test':
            #testing data:894 * 93
            #93 = 40 states + 18(17+1)(day1) + 18(17+1))(day2) + 17(day3) 
            data = data[:,feats]
            self.data = torch.FloatTensor(data)
        else:
            #train data:2700 * 94
            #94 = 40 + 18 + 18 + 18
            target = data[:,-1]
            data = data[:feats]
            
            #splitting data into train and dev dataset
            '''
            len(data) 用于获取二维数组的行数。
            len(data[0]) 用于获取二维数组的列数（假设每行都有相同数量的元素）,
            即获取data[0]中的元素个数而不是data[0]的个数,首先获取对应行数据的index
            这样也便于后续标准化的处理，将numpy转化为张量之后再进行标准化即可
            '''
            if mode == 'dev':
                indice = [i for i in range(len(data)) if i%10 == 0]
            else if mode == 'train':
                indice = [i for i in range(len(data)) if 1i%10 != 0]
            
            self.data = torch.FloatTensor(data[indice])
            self.target = torch.FloatTensor(target[indice])
            #Normalize features，try to remove this part to see what happen
            #官网解释dim参数：dim (int) – the dimension to reduce（要压缩的那个参数）
            
            self.data