In [2]:

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/6/7 15:17
# @Author  : Wang Yujia
# @File    : mlp.ipynb
# @Description : 搭一个基本的mlp，用于测试思路
# @TODO: 画图细化一下training的流程：怎么用target data/ NN的规模/ batch size等


# 0. what for
1. 搭一个基本的mlp

# 1. Preparations
## 1.1 global settings

In [3]:

# Number of Gaussian kernels in the mixture
N_gaussians = 3

# Dataset split: mini-batch size, then train / validation / test fractions
batch_size = 5
train_pct, vali_pct, test_pct = 0.7, 0.2, 0.1

# Training and optimisation settings
learning_rate = 0.01
epoch = 5
# Step counters, both starting from zero
total_train_step = total_test_step = 0


import pandas as pd
import numpy as np
import random
import torch.utils.data
from mydataset import *
import torch.nn as nn
from torch.utils.data import DataLoader, SubsetRandomSampler
# from tensorboardX import SummaryWriter

## 1.2 the data path


In [4]:
# Location of the training data
train_path = "../data/train"
# Location of the target data
target_path = "../data/targets"
# CSV file holding the data keys
data_key_path = "../data/target_datakey.csv"

# 2. Dataloader and Split
1. DataLoader中的shuffle=True表示在每一次epoch中都打乱所有数据的顺序，然后以batch为单位从头到尾按顺序取用数据。这样的结果就是不同epoch中的数据都是乱序的；设置随机种子的作用就是让每一次训练的打乱顺序都相同。

## 2.1 Dataset and split it


- 设置随机数种子

In [5]:
def setup_seed(seed):
    """Seed the torch (CPU and CUDA), NumPy and Python ``random`` generators.

    Args:
        seed: integer seed handed unchanged to every generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # cuDNN determinism is intentionally left disabled:
    # torch.backends.cudnn.deterministic = True


setup_seed(7)

- 读取data

In [6]:
# Build the dataset from the training-data, target-data and data-key paths set above.
dataset = myDataset(train_path, target_path, data_key_path)

- 产生index的乱序排列

In [54]:
# Shuffle every sample index once; the three subsets below are disjoint,
# consecutive slices of this permutation.
n_samples = len(dataset)
shuffled_indices = np.random.permutation(n_samples)

# Cut points of the train / validation / test slices.
# The products are rounded to 9 decimals before truncation to absorb binary
# floating-point noise: 0.7 + 0.2 evaluates to 0.8999999999999999, so a plain
# int((train_pct + vali_pct) * 10) yields 8 instead of 9 and silently moves
# one validation sample into the test split.
train_end = int(round(train_pct * n_samples, 9))
val_end = int(round((train_pct + vali_pct) * n_samples, 9))

train_idx = shuffled_indices[:train_end]
val_idx = shuffled_indices[train_end:val_end]
# The test split takes whatever remains after the train and validation slices.
test_idx = shuffled_indices[val_end:]

- 根据这个乱序排列抽取dataset

## 2.2 Dataloader and collate it
1. 主要是对label数据进行collate
    - 按照batch中的最大target data长度进行padding，padding with 0

In [52]:
def my_collate_fn(data):
    """Collate ``(sample, target)`` pairs into one zero-padded mini-batch.

    ``data`` is the list a ``DataLoader`` hands to its ``collate_fn``: one
    ``(sample, target)`` tuple per dataset item. Targets may differ in length
    along dim 0, so every target is padded with zeros at its end up to the
    longest target of the batch. Samples are stacked unchanged.

    Args:
        data: list of ``(sample, target)`` tensor tuples. All samples must
            share one shape; targets must agree on every dimension except
            dim 0.

    Returns:
        tuple ``(data_tensor, target_tensor)``. Items are ordered by ascending
        target length. Both tensors live on the GPU when CUDA is available and
        on the CPU otherwise.
    """
    # Sort a copy (ascending target length) so the caller's list is not
    # reordered in place.
    batch = sorted(data, key=lambda item: len(item[1]))
    max_len = len(batch[-1][1])  # longest target in this batch

    samples = []
    padded_targets = []
    for item in batch:
        sample, target = item[0].detach(), item[1].detach()
        # new_zeros inherits the target's dtype and device and follows its
        # trailing shape, so the padding always concatenates cleanly -- even
        # when no padding is needed (a (0, ...) block).
        padding = target.new_zeros((max_len - len(target), *target.shape[1:]))
        samples.append(sample)  # training data is kept as is
        padded_targets.append(torch.cat([target, padding], dim=0))

    # Fall back to the CPU instead of crashing when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_tensor = torch.stack(samples).to(device)
    target_tensor = torch.stack(padded_targets).to(device)
    return data_tensor, target_tensor


In [55]:
# One loader per split. Each loader draws from the same dataset but is limited
# to its own index subset by a SubsetRandomSampler; shuffle stays False because
# the sampler already decides the order. Batches are built by my_collate_fn.
train_loader, val_loader, test_loader = (
    DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        drop_last=False,
        sampler=SubsetRandomSampler(split_indices),
        collate_fn=my_collate_fn,
    )
    for split_indices in (train_idx, val_idx, test_idx)
)

# 3. The Net

In [None]:
class Mlp(nn.Module):
    """MLP backbone followed by a mixture-density (MDN) head.

    The backbone maps 5 input features to a 9-dimensional hidden code. Three
    heads turn that code into the mixture weights (pi), means (mu) and standard
    deviations (sigma) of ``n_gaussians`` Gaussians.

    Args:
        n_gaussians: number of Gaussian components produced by each head.
    """

    def __init__(self, n_gaussians) -> None:
        super().__init__()
        self.mlp_call = nn.Sequential(
            # The input has only 5 features.
            nn.Linear(5, 25),
            nn.ReLU(inplace=True),
            nn.Linear(25, 225),
            nn.ReLU(inplace=True),
            nn.Linear(225, 9),
            nn.Tanh()
        )
        # pi, mu, sigma heads of the MDN
        self.z_pi = nn.Sequential(
            nn.Linear(9, n_gaussians),
            nn.Softmax(dim=1),
        )
        self.z_mu = nn.Linear(9, n_gaussians)
        self.z_sigma = nn.Linear(9, n_gaussians)

    def forward(self, x):
        """Return the mixture parameters averaged over dim 0 of the input.

        Args:
            x: input tensor whose last dimension holds the 5 features; size-1
                dimensions are squeezed away first. Dim 0 is presumably the
                batch dimension -- TODO confirm against the dataset.

        Returns:
            tuple ``(pi, mu, sigma)``, each of shape ``(n_gaussians,)``.
        """
        x = torch.squeeze(x)
        # squeeze() also drops a batch dimension of size 1 (e.g. the last,
        # incomplete batch). Restore it, otherwise Softmax(dim=1) fails on the
        # resulting 1-D activations.
        if x.dim() == 1:
            x = x.unsqueeze(0)
        hidden = self.mlp_call(x)

        # Parameters of the n_gaussians Gaussians, averaged over dim 0.
        pi = torch.mean(self.z_pi(hidden), dim=0)
        mu = torch.mean(self.z_mu(hidden), dim=0)

        # exp() keeps sigma non-negative; the assertion is a sanity check that
        # no element is below zero.
        sigma_per_sample = torch.exp(self.z_sigma(hidden))
        torch._assert(not bool((sigma_per_sample < 0).any()), "Sigma is less than zero!")
        sigma = torch.mean(sigma_per_sample, dim=0)
        return pi, mu, sigma