In [7]:
%load_ext autoreload
%autoreload 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/12/7
# @Author  : Wang Yujia
# @File    : mlp.ipynb
# @Description : 无Conv层，只有MLP
# @TODO:

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. what for
1. loss部分逻辑重写，添加`no grad`
2. 添加hooks
3. 使用pad_seq函数取代自己写的pad函数
4. 效果：可以train虽然loss不下降，不会出现NaN的问题

# 1. Preparations
## 1.1 global settings

In [8]:

# Number of Gaussian components in the mixture predicted by the network
N_gaussians = 3

# Dataset split and batching
batch_size = 10
train_pct = 0.7
vali_pct = 0.2
test_pct = 0.1      # NOTE(review): not read by the visible split cell; the test set is whatever remains after train + vali

# Training and optimizer settings
learning_rate = 1e-4
lr_for_mu = 1e-2    # NOTE(review): the visible optimizer cell passes learning_rate * 100 for the mu head instead of reading this
total_train_step = 0
total_test_step = 0
EPOCH_NUM = 5       # NOTE(review): the visible training cell loops over range(10) instead of reading this value
MIN_LOSS = 1e-7     # NOTE(review): not read by any visible cell

import pandas as pd
import numpy as np
import random
import torch.utils.data
from mydataset import *
import torch.nn as nn
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch.nn.functional as F
from torchsummary import summary
from visdom import Visdom
from torchviz import make_dot
from torch.nn.utils.rnn import pad_sequence
import math

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## 1.2 the data path


In [9]:
# training data
train_path = r"../data/train"
# target data
target_path = r"../data/targets_5"
# data keys
data_key_path = "../data/target_datakey.csv"

# 2. Dataset and Dataloader
1. DataLoader中的`shuffle=True`表示在每一次epoch中都打乱所有数据的顺序，然后以batch为单位从头到尾按顺序取用数据。这样的结果就是不同epoch中的数据都是乱序的；设置随机种子的作用就是让每一次训练的打乱顺序都相同。

## 2.1 Dataset and spliting


- 设置随机数种子

In [10]:
def setup_seed(seed):
    """Seed the torch, NumPy and stdlib ``random`` generators with the same value.

    Args:
        seed: integer seed handed unchanged to each generator.

    CUDA-specific seeding and the cuDNN deterministic flag are intentionally not
    touched here.
    """
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seed_generator in (torch.manual_seed, np.random.seed, random.seed):
        seed_generator(seed)

setup_seed(7)

- 读取data

In [11]:
dataset = myDataset(train_path, target_path, data_key_path)

- 产生index的乱序排列

In [12]:
# Shuffle every dataset index once, then cut the permutation into three
# consecutive slices: train, validation, and whatever is left for test.
n_total = len(dataset)
shuffled_indices = np.random.permutation(n_total)

train_end = int(train_pct * n_total)
tmp = int((train_pct + vali_pct) * n_total)   # end of the validation slice

train_idx = shuffled_indices[:train_end]
val_idx = shuffled_indices[train_end:tmp]
test_idx = shuffled_indices[tmp:]
print(train_idx)

[ 807  305  455  939  508  594  835 1082  598 1102   46  761  841  141
  407  334  253  500  734  936  698  446  907 1087 1009  140  463  547
 1155  856   34 1156  703 1121  751  587 1100  509  473  128  788   97
  471  385 1085  525  679 1135  284 1146  186  318 1088 1113    9  457
 1047  451  569 1120 1074  326  377  809  109   98  620 1194  825  828
   83  113  556  674  568  853  351  558   54  656  804 1149  101  344
  851  544  955   40 1021  489  626  664  657  868 1169  151  179 1130
  564  713  743 1014  966  861 1132  146  610  408  662  551  172 1020
  982 1148  431  517  270  858  170  374  816  618  205   17   53 1003
  263  857  716  843  498  228  339  725  752  278  649 1017  108  642
 1195 1174   99  530  632  888  189  961  358 1078  663  757 1051  204
  409  283  562   23  619  216  474  921  950 1162  123  785  769  621
  262  586 1178  541  795   70 1189  396  171  845  168 1125  361  224
  125  706  164  231  264   42  746  872  998  132 1035  430   38  522
  880 

- 根据这个乱序排列抽取dataset

## 2.2 Dataloader and collating
1. 主要是对label数据进行collate
    - 按照batch中的最大target data长度进行padding，padding with 0
2. 返回的结果多一个batch dim,比如下面的`5`
    - After collating:
        - `torch.Size([5, 3, 300]),torch.Size([5, 87, 2])`
        - `87`是最长的targets data长度

In [13]:
def test_pad_sequence():
    """Print the zero-padded batch built from three sequences of length 3, 1 and 2."""
    sequences = [
        torch.tensor([[2., 0.04761905], [3., 0.14285714], [4., 0.04761905]]),
        torch.tensor([[1., 0.04761905]]),
        torch.tensor([[3., 0.14285714], [4., 0.04761905]]),
    ]
    # batch_first=True puts the batch dimension first in the padded result.
    padded = pad_sequence(sequences, batch_first=True)
    print(padded)

In [14]:
def my_collate_fn(data):
    """Collate a list of ``(data, label)`` samples into two batched float tensors.

    Args:
        data: list whose elements are ``(data, label)`` pairs, one per sample.
            Every ``data`` item must have the same shape (they are stacked);
            ``label`` items may differ in length along their first axis.

    Returns:
        ``(data_tensor, target_tensor)``: the stacked inputs, and the labels
        zero-padded to the longest label in the batch (batch dimension first).
        Both are cast to float.
    """
    # Turn the list of pairs into two parallel lists of tensors.
    features = [torch.tensor(sample[0]) for sample in data]
    labels = [torch.tensor(sample[1]) for sample in data]

    # pad_sequence fills the shorter label sequences with zeros.
    target_tensor = pad_sequence(labels, batch_first=True).float()
    data_tensor = torch.stack(features).float()

    return data_tensor, target_tensor

In [15]:
def _build_loader(indices):
    """Return a DataLoader over `dataset` that samples only from `indices`."""
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,            # ordering is delegated to the sampler below
        num_workers=0,
        drop_last=False,
        sampler=SubsetRandomSampler(indices),
        collate_fn=my_collate_fn,
    )


train_loader = _build_loader(train_idx)
val_loader = _build_loader(val_idx)
test_loader = _build_loader(test_idx)

# 3. The Net
1. BatchNorm1d: The mean and std are calculated per-dimension over the mini-batches
2.

In [16]:
### BatchNorm2d smoke test
def test_BN():
    """Run a non-affine BatchNorm2d over a tiny batch and print each intermediate shape."""
    bn = nn.BatchNorm2d(3, affine=False)  # affine=False: no learnable scale/shift
    print('m:', bn)

    # Two samples, three channels, four values per channel.
    sample_a = [[1., 2., 3., 4.], [1., 2., 3., 4.], [-1., -2., -3., -4.]]
    sample_b = [[0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]]
    batch = torch.tensor([sample_a, sample_b], requires_grad=True)
    print('input:', batch.shape)

    # BatchNorm2d needs a 4-D input, so insert a singleton axis at position 2.
    batch_4d = batch.unsqueeze(dim=2)
    print('input:', batch_4d.shape)

    normalised = bn(batch_4d)
    print('output:', normalised.shape)
    print('output:', normalised)

In [17]:
### Reproduce by hand the BatchNorm2d result for the case where the input is 3-dimensional
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the session.
input = torch.tensor([[[1.,2.,3.,4.]],[[0.,0.,0.,0.]]])
# print(input.shape)
# torch.mean(input),torch.var(input,unbiased = False)
# Normalise with the mean and the biased variance taken over all elements (no epsilon term here).
(input-torch.mean(input))/ torch.sqrt(torch.var(input,unbiased = False))

tensor([[[-0.1690,  0.5071,  1.1832,  1.8593]],

        [[-0.8452, -0.8452, -0.8452, -0.8452]]])

## 3.1 Init the weight
- 设置网络初始权重: 不太work

In [18]:
class model_param_init(nn.Module):
    """Wrap a model and re-initialise its weight tensors with He (Kaiming) normal init.

    Args:
        model: the ``nn.Module`` whose parameters are re-initialised in place.

    The wrapped model is stored as ``self.net`` and calls on the wrapper are
    forwarded to it, so ``mlp = model_param_init(mlp)`` yields a usable model.
    """

    def __init__(self, model):
        super().__init__()
        assert isinstance(model, nn.Module), 'model not a class nn.Module'
        self.net = model
        self.initParam()

    def initParam(self):
        """Apply Kaiming-normal init (fan_in, relu gain) to every parameter with >= 2 dims.

        ``kaiming_normal_`` cannot compute fan-in for tensors with fewer than two
        dimensions and raises ``ValueError`` on them, so 1-D parameters (biases,
        normalisation scales) are left at their existing values.
        Other initialisers tried here before: zeros_, ones_, normal_, uniform_,
        constant_, eye_, xavier_uniform_, xavier_normal_, kaiming_uniform_.
        """
        for param in self.net.parameters():
            if param.dim() < 2:
                continue
            nn.init.kaiming_normal_(param, a=0, mode='fan_in', nonlinearity='relu')

    def forward(self, *args, **kwargs):
        """Delegate the forward pass to the wrapped model."""
        return self.net(*args, **kwargs)

- print网络每层结构

In [19]:
DEBUG = False


class PrintLayer(nn.Module):
    """Identity layer that prints the tensor passing through it when DEBUG is truthy."""

    def forward(self, x):
        # Guard clause: with debugging off this layer is a pure pass-through.
        if not DEBUG:
            return x
        print("This layer: ")
        print(x)      # print(x.shape) is a lighter alternative
        return x

- Sequential结构
- 最后输出mu的时候求了mean，不太好？

- hook_backward_fn: 输入端的grad，输出端的grad，这里称呼的**输入与输出是站在前向传播的角度的**。如果模块有多个输入与输出的话， 其`grad_input`和`grad_output`可以是tuple类型。
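- 与forward不同的是，使用旧版的`register_backward_hook`时，backward传播**不仅反向传递input和output的grad，还会传递模块Parameter的grad**：
    - 比如fc模块，其`grad_input`是一个三元组的tuple，（对bias的梯度，对输入的梯度，对w的梯度）；
    - conv模块`grad_input`也是一个三元组tuple，为（对输入的梯度，对w的梯度，对bias的梯度）。
    - 注意：`register_full_backward_hook`的`grad_input`只包含对模块输入的梯度，不包含对Parameter的梯度。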

In [20]:
# Hook callbacks must accept exactly these three positional arguments.
def _print_hook_report(banner, module, fields):
    """Print one hook report: banner, module, each (label, value) pair, then a rule."""
    print(banner)
    print(f"module: {module}")
    for label, value in fields:
        print(f"{label}: {value}")
    print("="*20)


def hook_forward_fn(module, input, output):
    """Forward hook: print the module with its input and output."""
    _print_hook_report("It's forward: ", module,
                       (("input", input), ("output", output)))


def hook_backward_fn(module, grad_input, grad_output):
    """Backward hook: print the module with the gradients at its input and output."""
    _print_hook_report("It's backward: ", module,
                       (("grad_input", grad_input), ("grad_output", grad_output)))


def hook_backward_fn_pi(module, grad_input, grad_output):
    """Backward hook labelled for the pi head."""
    _print_hook_report("It's backward in pi: ", module,
                       (("grad_input", grad_input), ("grad_output", grad_output)))


def hook_backward_fn_sigma(module, grad_input, grad_output):
    """Backward hook labelled for the sigma head."""
    _print_hook_report("It's backward in sigma: ", module,
                       (("grad_input", grad_input), ("grad_output", grad_output)))


def hook_backward_fn_mu(module, grad_input, grad_output):
    """Backward hook labelled for the mu head."""
    _print_hook_report("It's backward in mu: ", module,
                       (("grad_input", grad_input), ("grad_output", grad_output)))

## 3.2 MLP结构


In [21]:
# Hand-written forward pass (not a single nn.Sequential)
class MLP(nn.Module):
    """Fully connected mixture-density network without convolution layers.

    Args:
        n_gaussians: number of mixture components; each of the three heads
            emits this many values per sample.

    ``forward`` returns ``(pi, mu, sigma)``: softmax weights, unconstrained
    means, and ``exp``-transformed (hence positive) scales.
    """

    def __init__(self, n_gaussians) -> None:
        super().__init__()
        # Input normalisation plus the parameter-free layers reused by every hidden block
        self.BN = nn.BatchNorm1d(num_features=3, affine=False)
        self.flatten = nn.Flatten()
        self.ac_func = nn.Softplus()
        self.drop = nn.Dropout(0.3)

        # Trunk: 900 -> 300 -> 30 -> 9
        self.linear00 = nn.Linear(900, 300)
        self.linear01 = nn.Linear(300, 30)
        self.linear1 = nn.Linear(30, 9)

        # Heads; with n_gaussians = 3 each Linear(9, 3) holds 30 learnable values
        self.z_pi = nn.Sequential(
            nn.Linear(9, n_gaussians),
            nn.Softmax(dim=1),
        )
        self.z_mu = nn.Linear(9, n_gaussians)
        self.z_sigma = nn.Linear(9, n_gaussians)

    def forward(self, x):
        hidden = self.flatten(self.BN(x))

        # Every hidden block is Linear -> Dropout -> Softplus (dropout comes before the activation).
        for layer in (self.linear00, self.linear01, self.linear1):
            hidden = self.ac_func(self.drop(layer(hidden)))

        pi = self.z_pi(hidden)
        mu = self.z_mu(hidden)
        sigma = torch.exp(self.z_sigma(hidden))   # exp keeps every scale positive
        return pi, mu, sigma

In [22]:
# Shape check: nn.Flatten(start_dim=1) keeps the batch axis and merges every remaining axis into one.
input = torch.tensor([
    [[1.,2.,3.,4.],[1.,2.,3.,4.],[-1.,-2.,-3.,-4.]],
    [[0.,0.,0.,0.],[0.,0.,0.,0.],[0.,0.,0.,0.]]
], requires_grad=True)
input = input.unsqueeze(dim=2)
print(f"input's shape is {input.shape}")
flt = nn.Flatten(start_dim=1)
flt(input).shape

input's shape is torch.Size([2, 3, 1, 4])


torch.Size([2, 12])

# 4. The Loss
- `loss_preparation`用来做loss的前期data准备：
    - 计算混合模型的分布`m`以及target data中的`duration`


In [23]:
# When the inputs have shape [50, 3] the result should hold 50 mixtures.
def loss_preparation(pi, mu, sigma, target):
    """Build one Normal distribution per batch row and extract the target durations.

    Args:
        pi: mixture weights; only its first dimension (the batch size) is read here.
        mu: per-row component means, indexed as ``mu[i, :]``.
        sigma: per-row component scales, indexed as ``sigma[i, :]``.
        target: 3-D tensor; index 0 along the last axis is taken as the duration.

    Returns:
        ``(duration, m)``: ``target[:, :, 0]`` and a list with one
        ``torch.distributions.Normal`` per batch row.
    """
    # One distribution object per row; the row's components form its batch shape.
    m = [
        torch.distributions.Normal(loc=mu[row, :], scale=sigma[row, :])
        for row in range(pi.shape[0])
    ]

    duration = target[:, :, 0]
    return duration, m

In [24]:
# For inputs of shape [batch, K] there is one K-component mixture per batch row;
# the loss is the mean negative log-likelihood of each row's non-zero targets.

def loss_fn(Pi,duration,m,N_gaussians):
    """Mean negative log-likelihood of the targets under the predicted mixtures.

    The whole computation stays inside the autograd graph, so ``backward()`` on
    the returned loss reaches every input that requires grad (``Pi`` and the
    ``loc``/``scale`` tensors held by the distributions in ``m``).

    Args:
        Pi: mixture weights, indexed as ``Pi[i, :]`` for row ``i``.
        duration: target values, indexed as ``duration[i, :]``; zeros are
            treated as padding and ignored.
        m: list with one ``torch.distributions.Normal`` per batch row.
        N_gaussians: kept for interface compatibility; broadcasting replaces the
            explicit repeat over components, so the value is not read.

    Returns:
        ``(loss_ts, loss_list)``: the scalar loss tensor averaged over the batch
        rows, and the per-row losses as Python floats.

    Raises:
        ValueError: if ``m`` is empty.
    """
    if len(m) == 0:
        raise ValueError("loss_fn needs at least one mixture (got an empty batch)")

    # Same floor as before: mixture densities below 1e-40 count as 1e-40,
    # applied in log space so nothing has to underflow to 0 first.
    log_floor = math.log(1e-40)

    per_sample = []
    for i in range(len(m)):
        pi = Pi[i, :]
        target = duration[i, :].to(pi.device)

        # A boolean mask always yields a 1-D tensor, also when exactly one
        # target is non-zero (the index-and-squeeze form collapsed it to 0-d).
        observed = target[target != 0]
        if observed.numel() == 0:
            # No targets in this row: contribute 0 but stay attached to the graph.
            per_sample.append(pi.sum() * 0.0)
            continue

        # (T, 1) values against the per-component parameters -> (T, K) log-densities
        log_component = m[i].log_prob(observed.unsqueeze(dim=1))
        # Guard log(0) for weights that underflowed to exactly zero
        log_weight = torch.log(pi.clamp_min(torch.finfo(pi.dtype).tiny))
        # log sum_k pi_k N(y | mu_k, sigma_k), computed stably
        log_mixture = torch.logsumexp(log_component + log_weight, dim=1)
        log_mixture = torch.clamp(log_mixture, min=log_floor)

        per_sample.append(-log_mixture.mean())

    loss_ts = torch.stack(per_sample).mean()
    loss_list = [value.item() for value in per_sample]
    return loss_ts,loss_list

In [25]:
def loss_test():
    """Exercise the zero-density padding logic on an all-zero density vector and print the result."""
    densities = torch.tensor([0.] * 16)

    # Keep only the non-zero densities (none here), as a column vector.
    nonzero = densities[torch.nonzero(densities)].view(-1, 1)
    print("loss_3:", nonzero.shape)

    # Use the smallest density when all are positive, otherwise a fixed tiny value.
    if torch.min(densities) > 0:
        fill_value = torch.min(nonzero)
    else:
        fill_value = 1e-20
    print("MIN_LOSS：", fill_value)

    # Pad back to the original length with the fill value.
    n_missing = densities.shape[0] - nonzero.shape[0]
    filler = torch.tensor([fill_value] * n_missing)  # ,device=device
    filled = torch.cat((nonzero[:, 0], filler))
    print("data: ", filled.data)
    print("data: ", type(filled))

# 5. Training
## 5.1 preparations
1. 初始化Visdom环境
2.


In [26]:
# writer = SummaryWriter("logs-MLP")
# Module-level Visdom client for environment "001"; draw_mdn and draw_loss plot through this `viz` object.
viz = Visdom(env="001")
print("Done")

Setting up a new session...


Done


## 5.2 Plot
1. draw:
    - loss
    - MLP的网络结构（.png）
    - target distrb.和pred. distrb.
    - 所有target data的分布图

In [27]:
def draw_all_target_data():
    """Scatter-plot the first 100 target CSV files, one Visdom window per file.

    Files are taken from ``../data/targets`` in ``os.listdir`` order and plotted
    in the Visdom environment ``"target_scatter"``. Duplicate rows are dropped
    before plotting. Histogram, line and bar renderings were tried earlier; the
    scatter plot is the one kept.
    """
    # Local import: no visible cell imports `os` explicitly (it may only be
    # reachable through `from mydataset import *`).
    import os

    max_plots = 100     # only this many files are read and drawn

    # Directory holding every target file
    target_path = r"../data/targets"
    target_all_path = os.listdir(target_path)
    len_target = len(target_all_path)
    print(f"一共有 *{len_target}* 组 target data")

    viz_env_str = "target_scatter"
    viz = Visdom(env = viz_env_str)

    # Slice first, so files that will never be plotted are not parsed at all.
    for file_name in target_all_path[:max_plots]:
        target_df = pd.read_csv(os.path.join(target_path, file_name), encoding="utf-8")

        win_str = str(file_name)
        title_str = "Target Distrb. of "+win_str

        # Non-mutating de-duplication instead of inplace=True
        unique_rows = target_df.drop_duplicates()
        viz.scatter(X=np.array(unique_rows),env=viz_env_str, win=win_str,opts=dict(title=title_str,markersize = 3))
    print("Done")


# draw_all_target_data()

In [28]:
#### Drawing smoke test
def test_draw():
    """Plot three fixed Gaussians and their weighted sum in one Visdom line window."""
    viz = Visdom(env="001")

    # Fixed three-component mixture
    weights = torch.tensor([0.2,0.3,0.5])
    components = torch.distributions.Normal(
        loc=torch.tensor([0,10,20]),
        scale=torch.tensor([1,1,1]),
    )

    # Evaluate each component on the integer grid 0..999, one column per component
    x_0 = torch.tensor(np.arange(0,1000))
    grid = torch.repeat_interleave(x_0.unsqueeze(dim=1), repeats=3, dim=1)
    densities = torch.exp(components.log_prob(grid))
    mixture = torch.sum(weights*densities, dim=1).unsqueeze(dim=1)

    curves = torch.cat([densities, mixture], dim=1)
    viz.line(X=x_0, Y=curves, env="001", win="test_draw_2",
             opts=dict(title='test_draw', legend=['N1', 'N2', 'N3','NNN']))
# test_draw()

In [29]:
def draw_mdn(pi, m, target, total_train_step,N_gaussians):
    """Plot one sample's target distribution and predicted mixture in a Visdom window.

    Args:
        pi: mixture weights of the sample; multiplied with the per-component densities.
        m: distribution object providing ``log_prob`` for the sample's components.
        target: 2-D tensor; column 0 is used as the x values and column 1 as the y values.
        total_train_step: used only to build the window name and title.
        N_gaussians: number of columns the x grid is repeated into.

    Reads the module-level ``viz`` and ``device``.
    """
    # Only the first sample of a batch is drawn

    # The target distrb.
    n_target = target[:,0]
    n = n_target[torch.nonzero(n_target)]
    # Auctions amount
    # NOTE(review): len(n_target) also counts zero entries (presumably padding) — confirm this is the intended scale factor
    auction_num = len(n_target)
    p_target = target[:,1]
    p = p_target[torch.nonzero(p_target)]*auction_num
    # NOTE(review): max() over an empty `n` raises ValueError when the sample has no non-zero entry in column 0
    max_n = max(n).item()           # length of the horizontal axis

    # The predicted distrb.
    x_0 = torch.arange(0,max_n).to(device=device)
    x = torch.repeat_interleave(x_0.unsqueeze(dim=1), repeats=N_gaussians, dim=1)
    y = torch.exp(m.log_prob(x)).to(device=device)                       # y: one Gaussian curve per component; y_pred: the single mixture curve
    # y_pred = torch.unsqueeze(torch.sum(pi*y,dim=1),dim=1)                # dims must match before cat
    y_pred = torch.sum(pi*y,dim=1)
    y_pred = y_pred*auction_num    # scale by auction_num

    # Init
    win_str = "total_train_step = "+str(total_train_step)
    title_str = "Distrb. in "+win_str
    viz.line(X = [0.],Y = [0.], env="001", win=win_str, opts= dict(title=title_str))

    # Visdom cannot draw a histogram and a line in the same window,
    # so both curves are drawn as lines
    # Plot y_target
    # viz.histogram(X = n, env="001", win=win_str,
    #         opts= dict(title=title_str))
    viz.line(X = n,Y= p, env="001", win=win_str, update="append", name='target',
            opts= dict(title=title_str,markers = True,markersize = 7,markersymbol = "cross-thin-open"))

    # Plot y_pred
    viz.line(X = x_0,Y= y_pred, env="001", win=win_str, update="append", name='pred',
            opts= dict(title=title_str))

In [30]:
def draw_the_net_png():
    """Render the MLP's autograd graph to a PNG under ``data_pic`` and open it."""
    # Random input first, then the model, so RNG consumption order is unchanged.
    sample_input = torch.randn([5, 3, 300])
    mlp = MLP(N_gaussians)
    outputs = mlp(sample_input)

    # Label graph nodes with the parameter names plus the input tensor 'x'.
    named_tensors = dict(list(mlp.named_parameters()) + [('x', sample_input)])
    graph = make_dot(outputs, params=named_tensors)

    graph.format = "png"
    graph.directory = "data_pic"    # output folder
    graph.view()                    # write the file and open a viewer
# draw_the_net_png()

In [31]:
win_train_loss_str = "The Loss of BATCH in the Training Data"
win_vali_loss_str = "The Loss in the Vali Data"
win_train_epoch_loss_str = "The Loss of EPOCH in the Training Data"

def draw_loss(X_step, loss, win_str):
    """Append the point ``(X_step, loss)`` to the Visdom line window named ``win_str``."""
    window_opts = dict(title=win_str)
    viz.line(X=[X_step], Y=[loss], win=win_str, update="append", opts=window_opts)


## 5.3 Training


In [36]:
# Build the network, reset the three loss windows, save the initial weights,
# move the model to `device`, and create the optimizer.
mlp = MLP(N_gaussians)

# Init the params
# # mlp = model_param_init(mlp)
# Init the vis: re-create each loss window with a single (0, 0) point
viz.line(X = [0.],Y = [0.], env="001", win=win_train_loss_str, opts= dict(title=win_train_loss_str))
viz.line(X = [0.],Y = [0.], env="001", win=win_vali_loss_str, opts= dict(title=win_vali_loss_str))
viz.line(X = [0.],Y = [0.], env="001", win=win_train_epoch_loss_str, opts= dict(title=win_train_epoch_loss_str))

# Save the init params
torch.save(mlp.state_dict(), 'mlp_init.pth')

# Read the saved model
# model_data = torch.load('mlp_init_loss_17.pth')
# mlp.load_state_dict(model_data)

mlp = mlp.to(device=device)
summary(mlp, (3,300))
# optimizer = torch.optim.Adagrad(mlp.parameters(),lr=learning_rate, lr_decay=learning_rate, weight_decay=learning_rate)

# Set different lr for params
# id(): returns an integer that uniquely identifies an object (its memory address); used here to tell the mu-head parameters apart
mu_params = list(map(id, mlp.z_mu.parameters()))
base_params = filter(lambda p: id(p) not in mu_params, mlp.parameters())
params = [{'params': base_params},         # a group without its own 'lr' falls back to the optimizer-level default
        {'params': mlp.z_mu.parameters(), 'lr': learning_rate * 100}]
# optimizer = torch.optim.Adam(params, lr=learning_rate)
# NOTE(review): learning_rate is reused as lr_decay and weight_decay below — confirm that is intended
optimizer = torch.optim.Adagrad(params,lr=learning_rate, lr_decay=learning_rate, weight_decay=learning_rate)

# # hooks
# NOTE(review): the commented lines below reference mlp.conv1, which the MLP class in this notebook does not define
# #mlp.conv1.register_forward_hook(hook_forward_fn)
# mlp.conv1.register_full_backward_hook(hook_backward_fn)
#
# #mlp.linear1.register_forward_hook(hook_forward_fn)
# #mlp.linear1.register_full_backward_hook(hook_backward_fn)
#
# #mlp.z_pi.register_forward_hook(hook_forward_fn)
# mlp.z_pi.register_full_backward_hook(hook_backward_fn_pi)
#
# #mlp.z_mu.register_forward_hook(hook_forward_fn)
# mlp.z_mu.register_full_backward_hook(hook_backward_fn_mu)
#
# #mlp.z_sigma.register_forward_hook(hook_forward_fn)
# mlp.z_sigma.register_full_backward_hook(hook_backward_fn_sigma)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
       BatchNorm1d-1               [-1, 3, 300]               0
           Flatten-2                  [-1, 900]               0
            Linear-3                  [-1, 300]         270,300
           Dropout-4                  [-1, 300]               0
          Softplus-5                  [-1, 300]               0
            Linear-6                   [-1, 30]           9,030
           Dropout-7                   [-1, 30]               0
          Softplus-8                   [-1, 30]               0
            Linear-9                    [-1, 9]             279
          Dropout-10                    [-1, 9]               0
         Softplus-11                    [-1, 9]               0
           Linear-12                    [-1, 3]              30
          Softmax-13                    [-1, 3]               0
           Linear-14                   

In [37]:
# Training loop: one optimizer step per batch, one loss point per batch and per epoch.
# filename = "../log_file.txt"
# f = open(filename,'w')
total_train_step = 0
mlp.train()
# NOTE(review): the epoch count is hard-coded to 10 although the settings cell defines EPOCH_NUM = 5.
for epoch in range(10):
    total_train_loss = 0
    for batch_id,data in enumerate(train_loader):

        input_data, target = data
        print(f"---- {batch_id} batch----")
        # Do the inference
        input_data = input_data.to(device)
        pi, mu, sigma = mlp(input_data)

        # Build the mixtures from the live network outputs. Passing detached
        # copies here cut mu and sigma out of the autograd graph, so no loss
        # could ever send a gradient to those heads.
        duration,m  = loss_preparation(pi, mu, sigma, target)
        loss,loss_list = loss_fn(pi,duration,m,N_gaussians)
        total_train_loss += loss.item()
        draw_loss(total_train_step, loss.item(),win_train_loss_str)
        # print("训练次数：{}，Loss：{}".format(total_train_step, loss.item()))

        # Optim
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print("训练次数：{}，Loss：{}, Loss's grad: {}".format(total_train_step, loss.item(), loss.grad))

        if total_train_step % 10 == 0:
            # Only draw the first sample of the batch. Plotting happens under
            # no_grad because m[0] is still attached to the autograd graph.
            with torch.no_grad():
                draw_mdn(pi[0,:].detach(), m[0], target[0].detach(), total_train_step,N_gaussians)

        # # Do validation
        # total_vali_loss = 0
        # for vali_batch_id, vali_data in enumerate(val_loader):
        #     vali_input_data, vali_target = vali_data
        #     vali_input_data = vali_input_data.to(device)
        #     vali_pi, vali_mu, vali_sigma = mlp(vali_input_data)
        #
        #     # Cal the sum of vali loss instead of vali loss in a batch
        #     vali_duration,vali_m  = loss_preparation(vali_pi.detach(), vali_mu.detach(), vali_sigma.detach(), vali_target.detach())
        #     vali_loss, _ = loss_fn(vali_pi,vali_duration,vali_m,N_gaussians)
        #     total_vali_loss += vali_loss
        # # Plot the vali loss
        # # total_vali_loss.item()/83.
        # draw_loss(total_train_step, (total_vali_loss.item()), win_vali_loss_str)

        total_train_step += 1

    # Plot the loss in this EPOCH
    print(f"========== IN EPOCH {epoch} the total loss is {total_train_loss} ==========")
    draw_loss(epoch, total_train_loss,win_train_epoch_loss_str)

# f.close()
print("Done")

---- 0 batch----
---- 1 batch----
---- 2 batch----
---- 3 batch----
---- 4 batch----
---- 5 batch----
---- 6 batch----
---- 7 batch----
---- 8 batch----
---- 9 batch----
---- 10 batch----
---- 11 batch----
---- 12 batch----
---- 13 batch----
---- 14 batch----
---- 15 batch----
---- 16 batch----
---- 17 batch----
---- 18 batch----
---- 19 batch----
---- 20 batch----
---- 21 batch----
---- 22 batch----
---- 23 batch----
---- 24 batch----
---- 25 batch----
---- 26 batch----
---- 27 batch----
---- 28 batch----
---- 29 batch----
---- 30 batch----
---- 31 batch----
---- 32 batch----
---- 33 batch----
---- 34 batch----
---- 35 batch----
---- 36 batch----
---- 37 batch----
---- 38 batch----
---- 39 batch----
---- 40 batch----
---- 41 batch----
---- 42 batch----
---- 43 batch----
---- 44 batch----
---- 45 batch----
---- 46 batch----
---- 47 batch----
---- 48 batch----
---- 49 batch----
---- 50 batch----
---- 51 batch----
---- 52 batch----
---- 53 batch----
---- 54 batch----
---- 55 batch----
--


$\mathcal{L}(y \vert x) = - \log\bigg\{\sum_{k=1}^K \pi_k(x)  \mathcal{N}\big(y \vert \mu_k(x), \Sigma_k(x)\big)\bigg\}$

In [180]:
# torch.save(mlp.state_dict(), 'mlp_init_loss_17.pth')