In [69]:

%load_ext autoreload
%autoreload 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/6/7 15:17
# @Author  : Wang Yujia
# @File    : mlp.ipynb
# @Description : 把mlp的写法变成非sequential版的


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. what for
1. 搭一个基本的mlp

# 1. Preparations
## 1.1 global settings

In [70]:

# nums of Gaussian kernels
N_gaussians = 3

# dataset划分
batch_size = 50
train_pct = 0.7
vali_pct = 0.2
test_pct = 0.1

# train and optim.
learning_rate = 0.0001
total_train_step = 0
total_test_step = 0
EPOCH_NUM = 5
MIN_LOSS = 1e-7


import pandas as pd
import numpy as np
from pprint import pprint
import random
import torch.utils.data
from mydataset import *
import torch.nn as nn
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch.nn.functional as F
from torchsummary import summary
from tensorboardX import SummaryWriter
from visdom import Visdom
from torchviz import make_dot
import math

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## 1.2 the data path


In [71]:
# training data
train_path = r"../data/train"
# target data
target_path = r"../data/targets"
# data keys
data_key_path = "../data/target_datakey.csv"

# 2. Dataset and Dataloader
1. DataLoader中的shuffer=True表示在每一次epoch中都打乱所有数据的顺序，然后以batch为单位从头到尾按顺序取用数据。这样的结果就是不同epoch中的数据都是乱序的,设置随机种子的作用就是让你的每一次训练都乱的一样，

## 2.1 Dataset and spliting


- 设置随机数种子

In [72]:
def setup_seed(seed):
    torch.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    # torch.backends.cudnn.deterministic = True

setup_seed(7)

- 读取data

In [73]:
dataset = myDataset(train_path, target_path, data_key_path)

- 产生index的乱序排列

In [74]:
shuffled_indices = np.random.permutation(dataset.__len__())
# shuffled_indices = np.arange(0,dataset.__len__())
train_idx = shuffled_indices[:int(train_pct*dataset.__len__())]
# train_idx = shuffled_indices
tmp = int((train_pct+vali_pct)*dataset.__len__())
val_idx = shuffled_indices[int(train_pct*dataset.__len__()):tmp]

test_idx = shuffled_indices[tmp:]
print(train_idx)

[ 807  305  455  939  508  594  835 1082  598 1102   46  761  841  141
  407  334  253  500  734  936  698  446  907 1087 1009  140  463  547
 1155  856   34 1156  703 1121  751  587 1100  509  473  128  788   97
  471  385 1085  525  679 1135  284 1146  186  318 1088 1113    9  457
 1047  451  569 1120 1074  326  377  809  109   98  620 1194  825  828
   83  113  556  674  568  853  351  558   54  656  804 1149  101  344
  851  544  955   40 1021  489  626  664  657  868 1169  151  179 1130
  564  713  743 1014  966  861 1132  146  610  408  662  551  172 1020
  982 1148  431  517  270  858  170  374  816  618  205   17   53 1003
  263  857  716  843  498  228  339  725  752  278  649 1017  108  642
 1195 1174   99  530  632  888  189  961  358 1078  663  757 1051  204
  409  283  562   23  619  216  474  921  950 1162  123  785  769  621
  262  586 1178  541  795   70 1189  396  171  845  168 1125  361  224
  125  706  164  231  264   42  746  872  998  132 1035  430   38  522
  880 

- 根据这个乱序排列抽取dataset

## 2.2 Dataloader and collating
1. 主要是对label数据进行collate
    - 按照batch中的最大target data长度进行padding，padding with 0
2. 返回的结果多一个batch dim,比如下面的`5`
    - After collating:
        - `torch.Size([5, 3, 300]),torch.Size([5, 87, 2])`
        - `87`是最长的targets data长度

In [75]:
def my_collate_fn(data):
# 这里的data是一个list， list的元素是元组: (self.data, self.label)
# collate_fn的作用是把[(data, label),(data, label)...]转化成([data, data...],[label,label...])
# 假设self.data的一个data的shape为(channels, length), 每一个channel的length相等,
# data[索引到index(batch)][索引到data或者label][索引到channel]

    data.sort(key=lambda x: len(x[1]), reverse=False)   # 按照targets数据长度升序排序
    max_len = len(data[-1][1])                          # 选取最长的targets数据长度

    data_list = []
    target_list = []

    # target_data_record_num = 0
    padding_cnt = 0   # 数一下padding了多少

    # padding with 0 for those target data small in amount
    batch = 0
    while data[batch][1].shape[0] < max_len:
        padding_cnt += max_len - data[batch][1].shape[0]
        tmp = np.array([[0,0]]* (max_len - data[batch][1].shape[0]))
        data_list.append(data[batch][0])                # 原样保存training data
        # print(f"compare {data[batch][1].shape} with {tmp.shape}")
        target_list.append(np.concatenate([data[batch][1], tmp], axis=0 ))
        batch += 1

    while batch < len(data):                           # 避免出现2个max长度的data
        data_list.append(data[-1][0])
        target_list.append(data[-1][1])
        batch += 1

    data_tensor = torch.from_numpy(np.stack(data_list)).float()
    target_tensor = torch.from_numpy(np.stack(target_list)).float()

    # print("[PADDING in collating]：一共padding的0个个数：",padding_cnt)
    return (data_tensor, target_tensor)

In [76]:
train_loader = DataLoader(dataset = dataset,batch_size = batch_size, shuffle=False, num_workers=0, drop_last=False, sampler=SubsetRandomSampler(train_idx), collate_fn = my_collate_fn)
val_loader = DataLoader(dataset = dataset,batch_size = batch_size, shuffle=False, num_workers=0, drop_last=False, sampler=SubsetRandomSampler(val_idx),collate_fn = my_collate_fn)

test_loader = DataLoader(dataset = dataset,batch_size = batch_size, shuffle=False, num_workers=0, drop_last=False, sampler=SubsetRandomSampler(test_idx),collate_fn = my_collate_fn)

# 3. The Net and Init
1. BatchNorm1d: The mean and std are calculated per-dimension over the mini-batches
2.

In [77]:
### BatchNorm2d测试
def test_BN():
    m = nn.BatchNorm2d(3, affine=False)  # affine: With Learnable Parameters or not
    print('m:', m)
    # The mean and std are calculated per-dimension over the mini-batches
    input = torch.tensor([
        [[1.,2.,3.,4.],[1.,2.,3.,4.],[-1.,-2.,-3.,-4.]],
        [[0.,0.,0.,0.],[0.,0.,0.,0.],[0.,0.,0.,0.]]
    ], requires_grad=True)

    print('input:', input.shape)
    input = input.unsqueeze(dim=2)
    print('input:', input.shape)
    output = m(input) # 归一化
    print('output:', output.shape)
    print('output:', output)

In [78]:
### 复现batchNorm2d在input shape为3维的情况
input = torch.tensor([[[1.,2.,3.,4.]],[[0.,0.,0.,0.]]])
# print(input.shape)
# torch.mean(input),torch.var(input,unbiased = False)
(input-torch.mean(input))/ torch.sqrt(torch.var(input,unbiased = False))

tensor([[[-0.1690,  0.5071,  1.1832,  1.8593]],

        [[-0.8452, -0.8452, -0.8452, -0.8452]]])

- 设置网络初始权重: 不太work

In [79]:

class model_param_init(nn.Module):
    def __init__(self, model):
        super().__init__()
        assert isinstance(model, nn.Module), 'model not a class nn.Module'
        self.net = model
        self.initParam()

    def initParam(self):
        for param in self.net.parameters():
            # nn.init.zeros_(param)
            # nn.init.ones_(param)
            # nn.init.normal_(param, mean=0, std=1)
            # nn.init.uniform_(param, a=0, b=1)
            # nn.init.constant_(param, val=1)   # 将所有权重初始化为1
            # nn.init.eye_(param)  # 只能将二维的tensor初始化为单位矩阵
            # nn.init.xavier_uniform_(param, gain=1)  # Glorot初始化  得到的张量是从-a——a中采用的
            # nn.init.xavier_normal_(param, gain=1)   # 得到的张量是从0-std采样的
            nn.init.kaiming_normal_(param, a=0, mode='fan_in', nonlinearity='relu') # he初始化方法
            # nn.init.kaiming_uniform_(param)

- print网络每层结构

In [80]:
DEBUG = False
class PrintLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        if(DEBUG):
            print("This layer: ")
            print(x)      #print(x.shape)
        return x

- Sequential结构

In [81]:
# Sequential
class MLP(nn.Module):
    # code->generate->override methods
    def __init__(self, n_gaussians) -> None:
        super().__init__()

        self.mlp_call = nn.Sequential(
            nn.BatchNorm2d(num_features=3,affine=False),
            nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,3), stride=(1,3), padding=0,bias=False),
            # PrintLayer(),
            nn.Softplus(),

            nn.BatchNorm2d(num_features=3,affine=False),
            nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,2), stride=(1,2), padding=0,bias=False),
            #nn.ReLU(inplace=True),
            # PrintLayer(),
            nn.Softplus(),

            nn.BatchNorm2d(num_features=3,affine=False),
            nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,5), stride=(1,5), padding=0,bias=False),
            # nn.ReLU(inplace=True),
            # PrintLayer(),
            nn.Softplus(),

            # nn.BatchNorm2d(num_features=3,affine=False),
            nn.Flatten(),
            nn.Linear(30, 9)
        )
        # π μ σ for MDN
        self.z_pi = nn.Sequential(
            nn.Linear(9, n_gaussians),  # 30个params要learn
            nn.Softmax(dim=1)
        )
        self.z_mu = nn.Linear(9, n_gaussians)
        self.z_sigma = nn.Linear(9, n_gaussians)

    def forward(self, x):
        # 加一个height维度
        x.unsqueeze_(dim=2)
        mlp_output = self.mlp_call(x)
        # print("mlp_output is :", mlp_output)
        # 输出n_gaussians个高斯的参数

        # # print the output of every layer
        # x = input
        # for i in range(len(list(self.mlp_call))):
        #     # 循环读入上一层的输出
        #     x = self.mlp_call[i](x)
        #     if i == 1:
        #         print(f"In layer {i}, the output is:\n",x)
        #
        tmp = self.z_pi(mlp_output)
        pi = torch.mean(tmp,dim=0)
        tmp = self.z_mu(mlp_output)
        mu = torch.mean(tmp,dim=0)
        tmp = torch.exp(self.z_sigma(mlp_output))
        # sigma has to be positive
        torch._assert((torch.nonzero(tmp<0, as_tuple=False).shape[0]<=0),"Sigma is less than zero!")
        sigma = torch.mean(tmp,dim=0)
        return pi, mu, sigma
        # return mlp_output

- 非Sequential结构

In [82]:
# Not Sequential
class MLP(nn.Module):
    # code->generate->override methods
    def __init__(self, n_gaussians) -> None:
        super().__init__()
        self.BN = nn.BatchNorm2d(num_features=3,affine=False)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,3), stride=(1,3), padding=0,bias=False)
        # PrintLayer(),
        self.ac_func1 = nn.Softplus()

        self.conv2 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,2), stride=(1,2), padding=0,bias=False)
        #nn.ReLU(inplace=True),
        # PrintLayer(),
        self.ac_func2 = nn.Softplus()
        self.conv3 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(1,5), stride=(1,5), padding=0,bias=False)
        # nn.ReLU(inplace=True),
        # PrintLayer(),
        self.ac_func3 = nn.Softplus()

        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(30, 9)

        self.z_pi = nn.Sequential(
            nn.Linear(9, n_gaussians),  # 30个params要learn
            nn.Softmax(dim=1)
        )
        self.z_mu = nn.Linear(9, n_gaussians)
        self.z_sigma = nn.Linear(9, n_gaussians)

        self.weight_after_bp = []

    def forward(self, x):
        # 加一个height维度
        x.unsqueeze_(dim=2)
        x = self.BN(x)
        # print("The conv1's input is : ",x)
        # self.weight_after_bp.append(param.clone())
        x = self.conv1(x)
        # print("After the conv1: ",x)
        x = self.ac_func1(x)

        x = self.BN(x)
        x = self.conv2(x)
        x = self.ac_func2(x)

        x = self.BN(x)
        x = self.conv3(x)
        x = self.ac_func3(x)

        x = self.flatten(x)
        x = self.linear1(x)
        # print("after linear1, the output shape is: ",x.shape)

        # pi = torch.mean(self.z_pi(x))
        pi = self.z_pi(x)
        print("pi's shape: ",pi.shape)
        mu = self.z_mu(x)
        print("mu's shape: ",mu.shape)
        sigma = torch.exp(self.z_sigma(x))
        print("sigma's shape: ",sigma.shape)

        # # print the output of every layer
        # x = input
        # for i in range(len(list(self.mlp_call))):
        #     # 循环读入上一层的输出
        #     x = self.mlp_call[i](x)
        #     if i == 1:
        #         print(f"In layer {i}, the output is:\n",x)
        #

        return pi, mu, sigma
        # return mlp_output

In [83]:
input = torch.tensor([
    [[1.,2.,3.,4.],[1.,2.,3.,4.],[-1.,-2.,-3.,-4.]],
    [[0.,0.,0.,0.],[0.,0.,0.,0.],[0.,0.,0.,0.]]
], requires_grad=True)
input = input.unsqueeze(dim=2)
print(f"input's shape is {input.shape}")
flt = nn.Flatten(start_dim=1)
flt(input).shape

input's shape is torch.Size([2, 3, 1, 4])


torch.Size([2, 12])

# 4. The Loss
- `loss_preparation`用来做loss的前期data准备：
    - 去掉dataloader传输时padding的那些0
    - 计算混合模型的分布`m`以及target data中的`duration`
- `loss_fn`用来计算


In [84]:
class MLELoss(nn.Module):

    def __init__(self, pi, mu, sigma):
        super().__init__()
        self.pi = pi
        self.mu = mu
        self.sigma = sigma


    def forward(self,y_pred,y_true):
        bce = torch.nn.BCELoss(reduction = "none")(y_pred,y_true)
        p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))

        return loss

- 构造直接计算prob的函数
- 输出50个GMM

In [87]:
ONEOVERSQRT2PI = 1.0 / math.sqrt(2 * math.pi)

def gaussian_probability(mu, sigma, target):
    """Returns the probability of `target` given MoG parameters `sigma` and `mu`.

    Arguments:
        sigma (BxGxO): The standard deviation of the Gaussians. B is the batch
            size, G is the number of Gaussians, and O is the number of
            dimensions per Gaussian.
        mu (BxGxO): The means of the Gaussians. B is the batch size, G is the
            number of Gaussians, and O is the number of dimensions per Gaussian.
        target (BxI): A batch of target. B is the batch size and I is the number of
            input dimensions.

    Returns:
        probabilities (BxG): The probability of each point in the probability
            of the distribution in the corresponding sigma/mu index.
    """
    # target_unpadded = torch.repeat_interleave(target.unsqueeze(1), repeats=3, dim=1)
    # print(f"target_unpadded is:{target_unpadded}")
    ret = ONEOVERSQRT2PI * torch.exp(-0.5 * ((target - mu) / sigma)**2) / sigma
    # ret = target_unpadded - mu
    # prod(): 返回输入张量给定维度dim=2上每行的积。 输出形状与输入相同，除了给定维度上为1
    #return torch.prod(ret, 2)
    return ret

In [88]:
def test_gaussian_probability():
    mu = torch.tensor([0,0])
    sigma = torch.tensor([1,1])
    target = torch.tensor([0,-1,1])
    pi = torch.tensor([1,1])
    prob = pi*gaussian_probability(mu,sigma,target)
    result = torch.sum(prob,dim=1)
    print(result)
test_gaussian_probability()

RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 0

In [None]:
# 当input的shape是[50,3]时，输出应该是50个GMM
def loss_preparation(pi, mu, sigma, target):

    m=[]
    for i in range(pi.shape[0]):
        m.append(torch.distributions.Normal(loc=mu[i,:].T, scale=sigma[i,:].T))

    duration = target[:,:,0].squeeze_()
    # # target_drop_padding.cpu().data.numpy()
    # # target_drop_padding.to_csv()
    # print("target_drop_padding shape:",target_drop_padding.shape)
    #
    # duration = torch.flatten(target[:,:,0])
    # print("duration shape:",duration.shape)
    # print("[PADDING]: drop 了多少个padded 0：",duration.shape[0] - target_drop_padding.shape[0])
    #
    # duration = torch.repeat_interleave(duration.unsqueeze(dim=1), repeats=3, dim=1).to(device)

    # # 统计一下target data的利用率
    # loss_1 = torch.exp(m.log_prob(duration))
    # #print("m.log_prob(duration): ",m.log_prob(duration))
    # loss_2 = torch.sum(loss_1 * pi, dim=1)
    # len_0 = len(loss_2)
    # print("non zero 占比：",len(loss_2[torch.nonzero(loss_2)])/len_0)
    return duration,m

- 测试gaussian_probability函数

In [None]:
# # 当input的shape是[50,3]时，输出应该是50个GMM
# # 对这50个GMM看能生成什么output
# # 测试gaussian_probability函数
# def loss_fn(Pi,duration,m,Mu,Sigma):
#     loss = torch.zeros(1,device=device)
#     # padding_cnt = 0
#     for i in range(len(m)):
#         target = duration[i,:]
#         pi = Pi[i,:]
#         mu = Mu[i,:]
#         sigma = Sigma[i,:]
#
#         # repeat and copy non-zero target data
#         target_unpadded = target[torch.nonzero(target)].squeeze_()
#         target_unpadded = torch.repeat_interleave(target_unpadded.unsqueeze(dim=1), repeats=3, dim=1).to(device)
#
#         print(f"-{i}:target_unpadded shape: {target_unpadded.shape}")
#         loss_1 = torch.exp(m[i].log_prob(target_unpadded))
#         # print(f"-{i}:loss_1 shape: {loss_1.shape}")
#         # print(f"-{i}:the pi is : {pi}")
#         print(f"-{i}:the m[i].log_prob(target_unpadded) is : {m[i].log_prob(target_unpadded)}")
#         print(f"-{i}:the loss_1 is : {loss_1}")
#         loss_2 = torch.sum(loss_1 * pi, dim=1)
#
#         prob = pi*gaussian_probability(mu,sigma,target_unpadded)
#         # result = torch.sum(prob,dim=1)
#         # print(f"--{i}: the result of GMM by hand: {result}")
#         # print(f"-{i}:loss_2 shape: {loss_2.shape}")
#         print(f"-{i}:the loss_2 is : {loss_2}")
#         # print("Is there any 0 in loss_2? ", ( len(torch.nonzero(loss_2)) < len(loss_2)))
#         loss_3 = loss_2[torch.nonzero(loss_2)].squeeze_()         # 去掉所有的log(0)
#         # print(f"-{i}:loss_3 shape: {loss_3.shape}")
#         # print(f"-{i}:the loss_3 is :{loss_3}")
#         loss += torch.mean(-torch.log(loss_3))
#         print(f"-{i}:the loss is : {loss}")
#
#     return loss.to(device)

In [None]:
# # 当input的shape是[50,3]时，输出应该是50个GMM
# # 对这50个GfMM看能生成什么output

def loss_fn(Pi,duration,m):
    loss = torch.zeros(1,device=device)
    # padding_cnt = 0
    for i in range(len(m)):
        target = duration[i,:]
        pi = Pi[i,:]

        # repeat and copy target data
        target_unpadded = target[torch.nonzero(target)].squeeze_()
        # padding_cnt += len(target)-len(target_unpadded)
        target_unpadded = torch.repeat_interleave(target_unpadded.unsqueeze(dim=1), repeats=3, dim=1).to(device)

        # print(f"-{i}:target_unpadded shape: {target_unpadded.shape}")
        loss_1 = torch.exp(m[i].log_prob(target_unpadded))

        # loss_1 = m[i].log_prob(target_unpadded)            # loss_1这里取消torch.exp的操作直接用pi*log_prob
        print(f"-{i}:loss_1 shape: {loss_1.shape}")
        print(f"-{i}:the pi is : {pi}")
        print(f"-{i}:the m[i].log_prob(target_unpadded) is : {m[i].log_prob(target_unpadded)}")
        print(f"-{i}:the loss_1 is : {loss_1}")
        loss_2 = torch.sum(loss_1 * pi, dim=1)
        #print(f"-{i}:loss_2 shape: {loss_2.shape}")
        print(f"-{i}:the loss_2 is : {loss_2}")
        # print("Is there any 0 in loss_2? ", ( len(torch.nonzero(loss_2)) < len(loss_2)))
        loss_3 = loss_2[torch.nonzero(loss_2)].squeeze_()         # 去掉所有的log(0)
        #print(f"-{i}:loss_3 shape: {loss_3.shape}")
        print(f"-{i}:the loss_3 is :{loss_3}")
        loss += torch.mean(-(loss_3))
        print(f"-{i}:the loss is : {loss}")

    # print(f"[UNPADDING in loss] There is {padding_cnt} zero unpadded")
    return loss.to(device)

- test for logsumexp的放缩思想
- didn't work here 因为exp会放大差值，这里的“差值”尤其大，不管指数上对x加减什么都无法拉回来

In [67]:
input = torch.tensor([[-1.0709e+03, -2.1728e+03, -2.2849e+02],
        [-1.2546e+03, -2.5474e+03, -2.6687e+02],
        [-2.0123e+03, -4.0929e+03, -4.2486e+02],
        [-7.9849e+03, -1.6292e+04, -1.6648e+03],
        [-9.7611e+03, -1.9922e+04, -2.0328e+03],
        [-1.2760e+04, -2.6051e+04, -2.6538e+03],
        [-3.0153e+04, -6.1614e+04, -6.2520e+03],
        [-3.6549e+04, -7.4694e+04, -7.5743e+03],
        [-5.5229e+04, -1.1290e+05, -1.1435e+04],
        [-9.1470e+04, -1.8703e+05, -1.8921e+04],
        [-1.0325e+05, -2.1112e+05, -2.1353e+04],
        [-1.5008e+05, -3.0693e+05, -3.1024e+04],
        [-1.7621e+05, -3.6038e+05, -3.6418e+04],
        [-1.8947e+05, -3.8752e+05, -3.9156e+04],
        [-2.6023e+05, -5.3229e+05, -5.3763e+04],
        [-3.9286e+05, -8.0366e+05, -8.1137e+04]])
c_max = torch.max(input)
c_min = torch.min(input)
c_mid = torch.median(input)
c_test = torch.tensor([90])
# c_test = torch.tensor([-100])
print(c_max,c_min,c_mid)
print(c_max-c_min)
input2 = input - c_max
print("input2: ",input2)

# exp本来就会放大差值
torch.exp(c_test)

tensor(-228.4900) tensor(-803660.) tensor(-31024.)
tensor(803431.5000)
input2:  tensor([[-8.4241e+02, -1.9443e+03,  0.0000e+00],
        [-1.0261e+03, -2.3189e+03, -3.8380e+01],
        [-1.7838e+03, -3.8644e+03, -1.9637e+02],
        [-7.7564e+03, -1.6064e+04, -1.4363e+03],
        [-9.5326e+03, -1.9694e+04, -1.8043e+03],
        [-1.2532e+04, -2.5823e+04, -2.4253e+03],
        [-2.9925e+04, -6.1386e+04, -6.0235e+03],
        [-3.6321e+04, -7.4466e+04, -7.3458e+03],
        [-5.5001e+04, -1.1267e+05, -1.1207e+04],
        [-9.1242e+04, -1.8680e+05, -1.8693e+04],
        [-1.0302e+05, -2.1089e+05, -2.1125e+04],
        [-1.4985e+05, -3.0670e+05, -3.0796e+04],
        [-1.7598e+05, -3.6015e+05, -3.6190e+04],
        [-1.8924e+05, -3.8729e+05, -3.8928e+04],
        [-2.6000e+05, -5.3206e+05, -5.3535e+04],
        [-3.9263e+05, -8.0343e+05, -8.0909e+04]])


tensor([inf])

# 5. Training
## 5.1 preparations
1. 初始化Visdom环境
2.


In [235]:
writer = SummaryWriter("logs-MLP")
viz = Visdom(env="001")

Setting up a new session...


## 5.2 Draw
1. draw:
    - mdn的图（visdom）以及mdn的test draw
    - loss图以及初始化（visdom）
    - MLP的网络结构（.png）

In [236]:
#### Test for drawing
def test_draw():
    viz = Visdom(env="001")

    mu = torch.tensor([0,10,20])
    sigma = torch.tensor([1,1,1])
    duration = torch.tensor([0,1,2,0])
    duration = torch.repeat_interleave(duration.unsqueeze(dim=1), repeats=3, dim=1)
    m = torch.distributions.Normal(loc=mu, scale=sigma)
    pi = torch.tensor([0.2,0.3,0.5])

    # draw
    x_0 = torch.tensor(np.arange(0,1000))
    x = torch.repeat_interleave(x_0.unsqueeze(dim=1), repeats=3, dim=1)
    y = torch.exp(m.log_prob(x))
    y_sum = torch.unsqueeze(torch.sum(pi*y,dim=1),dim=1)
    viz.line(X = x_0,Y= torch.cat([y,y_sum],dim = 1), env="001", win="test_draw_2",
            opts= dict(title='test_draw', legend=['N1', 'N2', 'N3','NNN']))
# test_draw()

In [237]:
def draw_mdn(pi,duration,m,total_train_step):
    # draw the distrb.
    x_0 = torch.arange(0,torch.max(duration).item()).to(device)
    x = torch.repeat_interleave(x_0.unsqueeze(dim=1), repeats=3, dim=1)
    y = torch.exp(m.log_prob(x)).to(device)
    y_sum = torch.unsqueeze(torch.sum(pi*y,dim=1),dim=1)   # 维度相等才能cat
    win_str = "total_train_step-"+str(total_train_step)
    viz.line(X = x_0,Y= torch.cat([y,y_sum],dim = 1), env="001", win=win_str,
        opts= dict(title=win_str, legend=['N1', 'N2', 'N3','NNN']))

In [238]:
def draw_the_net():

    x = torch.randn([5, 3, 300])  # 定义一个网络的输入值
    mlp = MLP(N_gaussians)
    y = mlp(x)    # 获取网络的预测值

    MyConvNetVis = make_dot(y, params=dict(list(mlp.named_parameters()) + [('x', x)]))
    MyConvNetVis.format = "png"
    # 指定文件生成的文件夹
    MyConvNetVis.directory = "data_pic"
    # 生成文件
    # MyConvNetVis.view()
# draw_the_net()

In [312]:
viz.line(X = [0.],Y = [0.], env="001", win="The Loss", opts= dict(title="The Loss"))
def draw_loss(total_train_step, loss):
    viz.line(X = [total_train_step], Y = [loss],win="The Loss", update="append",
        opts= dict(title="The Loss"))

## 5.3 Training


In [313]:
mlp = MLP(N_gaussians)
# # mlp = model_param_init(mlp)

# # save the init params
torch.save(mlp.state_dict(), 'mlp_init.pth')

# read the saved model
# model_data = torch.load('mlp_init_1epoch.pth')
# mlp.load_state_dict(model_data)

mlp = mlp.to(device=device)
summary(mlp, (3,300))
# optimizer = torch.optim.SGD(mlp.parameters(), lr=learning_rate)
optimizer = torch.optim.Adagrad(mlp.parameters(),lr=learning_rate, lr_decay=learning_rate, weight_decay=learning_rate)
#

pi's shape:  torch.Size([2, 3])
mu's shape:  torch.Size([2, 3])
sigma's shape:  torch.Size([2, 3])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
       BatchNorm2d-1            [-1, 3, 1, 300]               0
            Conv2d-2            [-1, 3, 1, 100]              27
          Softplus-3            [-1, 3, 1, 100]               0
       BatchNorm2d-4            [-1, 3, 1, 100]               0
            Conv2d-5             [-1, 3, 1, 50]              18
          Softplus-6             [-1, 3, 1, 50]               0
       BatchNorm2d-7             [-1, 3, 1, 50]               0
            Conv2d-8             [-1, 3, 1, 10]              45
          Softplus-9             [-1, 3, 1, 10]               0
          Flatten-10                   [-1, 30]               0
           Linear-11                    [-1, 9]             279
           Linear-12                    [-1, 3]              30
    

In [314]:
filename = "../log_file.txt"
f = open(filename,'w')
total_train_step = 0
mlp.train()
for epoch in range(0,1):
    for batch_id,data in enumerate(train_loader):
        input, target = data
        print(f"---- {batch_id} batch----")
        # print(f"---- {batch_id} batch----",file=f)

        # do the inference
        input = input.to(device)
        pi, mu, sigma = mlp(input)
        print(f"The [pi,mu,sigma] is : \n")
        print(pi,"\n",mu,"\n",sigma)

        # save the params
        # params = list(mlp.named_parameters())

        # # print the weight and grad with 'sequential' structure
        # for i,m in enumerate(mlp.mlp_call.children()):
        #     if isinstance(m, nn.Conv2d):
        #         print(str(i)+"(Conv2d).weight = ",m.weight)
        #         print(str(i)+"(Conv2d).weight.grad = ",m.weight.grad)
        #     if isinstance(m, nn.Linear):
        #         print(str(i)+"(Linear).weight = ",m.weight)
        #         print(str(i)+"(Linear).weight.grad = ",m.weight.grad)
        #
        # for i,m in enumerate(mlp.z_pi.children()):
        #     if isinstance(m, nn.Linear):
        #         print(str(i)+"(Linear).weight = ",m.weight)
        #         print(str(i)+"(Linear).weight.grad = ",m.weight.grad)
        #
        # for i,m in enumerate(mlp.z_mu.children()):
        #     if isinstance(m, nn.Linear):
        #         print(str(i)+"(Linear).weight = ",m.weight)
        #         print(str(i)+"(Linear).weight.grad = ",m.weight.grad)

        # cal the loss and draw the MDN
        duration,m  = loss_preparation(pi, mu, sigma, target)
        # draw_mdn(pi,duration,m,total_train_step)
        loss = loss_fn(pi,duration,m,mu,sigma)
        # loss.retain_grad()
        draw_loss(total_train_step, loss.item())
        print("训练次数：{}，Loss：{}".format(total_train_step, loss.item()))

        # optim
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print("训练次数：{}，Loss：{}, Loss's grad: {}".format(total_train_step, loss.item(), loss.grad))

        total_train_step += 1
        # if total_train_step % 10 == 0:
            # 一般不写loss，而是loss.item()
f.close()

[PADDING]：一共padding的0个个数： 27917
---- 0 batch----
pi's shape:  torch.Size([50, 3])
mu's shape:  torch.Size([50, 3])
sigma's shape:  torch.Size([50, 3])
The [pi,mu,sigma] is : 

tensor([[0.2820, 0.2052, 0.5129],
        [0.2660, 0.2228, 0.5112],
        [0.2515, 0.1976, 0.5509],
        [0.2458, 0.2040, 0.5502],
        [0.2618, 0.2265, 0.5117],
        [0.2531, 0.1984, 0.5485],
        [0.2698, 0.2099, 0.5202],
        [0.2772, 0.2058, 0.5170],
        [0.2835, 0.2441, 0.4723],
        [0.2671, 0.2227, 0.5102],
        [0.2730, 0.2051, 0.5219],
        [0.2980, 0.2112, 0.4908],
        [0.2826, 0.1965, 0.5209],
        [0.2567, 0.1945, 0.5488],
        [0.2729, 0.2046, 0.5224],
        [0.2970, 0.2168, 0.4862],
        [0.2914, 0.2333, 0.4753],
        [0.2827, 0.2174, 0.4999],
        [0.2612, 0.2104, 0.5284],
        [0.3016, 0.1940, 0.5044],
        [0.2364, 0.2021, 0.5614],
        [0.2920, 0.2309, 0.4771],
        [0.2493, 0.2055, 0.5452],
        [0.2413, 0.1980, 0.5607],
        

ValueError: Expected parameter loc (Tensor of shape (3,)) of distribution Normal(loc: torch.Size([3]), scale: torch.Size([3])) to satisfy the constraint Real(), but found invalid values:
tensor([nan, nan, nan], device='cuda:0', grad_fn=<PermuteBackward0>)


$\mathcal{L}(y \vert x) = - \log\bigg\{\sum_{k=1}^K \pi_k(x)  \mathcal{N}\big(y \vert \mu_k(x), \Sigma_k(x)\big)\bigg\}$