In [27]:
import numpy as np
import torch
import os
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import random
import matplotlib.pyplot as plt
import pandas as pd
import datatable as dt
from time import time
from collections import Counter

from tools import *

random.seed(42)
np.random.seed(42)
SEED = 42
BATCH_SIZE = 1024

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [29]:
time()

1629654956.0024886

In [30]:
# 节约内存的一个标配函数
def reduce_mem(df):
    starttime = time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time()-starttime)/60))
    return df

In [9]:
def confusion_matrix(label, predict, n):
    """
    计算混淆矩阵
    :param label: 标签，np.array类型。形状可以是(n_sample,) 或者 (n_sample, n_classes)，当为第二种形状时可以表示多标签分类的情况
    :param predict: 预测值，与 `label` 同理
    :param n: 类别数目
    :return: 混淆矩阵，np.array类型。shape 为 (n, n)。$cm_{ij}$表示真实标签为 $i$，预测标签为 $j$ 的样本个数
    """
    k = (label >= 0) & (label < n)
    # bincount()函数用于统计数组内每个非负整数的个数
    # 详见 https://docs.scipy.org/doc/numpy/reference/generated/numpy.bincount.html
    return np.bincount(n * label[k].astype(int) + predict[k], minlength=n ** 2).reshape(n, n)


def auc(y, p, classes):
    """
    给定真实标签和预测标签，计算每个类别的auc值。实际只算出了roc曲线上一个点，即一个(fpr, tpr)，再并上(0, 0)和(1, 1)来计算auc
    :param y: 标签，np.array类型
    :param p: 预测标签，np.array类型
    :param classes: 类别，list-like，表示有哪些类别
    """
    p = p.cpu()
    all_aucs = np.zeros(len(classes))
    for i, c in enumerate(classes):
        _y = np.zeros_like(y)
        _y[y==c] = 1
        _y[y!=c] = 0
        _p = np.zeros_like(p)
        _p[p==c] = 1
        _p[p!=c] = 0
#         print(_y, _p)
        cm = confusion_matrix(_y, _p, 2)
#         print(cm)
        tpr = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) if (cm[0, 0] + cm[0, 1]) != 0 else 0
        fpr = (cm[1, 0] / (cm[1, 0] + cm[1, 1])) if (cm[1, 0] + cm[1, 1]) != 0 else 0
        tpr = [0, tpr, 1]
        fpr = [0, fpr, 1]
        auc = metrics.auc(fpr, tpr)
        all_aucs[i] = auc
        if _y.sum() == 0 or _p.sum() == 0:
            all_aucs[i] = 0
    return all_aucs

In [None]:
y = np.random.randint(0, 10, 100)
p = np.random.randint(0, 10, 100)

classes = list(range(10))
weights = np.arange(0, 1, 0.1)
all_aucs = auc(y, p, classes)

weighted_auc = (all_aucs * weights).sum()
print(f"{all_aucs}\n{weighted_auc}")

classes = list(range(2))
y = np.array([0, 0, 1, 1])
p = np.array([0, 1, 0, 1])
all_aucs = auc(y, p, classes)

print(f"{all_aucs}")

# 导入数据集

In [12]:
base_dir = "../2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

## 处理训练数据
可在此做一些预处理：
- 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
- 删除多余的列
- 调整列的顺序
- 改变列的数据类型


- 加载训练数据

In [13]:
with_status = False
if with_status:
    user_features_name = "user_features_with_status"
    video_features_name = "video_features_with_status"
else:
    user_features_name = "user_features"
    video_features_name = "video_features"
    
p_user = os.path.join(train_data_dir, f"user_features_data/{user_features_name}.jay")
p_video = os.path.join(train_data_dir, f"video_features_data/{video_features_name}.jay")

In [16]:
%%time
## 使用datatable 加载训练数据
p_act = os.path.join(train_data_dir, "all_actions_with_status.jay")

df_train, others = load_train_test_data(None, pre_merged=False, return_others=True,
                           **{"p_user": p_user, "p_video": p_video, "p_action": p_act})
user_df = others['user']
video_df = others['video']
action_df = others['action']
df_train.shape

CPU times: user 28.7 s, sys: 1.39 s, total: 30.1 s
Wall time: 1.6 s


(7353024, 133)

In [17]:
%%time
# p_user = os.path.join(train_data_dir, "user_features_data/user_features.jay")
# p_video = os.path.join(train_data_dir, "video_features_data/video_features.jay")
p_act = os.path.join(test_data_dir, "test_with_status.jay")

#path = os.path.join(test_data_dir, "test.jay")
kwargs = {"p_user": p_user, "p_video": p_video, "p_action": p_act}

df_test, others = load_train_test_data(None, pre_merged=False, return_others=True, **kwargs)
test_df = others['action']
df_test.shape

CPU times: user 21.1 s, sys: 1.53 s, total: 22.6 s
Wall time: 1.17 s


(2822180, 130)

In [18]:
%%time
if isinstance(df_train, dt.Frame):
    df_train = df_train.to_pandas()
if isinstance(df_test, dt.Frame):
    df_test = df_test.to_pandas()

CPU times: user 30.9 s, sys: 5.1 s, total: 36 s
Wall time: 7.84 s


In [31]:
df_train = reduce_mem(df_train)

-- Mem. usage decreased to 1872.31 Mb (67.6% reduction),time spend:0.73 min


In [41]:
df_train = df_train[df_train['is_watch'] == 1]

In [32]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Columns: 133 entries, user_id to da_4
dtypes: float16(97), float32(6), int16(6), int32(2), int8(21), object(1)
memory usage: 1.8+ GB


In [36]:
# 删除 video_name、is_watch 列
df_train.drop(['video_name','is_watch'], axis=1, inplace=True)

In [37]:
# 删除 video_id、user_id列
df_train.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [38]:
df_train

Unnamed: 0,is_share,watch_label,v_avg_watch_label_1,v_sum_watch_times_1,v_sum_watch_overs_1,v_sum_comment_times_1,v_sum_collect_times_1,v_sum_share_times_1,v_sum_quit_times_1,v_sum_skip_times_1,...,class_5,class_6,class_7,class_8,class_9,da_0,da_1,da_2,da_3,da_4
0,0,2,0.601562,31325.0,266.0,4.0,161.0,41.0,22520.0,0.0,...,0.304199,0.037476,0.037476,0.395996,0.037476,0.075867,0.075989,0.388672,0.383789,0.075684
1,0,0,1.643555,1937.0,47.0,4.0,23.0,3.0,888.0,0.0,...,0.041443,0.041443,0.041443,0.041443,0.626953,0.084351,0.361816,0.082886,0.082886,0.387939
2,0,5,0.686523,14038.0,541.0,11.0,321.0,68.0,11806.0,0.0,...,0.041779,0.350098,0.041779,0.041779,0.041779,0.324951,0.083557,0.083557,0.083557,0.424316
3,0,0,1.949219,7902.0,965.0,12.0,62.0,15.0,4732.0,0.0,...,0.034271,0.034302,0.034271,0.034271,0.034271,0.504395,0.072327,0.070557,0.069031,0.283936
4,0,4,2.929688,344.0,54.0,1.0,5.0,0.0,133.0,0.0,...,0.037964,0.037933,0.037933,0.037933,0.037933,0.076294,0.391846,0.077209,0.075867,0.378662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7353019,0,0,2.705078,68.0,9.0,0.0,0.0,0.0,32.0,0.0,...,0.041534,0.041534,0.041534,0.625977,0.041534,0.346680,0.083069,0.083069,0.404053,0.083069
7353020,0,0,1.811523,468.0,55.0,0.0,3.0,0.0,282.0,0.0,...,0.041931,0.041931,0.041962,0.041962,0.041931,0.313232,0.083862,0.435059,0.083862,0.083862
7353021,0,8,1.539062,486.0,51.0,0.0,5.0,3.0,342.0,0.0,...,0.626953,0.041443,0.041443,0.041443,0.041443,0.083313,0.083923,0.666016,0.082886,0.083618
7353022,0,0,1.105469,69526.0,4676.0,60.0,658.0,185.0,53652.0,0.0,...,0.034271,0.034302,0.034271,0.034271,0.034271,0.504395,0.072327,0.070557,0.069031,0.283936


In [39]:
dataset = df_train
dataset.shape

(7353024, 129)

In [40]:
# 准备数据
watch_label = dataset.pop('watch_label').astype(np.uint8)
is_share = dataset.pop('is_share').astype(np.uint8)
watch_label.shape, is_share.shape, dataset.shape

((7353024,), (7353024,), (7353024, 127))

## 处理测试数据

- 加载测试数据

In [41]:
# 拼接好的测试数据集
df_test = reduce_mem(df_test)

-- Mem. usage decreased to 807.43 Mb (65.6% reduction),time spend:0.33 min


In [42]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Columns: 130 entries, user_id to da_4
dtypes: float16(120), float32(3), int32(2), object(5)
memory usage: 807.4+ MB


In [43]:
# 删除 video_name 列
if 'video_name' in df_test.columns:
    df_test.drop('video_name', axis=1, inplace=True)

In [44]:
# 删除 video_id、user_id 列
df_test.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [45]:
df_test

Unnamed: 0,v_avg_watch_label_1,v_sum_watch_times_1,v_sum_watch_overs_1,v_sum_comment_times_1,v_sum_collect_times_1,v_sum_share_times_1,v_sum_quit_times_1,v_sum_skip_times_1,v_sum_watch_days_1,v_avg_watch_label_3,...,class_5,class_6,class_7,class_8,class_9,da_0,da_1,da_2,da_3,da_4
0,1.268555,368.0,12.0,0.0,8.0,2.0,216.0,0.0,1.0,1.337891,...,0.044250,0.044250,0.601562,0.044250,0.044250,0.089783,0.089844,0.220947,0.089905,0.509766
1,1.086914,14472.0,974.0,16.0,162.0,27.0,11288.0,0.0,1.0,1.148438,...,0.037079,0.037079,0.037079,0.037079,0.037079,0.224243,0.074158,0.553223,0.074158,0.074158
2,3.615234,13.0,5.0,0.0,0.0,0.0,6.0,0.0,1.0,2.539062,...,0.041443,0.345947,0.041443,0.041443,0.041443,0.084351,0.362305,0.082886,0.387451,0.082886
3,0.767090,13680.0,188.0,3.0,107.0,22.0,9232.0,0.0,1.0,0.674316,...,0.304199,0.037476,0.037476,0.395996,0.037476,0.075867,0.075989,0.388672,0.383789,0.075684
4,1.000000,5.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,1.432617,...,0.622559,0.041931,0.041931,0.041931,0.041931,0.084717,0.084839,0.662109,0.084106,0.084473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2822175,2.230469,26.0,6.0,0.0,0.0,0.0,17.0,0.0,1.0,2.056641,...,0.036896,0.036896,0.036896,0.036896,0.269531,0.471924,0.073792,0.073792,0.074219,0.306396
2822176,2.089844,6460.0,822.0,10.0,47.0,18.0,3708.0,0.0,1.0,1.958984,...,0.034271,0.034302,0.034271,0.034271,0.034271,0.504395,0.072327,0.070557,0.069031,0.283936
2822177,3.445312,45.0,8.0,0.0,0.0,0.0,15.0,0.0,1.0,2.970703,...,0.037445,0.037445,0.037445,0.663086,0.037445,0.349854,0.075439,0.077454,0.422363,0.074890
2822178,1.420898,107.0,9.0,0.0,1.0,0.0,72.0,0.0,1.0,1.768555,...,0.036682,0.458008,0.036682,0.036682,0.248535,0.073364,0.073364,0.073364,0.706543,0.073364


In [17]:
# 填充缺失值
df_test.fillna(value=0, inplace=True)

In [46]:
# 查看train和test维度是否相同
df_train.shape[1] == df_test.shape[1]

True

# ?

In [None]:
# 测试数据
X_train_263 = dataset[:100].values
y_train = target[:100]
X_test_263 = df_test[:100].values
X_train_263.shape #最终一种263个特征

In [32]:
# 全量数据
X_train_263 = data.values
y_train = target
X_test_263 = df_test.values
X_train_263.shape #最终一种263个特征

(2395469, 128)

In [33]:
X_train_263.shape, y_train.shape, X_test_263.shape

((2395469, 128), (2395469,), (2822180, 128))

# 模型

In [47]:
dataset.shape

(7353024, 127)

In [48]:
watch_label.shape

(7353024,)

In [49]:
is_share.shape

(7353024,)

In [50]:
def to_categorical(y, num_classes=None, dtype='float32'):
    """
    From keras sorucecode: https://github.com/keras-team/keras/blob/master/keras/utils/np_utils.py#L9
    """

    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype=dtype)
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

In [51]:
def split_dataset(dataset, val_ratio=0.2):
    # 将watch 转为one-hot
    transformed_watch_label = to_categorical(watch_label, num_classes=10, dtype=int)
    
    # 将train划分为 train、validation. validation占20%。
    validation_indices = dataset.sample(frac=val_ratio, replace=False, random_state=SEED).index
    validation_data = dataset.iloc[validation_indices]
    validation_label = [transformed_watch_label[validation_indices], is_share[validation_indices]] #key: income, marital.

    train_indices = list(set(dataset.index) - set(validation_indices))
    train_data = dataset.iloc[train_indices]
    train_label = [transformed_watch_label[train_indices], is_share[train_indices]]
    
    return train_data, train_label, validation_data, validation_label

In [52]:
train_data, train_label, validation_data, validation_label = split_dataset(dataset[:100000])

In [53]:
def getTensorDataset(my_x, my_y):
    tensor_x = torch.tensor(my_x)
    tensor_y = torch.tensor(my_y)
    return torch.utils.data.TensorDataset(tensor_x, tensor_y)

# 拼接两个label
train_label_tmp = np.column_stack([train_label[0],train_label[1]])
train_loader = DataLoader(dataset=getTensorDataset(train_data.to_numpy(), train_label_tmp), batch_size=BATCH_SIZE)

validation_label_tmp = np.column_stack([validation_label[0], validation_label[1]])
val_loader = DataLoader(dataset=getTensorDataset(validation_data.to_numpy(), validation_label_tmp), batch_size=BATCH_SIZE)

In [54]:
train_label_tmp.shape

(80000, 11)

### Define the model

In [55]:
class Expert(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Expert, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        # self.log_soft = nn.LogSoftmax(1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        # out = self.log_soft(out)
        return out
    
class Tower(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Tower, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        # self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        out = self.sigmoid(out)
        return out

In [56]:
class MMOE(torch.nn.Module):
    def __init__(self, input_size, num_experts, experts_out, experts_hidden, towers_hidden, tasks):
        super(MMOE, self).__init__()
        # params
        self.input_size = input_size
        self.num_experts = num_experts
        self.experts_out = experts_out
        self.experts_hidden = experts_hidden
        self.towers_hidden = towers_hidden
        self.tasks = tasks
        # row by row
        self.softmax = nn.Softmax(dim=1)
        # model
        self.experts = nn.ModuleList([Expert(self.input_size, self.experts_out, self.experts_hidden) for i in range(self.num_experts)])
        self.w_gates = nn.ParameterList([nn.Parameter(torch.randn(input_size, num_experts), requires_grad=True) for i in range(self.tasks)])
        self.towers = nn.ModuleList([Tower(self.experts_out, 1, self.towers_hidden) for i in range(self.tasks)])

    def forward(self, x):
        # get the experts output
        print("x: ", x.shape)
        expers_o = [e(x) for e in self.experts]
        expers_o_tensor = torch.stack(expers_o)
        
        print("expers_o_tensor ", expers_o_tensor.shape)
        # get the gates output
        # x @ g 矩阵整体乘法。
        
#         print(x @ self.w_gates[0])
#         print(self.softmax(x @ self.w_gates[0]))
        gates_o = [self.softmax(x @ g) for g in self.w_gates]
#         print(gates_o)
#         print("gates_o ", len(gates_o))
        
        
        # multiply the output of the experts with the corresponding gates output
        # res = gates_o[0].t().unsqueeze(2).expand(-1, -1, self.experts_out) * expers_o_tensor
        # https://discuss.pytorch.org/t/element-wise-multiplication-of-the-last-dimension/79534
        
        towers_input = [g.t().unsqueeze(2).expand(-1, -1, self.experts_out) * expers_o_tensor for g in gates_o]
        towers_input = [torch.sum(ti, dim=0) for ti in towers_input]
        print("towers_input : ", len(towers_input))
        
        # get the final output from the towers
        final_output = [t(ti) for t, ti in zip(self.towers, towers_input)]
        
        # get the output of the towers, and stack them
        final_output = torch.stack(final_output, dim=1)
        print(final_output.shape)
        return final_output

In [418]:
# simple_input = torch.tensor([[10.0, 10.0, 15.0, 30.0, 18.0], [20.0, 50.0, 28.0, 22.0, 12.0], [20.0, 50.0, 28.0, 22.0, 12.0]])
# mmoe = MMOE(input_size=5, num_experts=3, experts_out=4, experts_hidden=2, towers_hidden=2, tasks=2)

In [419]:
# mmoe(simple_input)

In [62]:
input_size = train_data.shape[1]
model = MMOE(input_size=input_size, num_experts=12, experts_out=32, experts_hidden=32, towers_hidden=8, tasks=11)
model = model.to(device)
# print(model.state_dict())

# 模型预览

In [61]:
from torchsummary import summary

In [354]:
summary(model, input_size=[(128,)], batch_size=1024)

x:  torch.Size([2, 128])
expers_o_tensor  torch.Size([12, 2, 32])
towers_input :  11
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [1024, 32]           4,128
              ReLU-2                 [1024, 32]               0
           Dropout-3                 [1024, 32]               0
            Linear-4                 [1024, 32]           1,056
            Expert-5                 [1024, 32]               0
            Linear-6                 [1024, 32]           4,128
              ReLU-7                 [1024, 32]               0
           Dropout-8                 [1024, 32]               0
            Linear-9                 [1024, 32]           1,056
           Expert-10                 [1024, 32]               0
           Linear-11                 [1024, 32]           4,128
             ReLU-12                 [1024, 32]               0
          Dropout-

In [448]:
simple_input = torch.Tensor(2,128).to(device)

In [449]:
res = model(simple_input)

x:  torch.Size([2, 128])
expers_o_tensor  torch.Size([12, 2, 32])
towers_input :  11
torch.Size([2, 11, 1])


In [381]:
len(res)

11

In [316]:
torch.onnx.export(model, simple_input,'MMoE-DouLoss.onnx')

### Define the training part

In [None]:
# Sets hyper-parameters
lr = 1e-4
n_epochs = 50
tasks = 11

# # Defines loss function and optimizer
# loss_fn_watch = nn.CrossEntropyLoss(reduction='mean')
loss_fn = nn.BCELoss(reduction='mean')

# loss_fn_watch = nn.MSELoss(reduction='mean')
# loss_fn_share = nn.BCELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=lr)

losses = []
val_losses = []

watch_auc = []
share_auc = []
sum_auc = []

# Training loop
for epoch in range(n_epochs):
    model.train()
    # Uses loader to fetch one mini-batch for training
    epoch_loss = []
    c = 0
    print("Epoch: {}/{}".format(epoch, n_epochs)) 
    for x_batch, y_batch in train_loader:
        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        # One stpe of training
        yhat = model(x_batch.float())
        
        # loss = loss_fn(yhat, y_batch)      
        loss = 0
        for i in range(tasks):
            loss += loss_fn(yhat[:,i].float(), y_batch[:, i].view(-1, 1).float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss.append(loss.item())
        if c % 50 == 0:
            print("    Batch: {}/{}".format(c, int(len(train_data)/BATCH_SIZE)))
        c += 1
    losses.append(np.mean(epoch_loss))
        
    # After finishing training steps for all mini-batches,
    # it is time for evaluation!
        
    # We tell PyTorch to NOT use autograd...
    with torch.no_grad():
        # Uses loader to fetch one mini-batch for validation
        epoch_loss = []
        epoch_watch_auc = []
        epoch_share_auc = []
        epoch_sum_auc = []
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            x_val = x_val.to(device)
            y_val = y_val.to(device)
#             print(x_val.shape,y_val.shape )
            
            
            model.eval()
            # Makes predictions
            yhat = model(x_val.float()) # len=11, 一组batch的预测值
            
            # Computes validation loss
            loss = 0
            for i in range(tasks):
                loss += loss_fn(yhat[:, i].float(), y_val[:, i].view(-1, 1).float())
            epoch_loss.append(loss.item())
            
            
            # label preds [0,1,2,..]
#             y_v = y_val.cpu() 
#             for i in range(len(y_v)):
#                 print("yhat : ", yh.shape)
#                 print("y_val: ", y_v.shape)
#                 epoch_watch_auc.append(auc(y_v[i][:10], yh[:10], np.arange(10)))
#                 epoch_share_auc.append(auc(y_v[i][-1], yh[-1][i], [0,1]))
#                 epoch_sum_auc.append(watch_auc * 0.7 + share_auc * 0.3)
             
        
#     watch_auc.append(np.mean(epoch_watch_auc))
#     share_auc.append(np.mean(epoch_share_auc))
#     sum_auc.append(np.mean(epoch_sum_auc))
    val_losses.append(np.mean(epoch_loss))

# print(model.state_dict())
print(np.mean(losses))
print(np.mean(val_losses))

Epoch: 0/50
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
    Batch: 0/78
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o_tensor  torch.Size([12, 1024, 32])
towers_input :  11
torch.Size([1024, 11, 1])
x:  torch.Size([1024, 127])
expers_o

In [None]:
epochs = range(1, n_epochs+1)
plt.plot(epochs, losses, 'g', label='Training loss')
plt.plot(epochs, val_losses, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [190]:
torch.save(model, "MMoE.pth")

### Testing the model

In [None]:
test_label_tmp = np.column_stack((np.argmax(test_label[0], axis=1), np.argmax(test_label[1], axis=1)))
test_loader = DataLoader(dataset=getTensorDataset(test_data.to_numpy(), test_label_tmp), batch_size=BATCH_SIZE)

In [None]:
t1_pred = []
t2_pred = []
t1_target = []
t2_target = []

# We tell PyTorch to NOT use autograd...
with torch.no_grad():
    # Uses loader to fetch one mini-batch for testing
    epoch_loss = []
    for x_test, y_test in test_loader:
        # Again, sends data to same device as model
        x_test = x_test.to(device)
        y_test = y_test.to(device)

        model.eval()
        # Makes predictions
        yhat = model(x_test)

        y_test_t1, y_test_t2 = y_test[:, 0], y_test[:, 1]
        yhat_t1, yhat_t2 = yhat[0], yhat[1]

        loss_t1 = loss_fn(yhat_t1, y_test_t1.view(-1, 1))
        loss_t2 = loss_fn(yhat_t2, y_test_t2.view(-1, 1))
        loss = loss_t1 + loss_t2
        
        # predict
        t1_hat = yhat_t1.view(-1) > 0.5
        t2_hat = yhat_t2.view(-1) > 0.5
        
        # save
        t1_pred.append(t1_hat)
        t2_pred.append(t2_hat)
        t1_target.append(y_test_t1)
        t2_target.append(y_test_t2)
        

        epoch_loss.append(loss.item())
print(np.mean(epoch_loss))

0.5134528784119353


In [None]:
t1_pred = torch.cat(t1_pred)
t2_pred = torch.cat(t2_pred)
t1_target = torch.cat(t1_target)
t2_target = torch.cat(t2_target)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(t1_target.cpu().numpy(), t1_pred.cpu().numpy())

array([[46694,   103],
       [ 2486,   598]])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(t1_target.cpu().numpy(), t1_pred.cpu().numpy()))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97     46797
         1.0       0.85      0.19      0.32      3084

    accuracy                           0.95     49881
   macro avg       0.90      0.60      0.64     49881
weighted avg       0.94      0.95      0.93     49881



In [None]:
from sklearn.metrics import classification_report

print(classification_report(t2_target.cpu().numpy(), t2_pred.cpu().numpy()))

              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90     28284
         1.0       0.86      0.89      0.87     21597

    accuracy                           0.89     49881
   macro avg       0.89      0.89      0.89     49881
weighted avg       0.89      0.89      0.89     49881



### Testing example for the loss function 

In [None]:
loss = nn.CrossEntropyLoss()
input = torch.tensor([[[0.1, 0.5], [0.1, 0.5]], [[0.1, 0.5], [0.3, 0.5]], [[0.1, 0.5], [1.0, 1.0]]])
target = torch.tensor([[0, 0], [1, 1], [1, 1]])
output = loss(input, target)
output

tensor(0.5821)