In [2]:
from data_process import *
from MMoE import MMOE
import logging,sys

In [3]:
BATCH_SIZE = 4096
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
out_f = './mmoe_biClass_regression_train_log.txt'
logging.basicConfig(filename= out_f , level=logging.INFO, filemode='a',
                        format='[%(asctime)s.%(msecs)03d] %(message)s', datefmt='%H:%M:%S')
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

In [3]:
# 加载数据
dataset, watch_label, share_label = load_train_datas()

-- Mem. usage decreased to 1928.41 Mb (65.5% reduction),time spend:0.45 min


In [4]:
# 将分类label 变为回归label
watch_label_ratio = watch_label / 10

In [5]:
# 划分数据集，得到训练集和验证集
train_data, train_label, validation_data, validation_label = split_dataset(dataset, watch_label_ratio, share_label)

In [23]:
# 归一化数据
train_data_QT = QuantileTransformer(output_distribution='uniform').fit_transform(train_data)
validation_data_QT = QuantileTransformer(output_distribution='uniform').fit_transform(validation_data)

In [26]:
# 拼接两个label，封装DataLoader
train_label_tmp = np.column_stack([train_label[0],train_label[1]])
train_loader = DataLoader(dataset=getTensorDataset(train_data_QT, train_label_tmp), batch_size=BATCH_SIZE)

validation_label_tmp = np.column_stack([validation_label[0], validation_label[1]])
val_loader = DataLoader(dataset=getTensorDataset(validation_data_QT, validation_label_tmp), batch_size=BATCH_SIZE)

In [81]:
# 定义模型
model = MMOE(input_size=128, num_experts=12, experts_out=16, experts_hidden=16, towers_hidden=12, tasks=2)
model = model.to(device)

In [None]:
# 训练

# Sets hyper-parameters
lr = 1e-4
n_epochs = 10
tasks = 11
patience = 5
# BATCH_SIZE=4096

# # Defines loss function and optimizer
loss_wh_fn = nn.MSELoss(reduction='mean')
loss_sh_fn = nn.BCELoss(reduction='mean')
early_stopping = EarlyStopping(patience, verbose=True)
optimizer = optim.Adam(model.parameters(), lr=lr)

losses = []
val_losses = []

watch_auc = []
share_auc = []
sum_auc = []

# Training loop
for epoch in range(n_epochs):
    model.train()

    # Uses loader to fetch one mini-batch for training
    epoch_loss = []
    c = 0
    print("\nEpoch: {}/{}".format(epoch, n_epochs)) 
    for x_batch, y_batch in train_loader:

        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # One stpe of training
        yhat = model(x_batch.float())
        
        wh_loss = loss_wh_fn(yhat[:,0].float(), y_batch[:, 0].view(-1, 1).float())
        sh_loss = loss_sh_fn(yhat[:,1].float(), y_batch[:, 1].view(-1, 1).float())        
        loss = wh_loss/2 + sh_loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())
        c += 1
        if c % 100 == 0:     
            mesg = f'[Batch: {c}]' + \
                     f'train_loss: {np.mean(loss.item()):.5f} '
            logging.info(mesg)
    losses.append(np.mean(epoch_loss))

    # After finishing training steps for all mini-batches,
    # it is time for evaluation!

    # We tell PyTorch to NOT use autograd...
    with torch.no_grad():
        # Uses loader to fetch one mini-batch for validation
        epoch_loss = []
        epoch_watch_auc = []
        epoch_share_auc = []
        epoch_sum_auc = []
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            x_val = x_val.to(device)
            y_val = y_val.to(device)

            model.eval()
            # Makes predictions
            yhat = model(x_val.float()) # len=11, 一组batch的预测值

            # Computes validation loss
            wh_loss = loss_wh_fn(yhat[:,0].float(), y_val[:, 0].view(-1, 1).float())
            sh_loss = loss_sh_fn(yhat[:,1].float(), y_val[:, 1].view(-1, 1).float())        
            loss = wh_loss/2 + sh_loss

            epoch_loss.append(loss.item())

    val_losses.append(np.mean(epoch_loss))

    epoch_len = len(str(n_epochs))
    mesg = f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' + \
                     f'train_loss: {losses[-1]:.5f} ' + \
                     f'valid_loss: {val_losses[-1]:.5f}'
    logging.info(mesg)

    # ************* Early Stopping ****************
    # early_stopping needs the validation loss to check if it has decresed, 
    # and if it has, it will make a checkpoint of the current model
    early_stopping(val_losses[-1], model)

    if early_stopping.early_stop:
        print("Early stopping")
        break

# print(model.state_dict())
print("loss: ", np.mean(losses))
print("val_loss: ", np.mean(val_losses))


Epoch: 0/10
[Batch: 100]train_loss: 0.60099 
[Batch: 200]train_loss: 0.47077 
[Batch: 300]train_loss: 0.32119 
[Batch: 400]train_loss: 0.22096 
[Batch: 500]train_loss: 0.20755 
[Batch: 600]train_loss: 0.13001 
[Batch: 700]train_loss: 0.11468 
[Batch: 800]train_loss: 0.09456 
[Batch: 900]train_loss: 0.08929 
[Batch: 1000]train_loss: 0.10000 
[Batch: 1100]train_loss: 0.06370 
[Batch: 1200]train_loss: 0.06945 
[Batch: 1300]train_loss: 0.07304 
[Batch: 1400]train_loss: 0.06256 
[Batch: 1500]train_loss: 0.06865 
[ 0/10] train_loss: 0.19900 valid_loss: 0.05185
Validation loss decreased (inf --> 0.051847).  Saving model ...

Epoch: 1/10
[Batch: 100]train_loss: 0.07526 
[Batch: 200]train_loss: 0.06954 
[Batch: 300]train_loss: 0.05283 
[Batch: 400]train_loss: 0.05349 
[Batch: 500]train_loss: 0.06741 
[Batch: 600]train_loss: 0.05406 
[Batch: 700]train_loss: 0.07566 
[Batch: 800]train_loss: 0.06067 
[Batch: 900]train_loss: 0.06648 
[Batch: 1000]train_loss: 0.07285 
[Batch: 1100]train_loss: 0.050

In [None]:
epochs = range(1, len(losses)+1)
plt.plot(epochs, losses, 'g', label='Training loss')
plt.plot(epochs, val_losses, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [4]:
model = torch.load("./finish_model.pkl")

In [5]:
model = model.to(device)

In [6]:
test_data = load_test_datas()

-- Mem. usage decreased to 820.89 Mb (65.7% reduction),time spend:0.20 min


In [7]:
# 将测试数据组织为 DaTaloader
test_loader = DataLoader(dataset=torch.utils.data.TensorDataset(torch.tensor(test_data.astype(float).to_numpy())), batch_size=4096)

In [8]:
def predict():
    # 预测
    watch_pred = []
    share_pred = []
    with torch.no_grad():
        # Uses loader to fetch one mini-batch for testing
        for x_test in test_loader:
            # Again, sends data to same device as model
            x_test = x_test[0]
            x_test = x_test.to(device)

            model.eval()
            # Makes predictions
            yhat = model(x_test.float())
#             print(yhat)
#             print(yhat.shape)
            yhat = yhat.squeeze(2)
            yhat = yhat.view(-1, 2)
#             print(yhat)
            yhat_watch = yhat[:, 0]
#             print(yhat_watch.shape)
            yhat_share = yhat[:, 1]
#             print(yhat_share.shape)
            
            
            # save
            watch_pred.append(yhat_watch)
            share_pred.append(yhat_share)
    return watch_pred, share_pred

In [9]:
wh_pred, sh_pred = predict()

In [10]:
wh_preds = torch.cat(wh_pred, axis = 0)

In [11]:
wh_preds = wh_preds.cpu()

In [13]:
res = np.rint(wh_preds.numpy() * 10)

In [14]:
Counter(res)

Counter({0.0: 2769536,
         10.0: 28528,
         9.0: 5948,
         6.0: 2319,
         7.0: 2549,
         4.0: 4785,
         8.0: 3463,
         2.0: 912,
         5.0: 2075,
         1.0: 1344,
         3.0: 721})

In [15]:
sh_preds = torch.cat(sh_pred, axis = 0)

In [21]:
sh_preds = np.rint(sh_preds.cpu().numpy())

In [22]:
Counter(sh_preds)

Counter({0.0: 2764068, 1.0: 58112})