In [24]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm
from typing import *

tqdm.pandas()

# 1 数据准备

In [44]:
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 
           'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_50k']
label_columns = ["income_50k", "marital_status"]
categorical_columns = ["workclass", "education", "occupation", "relationship", "race", "sex", "native_country"]
numeric_columns = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

In [41]:
train_df = pd.read_csv("data/adult.data", header=None, names=columns)
other_df = pd.read_csv("data/adult.test", header=None, names=columns)
train_df["is_train"] = 1
other_df["is_train"] = 0
data = pd.concat([train_df, other_df], axis=0, ignore_index=True).dropna()
for col in label_columns:
    data[col] = data[col].apply(lambda x: x.strip('. '))

data["label1"] = (data["income_50k"] != '<=50K').astype(int)
data["label2"] = (data["marital_status"] != 'Never-married').astype(int)
data.drop(["income_50k", "marital_status"], axis=1, inplace=True)
print(data["label1"].value_counts())
print(data["label2"].value_counts())

data[:5]

0    37155
1    11687
Name: label1, dtype: int64
1    32725
0    16117
Name: label2, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education_num,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,is_train,label1,label2
0,39,State-gov,77516.0,Bachelors,13.0,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,0,0
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,1,0,1
2,38,Private,215646.0,HS-grad,9.0,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,1,0,1
3,53,Private,234721.0,11th,7.0,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,1,0,1
4,28,Private,338409.0,Bachelors,13.0,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,1,0,1


In [42]:
feature_dict: Dict[str, Tuple[int, int]] = {}

feature_index = 0

# 处理枚举型列
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    feature_dict[col] = (len(le.classes_), feature_index)
    feature_index += 1

# 处理数值型列
mm = MinMaxScaler()
data[numeric_columns] = mm.fit_transform(data[numeric_columns])
for col in numeric_columns:
    feature_dict[col] = (1, feature_index)
    feature_index += 1    

print(feature_dict)
data[:5]

{'workclass': (9, 0), 'education': (16, 1), 'occupation': (15, 2), 'relationship': (6, 3), 'race': (5, 4), 'sex': (2, 5), 'native_country': (42, 6), 'age': (1, 7), 'fnlwgt': (1, 8), 'education_num': (1, 9), 'capital_gain': (1, 10), 'capital_loss': (1, 11), 'hours_per_week': (1, 12)}


Unnamed: 0,age,workclass,fnlwgt,education,education_num,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,is_train,label1,label2
0,0.30137,7,0.044131,9,0.8,1,1,4,1,0.02174,0.0,0.397959,39,1,0,0
1,0.452055,6,0.048052,9,0.8,4,0,4,1,0.0,0.0,0.122449,39,1,0,1
2,0.287671,4,0.137581,11,0.533333,6,1,4,1,0.0,0.0,0.397959,39,1,0,1
3,0.493151,4,0.150486,1,0.4,6,0,2,1,0.0,0.0,0.397959,39,1,0,1
4,0.150685,4,0.220635,9,0.8,10,5,2,0,0.0,0.0,0.397959,5,1,0,1


In [43]:
train_cond = data["is_train"] == 1
train_data, test_data = data[train_cond].copy(), data[~train_cond].copy()
train_data.drop("is_train", axis=1, inplace=True)
test_data.drop("is_train", axis=1, inplace=True)

# 2 训练

In [75]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from model import MMoE, TaskTower

In [99]:
class AdultDataset(Dataset):
    
    def __init__(self, feature_dict: Dict[str, Tuple[int, int]], df: pd.DataFrame):
        columns = [""] * len(feature_dict)
        for feature_name, (_, feature_index) in feature_dict.items():
            columns[feature_index] = feature_name        
        
        self.features = df[columns].values
        self.label1 = df["label1"].values[:, None]
        self.label2 = df["label2"].values[:, None]
    
    def __getitem__(self, index):
        return self.features[index], self.label1[index], self.label2[index]
    
    def __len__(self):
        return len(self.features)

In [100]:
train_dataset = AdultDataset(feature_dict, train_data)
test_dataset = AdultDataset(feature_dict, test_data)

In [101]:
batch_size = 64
lr = 0.01
embedding_length = 64
loss_func = nn.BCEWithLogitsLoss()
epoch = 20

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

mmoe = MMoE(feature_dict=feature_dict, embedding_length=embedding_length, n_experts=3, n_tasks=2)
towers = TaskTower(["income", "marital"], mmoe.units)
model = nn.Sequential(mmoe, towers)
optimizer = Adam(model.parameters(), lr=lr)
device = torch.device("cpu")

In [133]:
def train(model: nn.Module):
    total_loss, count = 0., 0
    y1_trues, y1_preds = [], []
    y2_trues, y2_preds = [], []
    for idx, (X, y1, y2) in enumerate(train_dataloader):
        y1_trues.extend(y1.numpy())
        y2_trues.extend(y2.numpy())
        
        optimizer.zero_grad()
        y1_pred, y2_pred = model(X.to(device))
        loss1 = loss_func(y1_pred, y1.float())
        loss2 = loss_func(y2_pred, y2.float())
        loss = loss1 + loss2
        
        y1_pred = y1_pred.squeeze().cpu().detach().numpy()
        y2_pred = y2_pred.squeeze().cpu().detach().numpy()
        y1_preds.extend(y1_pred)
        y2_preds.extend(y2_pred)
        
        count += 1
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    auc1 = roc_auc_score(y1_trues, y1_preds)
    auc2 = roc_auc_score(y2_trues, y2_preds)

    return total_loss / count, auc1, auc2


def evaluate(model: nn.Module):
    total_loss, count = 0., 0
    y1_trues, y1_preds = [], []
    y2_trues, y2_preds = [], []
    
    with torch.inference_mode():
        model.train(mode=False)
        
        for idx, (X, y1, y2) in enumerate(test_dataloader):
            y1_trues.extend(y1.numpy())
            y2_trues.extend(y2.numpy())

            y1_pred, y2_pred = model(X.to(device))
            loss1 = loss_func(y1_pred, y1.float())
            loss2 = loss_func(y2_pred, y2.float())
            loss = loss1 + loss2

            y1_pred = y1_pred.squeeze().cpu().detach().numpy()
            y2_pred = y2_pred.squeeze().cpu().detach().numpy()
            y1_preds.extend(y1_pred)
            y2_preds.extend(y2_pred)

            count += 1
            total_loss += loss.item()

        auc1 = roc_auc_score(y1_trues, y1_preds)
        auc2 = roc_auc_score(y2_trues, y2_preds)
        
    return total_loss / count, auc1, auc2

In [135]:

model.to(device)

# 验证集的loss没有变小就提前停止
patience = 0
early_stop = 3
prev_eval_loss = float("inf")
checkpoint = {}
best_model_file = "mmoe-best-checkpoint.pt"

for i in range(epoch):
    train_loss, train_auc1, train_auc2 = train(model)
    eval_loss, eval_auc1, eval_auc2 = evaluate(model)
    print(f"""Epoch {i + 1} / {epoch}: Train-loss:{train_loss:.3f}, auc1:{train_auc1:.3f}, auc2:{train_auc2:.3f}   
              Eval-loss:{eval_loss:.3f}, auc1:{eval_auc1:.3f}, auc2:{eval_auc2:.3f}""")
    
    if eval_loss < prev_eval_loss:
        prev_eval_loss = eval_loss
        checkpoint = {"model_state_dict": model.state_dict(), 
                      "optimizer_state_dict": optimizer.state_dict(),
                      "epoch": epoch, 
                      "train_loss": train_loss, 
                      "eval_loss": eval_loss}
        patience = 0

    else:
        if patience < early_stop:
            patience += 1
        else:
            print(f"Eval loss doesn't decrease in {patience} epoch and break training.")
            break
            
if checkpoint:
    checkpoint["epoch"] = epoch
    torch.save(checkpoint, best_model_file)  

Epoch 1 / 20: Train-loss:0.559, auc1:0.901, auc2:0.962   
              Eval-loss:0.585, auc1:0.893, auc2:0.959
Epoch 2 / 20: Train-loss:0.554, auc1:0.902, auc2:0.964   
              Eval-loss:0.578, auc1:0.895, auc2:0.959
Epoch 3 / 20: Train-loss:0.547, auc1:0.905, auc2:0.965   
              Eval-loss:0.587, auc1:0.894, auc2:0.958
Epoch 4 / 20: Train-loss:0.545, auc1:0.906, auc2:0.965   
              Eval-loss:0.593, auc1:0.891, auc2:0.957
Epoch 5 / 20: Train-loss:0.540, auc1:0.907, auc2:0.965   
              Eval-loss:0.595, auc1:0.894, auc2:0.959
Epoch 6 / 20: Train-loss:0.536, auc1:0.909, auc2:0.965   
              Eval-loss:0.585, auc1:0.894, auc2:0.959
Eval loss doesn't decrease in 3 epoch and break training.
