In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
# 设置全局图表大小
plt.rc('figure', figsize=(12, 8))
plt.rc('axes', titlesize=18)  # 设置轴标题的字体大小
plt.rc('axes', labelsize=14)  # 设置轴标签的字体大小
plt.rc('xtick', labelsize=12)  # 设置x轴刻度标签的字体大小
plt.rc('ytick', labelsize=12)  # 设置y轴刻度标签的字体大小
plt.rc('legend', fontsize=12)  # 设置图例的字体大小

df = pd.read_csv('./data/classify_data.csv')

# 衍生列
add_columns = ['season', 'day_time', 'weekday', 'day_of_year', 'week_of_month', 'week_of_year'
              ,'weekend', 'day', 'minute',  'month', 'hours']

# df.drop(add_columns, axis = 1, inplace = True)
df.info()

# track_to_label = pd.read_csv('./data/track_to_labels_dbscan.csv')
track_to_label = pd.read_csv('./data/track_to_labels_kmeans_2.csv')
track_to_label.info()

df = df.merge(track_to_label, on='track_id', how='left')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361346 entries, 0 to 361345
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   score          361346 non-null  float64
 1   hashtag        361346 non-null  object 
 2   lang           361346 non-null  object 
 3   tweet_lang     361346 non-null  object 
 4   time_zone      361346 non-null  object 
 5   track_id       361346 non-null  object 
 6   timestamp      361346 non-null  object 
 7   season         361346 non-null  object 
 8   day_time       361346 non-null  object 
 9   weekday        361346 non-null  int64  
 10  day_of_year    361346 non-null  int64  
 11  week_of_month  361346 non-null  int64  
 12  week_of_year   361346 non-null  int64  
 13  weekend        361346 non-null  bool   
 14  day            361346 non-null  int64  
 15  minute         361346 non-null  int64  
 16  month          361346 non-null  int64  
 17  hours          361346 non-nul

In [3]:
#对分数score取平方、3次方
df['score_2'] = df['score'] ** 2
df['score_3'] = df['score'] ** 3

# 对情绪分数score进行分箱操作
labels = ['disgust', 'sadness', 'neutral', 'happiness', 'surprise']
bins = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
df['mood'] = pd.cut(df['score'], labels=labels, bins = bins, include_lowest=True)
df['mood'] = df['mood'].astype('category')

df['mood_label'] = df['mood'].cat.codes
print(df[['mood', 'mood_label']])

bins = [0.0, 0.5, 1.0]
df['happy'] = pd.cut(df['score'], labels=[0, 1], bins = bins, include_lowest=True)
df['happy'] = df['happy'].cat.codes

if (df['labels'] >= 2).sum() > 0:
    df.to_csv('./data/final_classify_data_5.csv', index = False)
else:
    df.to_csv('./data/final_classify_data_2.csv', index = False)

df.drop(['timestamp', 'mood'], axis = 1, inplace = True)
df.info()

             mood  mood_label
0       happiness           3
1       happiness           3
2       happiness           3
3       happiness           3
4       happiness           3
...           ...         ...
361341    disgust           0
361342   surprise           4
361343    sadness           1
361344    sadness           1
361345    sadness           1

[361346 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 361346 entries, 0 to 361345
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   score          361346 non-null  float64
 1   hashtag        361346 non-null  object 
 2   lang           361346 non-null  object 
 3   tweet_lang     361346 non-null  object 
 4   time_zone      361346 non-null  object 
 5   track_id       361346 non-null  object 
 6   season         361346 non-null  object 
 7   day_time       361346 non-null  object 
 8   weekday        361346 non-null  int64  
 9   da

In [4]:
# 对几个类别特别多的列采用二进制编码
from category_encoders import BinaryEncoder
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans

# 查看原始数据集的类别分布
print(f"Original class distribution: {Counter(df['labels'])}")

X = df.drop(['labels', 'track_id'], axis = 1)
y = df['labels']

# 不交叉组合
def encoder_category(df):
    columns = ['hashtag', 'tweet_lang', 'time_zone', 'lang']

    # # 特征衍生
    columns += ['day_time', 'season']
    for column in columns:
        if column == 'timestamp':
            continue
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])

    df['weekend'] = df['weekend'].astype(int)
    return df

X = encoder_category(X)
X, y = SMOTE(random_state=42).fit_resample(X, y)
print(f"Resampled class distribution: {Counter(y)}")

Original class distribution: Counter({0: 241581, 1: 119765})
Resampled class distribution: Counter({0: 241581, 1: 241581})


In [5]:
# X.shape, y.shape
# pd.concat([X, y], axis = 1).to_csv('./data/ClusterCentroids_undersample_data_2.csv', index = False)

In [6]:
# import pandas as pd

# df = pd.read_csv('./data/ClusterCentroids_undersample_data_2.csv')
# y = df['labels']
# X = df.drop('labels', axis = 1)
# print(X.info())

In [7]:
import torch
from sklearn.preprocessing import MinMaxScaler
categorical_features = ['hashtag', 'lang', 'tweet_lang', 'time_zone', 'season', 'day_time']
numerical_features = [i for i in X.columns if i not in categorical_features if i != 'timestamp']
print(categorical_features, '\n', numerical_features)

X[numerical_features] = MinMaxScaler().fit_transform(X[numerical_features])
categories = tuple(X.loc[:, categorical_features].nunique())

print(categories)

['hashtag', 'lang', 'tweet_lang', 'time_zone', 'season', 'day_time'] 
 ['score', 'weekday', 'day_of_year', 'week_of_month', 'week_of_year', 'weekend', 'day', 'minute', 'month', 'hours', 'score_2', 'score_3', 'mood_label', 'happy']
(4182, 29, 38, 121, 4, 4)


In [None]:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam,SGD

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(X_train.info())
print(y_test.value_counts())

X_train_cat = X_train[categorical_features]
X_train_con = X_train[numerical_features]
X_test_cat = X_test[categorical_features]
X_test_con = X_test[numerical_features]

# 将数据转换为 PyTorch 张量
X_train_categ = torch.tensor(X_train_cat.values, dtype=torch.long)
X_train_cont = torch.tensor(X_train_con.values, dtype=torch.float32)
X_test_categ = torch.tensor(X_test_cat.values, dtype=torch.long)
X_test_cont = torch.tensor(X_test_con.values, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# 创建数据集和数据加载器
batch_size = 64
train_dataset = TensorDataset(X_train_categ, X_train_cont, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test_categ, X_test_cont, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import logging
from datetime import datetime
# 获取当前时间，并格式化为字符串
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# 使用时间戳生成日志文件名
log_filename = f'./exp/Tabtransformer_{current_time}.log'

# 配置logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s', 
                    filename=log_filename,  # 动态生成的日志文件名
                    filemode='w')

logger = logging.getLogger()
log_filename

In [None]:
# 定义 TabTransformer 模型
model = TabTransformer(
    categories = categories,                # 分类特征的类别数量
    num_continuous = len(numerical_features), # 连续特征数量
    dim = 32,                           # 模型维度
    dim_out = 2,                        # 输出维度（2类分类）
    depth = 6,                          # 模型深度
    heads = 8,                          # 注意力头数量
    attn_dropout = 0.5,                 # 注意力层 dropout
    ff_dropout = 0.5,                   # 前馈层 dropout
    mlp_hidden_mults = (4, 2, 1),          # MLP 隐藏层相对维度
    mlp_act = nn.ReLU(),           # MLP 激活函数
    continuous_mean_std = cont_mean_std # 连续特征的均值和标准差
)

logger.info(model)
model

def init_weights(m):
    if isinstance(m, nn.Linear):
        # 对线性层的权重应用正态分布初始化
        nn.init.normal_(m.weight, mean=0.3, std=0.25)
        # 如果存在偏置项，则将其初始化为0
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Embedding):
        # 对嵌入层也可以应用正态分布初始化
        nn.init.normal_(m.weight, mean=0.3, std=0.25)

model.apply(init_weights)

In [None]:
def show_and_save_fig(train_losses, train_accs, test_losses, test_accs):
    import matplotlib.pyplot as plt
    # 设置全局图表大小
    plt.rc('figure', figsize=(12, 8))
    plt.rc('axes', titlesize=18)  # 设置轴标题的字体大小
    plt.rc('axes', labelsize=14)  # 设置轴标签的字体大小
    plt.rc('xtick', labelsize=12)  # 设置x轴刻度标签的字体大小
    plt.rc('ytick', labelsize=12)  # 设置y轴刻度标签的字体大小
    plt.rc('legend', fontsize=12)  # 设置图例的字体大小

    plt.figure()
    plt.title('Loss and acc of the train process')
    plt.plot(range(num_epochs), train_losses, 'b--', label='train_loss')
    plt.plot(range(num_epochs), [x for x in train_accs], 'g', label = 'train_acc')
    plt.plot(range(num_epochs), test_losses, 'r--', label = 'test_loss')
    plt.plot(range(num_epochs), [x for x in test_accs], 'black', label = 'test_acc')
    plt.xlabel('epoch')
    plt.ylabel('acc / loss')
    plt.legend(loc='best')
    
    save_path = f'./exp/figures/{current_time}'  # 替换为你的路径
    plt.savefig(save_path)
    plt.show()


In [None]:
# 将模型移动到GPU（如果可用）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 转换数据到 GPU
X_train_categ = X_train_categ.to(device)
X_train_cont = X_train_cont.to(device)
y_train_tensor = y_train_tensor.to(device)
X_test_categ = X_test_categ.to(device)
X_test_cont = X_test_cont.to(device)
y_test_tensor = y_test_tensor.to(device)

# 定义优化器
lr = 3e-4
optimizer = Adam(model.parameters(), lr=lr)

# 训练模型
num_epochs = 100
test_losses = []
test_accs = []
train_losses = []
train_accs = []

logger.info("Trainning params:")
logger.info(f"lr: {lr}")
logger.info(f"batch_size: {batch_size}")
logger.info(f"num_epochs: {num_epochs}")          
logger.info("Start trainning......")
for epoch in range(num_epochs):
    model.train()
    logger.info(f"---------------------------------- Epoch {epoch + 1} / Epoch {num_epochs} ----------------------------------")
    total_loss, total = 0.0, 0
    correct, num = 0, 0
    for X_batch_categ, X_batch_cont, y_batch in train_loader:
        X_batch_categ = X_batch_categ.to(device)
        X_batch_cont = X_batch_cont.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch_categ, X_batch_cont)
        loss = F.cross_entropy(logits, y_batch)
        preds = torch.argmax(logits, dim=1)
            
        correct += (preds == y_batch).sum().item()  # 计算正确预测的数量
        total += y_batch.size(0)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    train_acc = correct / total
    train_loss = total_loss / len(train_loader)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    
    # 评估模型
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    s = {0:0, 1:0, 2:0, 3:0, 4:0}
    with torch.no_grad():
        for X_batch_categ, X_batch_cont, y_batch in test_loader:
            X_batch_categ = X_batch_categ.to(device)
            X_batch_cont = X_batch_cont.to(device)
            y_batch = y_batch.to(device)

            test_logits = model(X_batch_categ, X_batch_cont)
            test_loss += F.cross_entropy(test_logits, y_batch, reduction='sum').item()
            preds = torch.argmax(test_logits, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
            
            for i in range(5):
                s[i] += (preds == i).sum()
    
    for i in range(5):
        logger.info(f"测试集预测为{i}的样本数为：{s[i]}")
    test_loss /= total
    accuracy = correct / total
    test_losses.append(test_loss)
    test_accs.append(accuracy)
    logger.info(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc}, Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')
    
logger.info("Trainning finish!")
torch.save(model, f'./models/tabtransformer_2_{accuracy:.2f}.pth')
show_and_save_fig(train_losses, train_accs, test_losses, test_accs)

#### wandb 参数调优

In [9]:
# File path: tab_transformer_classification.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.models import TabTransformerConfig
from tqdm import tqdm as tqdm
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
from pytorch_tabular.utils import print_metrics
import pandas as pd
import wandb
from datetime import datetime
from pprint import pprint
from argparse import Namespace
# 方法2：忽略信号
from signal import signal, SIGPIPE, SIG_DFL, SIG_IGN
signal(SIGPIPE, SIG_IGN)

wandb.login()
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
config = Namespace(
    # 实验参数
    project_name = 'pytorch-tabular-tabtransformer',
    
    # 训练参数
    batch_size = 512,
    lr = 1e-4,
    optim_type = 'Adam',
    epochs = 100,
    ckpt_path = f'./models/tabtransformers_{current_time}',
    
    # 模型参数
    input_embed_dim = 32,
    num_heads = 8,
    num_attn_blocks=6,
    embedding_initialization = 'kaiming_normal',

    # LinearHeadConfig
    layers = '32-64-32',
    initialization = 'kaiming',
    activation = 'ReLU'
)

# wandb.sweep
sweep_config = {
    'method': 'random',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    }
}
sweep_config['parameters'] = {}

# 固定不变的参数
sweep_config['parameters'].update({
    'project_name': {'value': config.project_name},
    'epochs': {'value': 100},
    'ckpt_path': {'value': f'./models/tabtransformer_not_balance_{current_time}.pt'},
})
# 离散型分布的参数
sweep_config['parameters'].update({
    'optim_type': {'values': ['Adam', 'AdamW']},
    'input_embed_dim':{ 'values': [16, 32, 64, 128]},
    'num_heads' : {'values': [4, 8, 16, 32]},
    'num_attn_blocks': { 'values': [2, 4, 6, 8, 10]},
    'embedding_initialization': {'values': ['kaiming_uniform', 'kaiming_normal']},
    'batch_size': {'values': [32, 64, 128, 256, 512, 1024]},
    'activation': {'values': ['ReLU', 'LeakyReLU', 'TanH']},
    'initialization': {'values': ['kaiming', 'xavier', 'random']},
    'layers': {'values': ['32-64-16', '128-64-32-16']}
})
# 连续型参数
sweep_config['parameters'].update({
    'lr':{
        'distribution': 'log_uniform_values',
        'min': 1e-6,
        'max': 1e-4
    },
    'weight_decay':{
        'distribution': 'log_uniform_values',
        'min': 1e-6,
        'max': 1e-3
    },
    'attn_dropout':{
        'distribution': 'uniform',
        'min': 0.2,
        'max': 1.0
    },
    'add_norm_dropout':{
        'distribution': 'uniform',
        'min': 0.2,
        'max': 1.0
    },
    'ff_dropout':{
        'distribution': 'uniform',
        'min': 0.2,
        'max': 1.0
    }
})
pprint(sweep_config)

# 初始化 sweep controller
sweep_id = wandb.sweep(sweep_config, project=config.project_name)

[34m[1mwandb[0m: Currently logged in as: [33myoumengjiuzhui[0m ([33myoumeng[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'val_loss'},
 'parameters': {'activation': {'values': ['ReLU', 'LeakyReLU', 'TanH']},
                'add_norm_dropout': {'distribution': 'uniform',
                                     'max': 1.0,
                                     'min': 0.2},
                'attn_dropout': {'distribution': 'uniform',
                                 'max': 1.0,
                                 'min': 0.2},
                'batch_size': {'values': [32, 64, 128, 256, 512, 1024]},
                'ckpt_path': {'value': './models/tabtransformer_not_balance_2024-06-03_23-50-33.pt'},
                'embedding_initialization': {'values': ['kaiming_uniform',
                                                        'kaiming_normal']},
                'epochs': {'value': 100},
                'ff_dropout': {'distribution': 'uniform',
                               'max': 1.0,
                               'min': 0.2},
                'initializ

In [None]:
from dataclasses import asdict

data = pd.concat([X, y], axis = 1)

target_col = "labels"
categorical_cols = categorical_features
continuous_cols = numerical_features

# Split the combined dataset back into train, validation, and test sets
train, test = train_test_split(data, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(train.shape, val.shape)
def train_once():
    data_config = DataConfig(
        target=[target_col],
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    experiment_config = ExperimentConfig(
        project_name=config.project_name,
        run_name=f"TabTransformer_{current_time}",
        exp_watch="gradients",
        log_target="wandb",
    )
    linear_head_config = LinearHeadConfig(
        layers=config.layers, 
        activation=config.activation,
        dropout=0.3,
        use_batch_norm=True,
        initialization=config.initialization
    )
    model_config = TabTransformerConfig(
        task="classification",
        input_embed_dim=config.input_embed_dim,
        num_heads=config.num_heads,
        num_attn_blocks=config.num_attn_blocks,
        learning_rate=config.lr,
        metrics=["accuracy"],
        attn_dropout=0.3,
        add_norm_dropout=0.3,
        ff_dropout=0.3,
        batch_norm_continuous_input=True,
#         embedding_initialization = config.embedding_initialization,
        head_config = asdict(linear_head_config)
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=config.batch_size,
        max_epochs=config.epochs,
        accelerator='gpu',
        devices=-1,
        load_best=True, 
        track_grad_norm =-1,
        min_epochs = 20,
        early_stopping=None,
   
    )
    optim_config = OptimizerConfig(
        optimizer = config.optim_type,
    )
    # Initialize and train the model
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optim_config,
        trainer_config=trainer_config,
        experiment_config=experiment_config,
        verbose=False,
        suppress_lightning_logger=True,
    )
    with wandb.init() as run:
        tabular_model.fit(train=train, validation=val)
        # Evaluate the model
        result = tabular_model.evaluate(test)
        print(result)
        wandb.save(f"./models/tabtransformer_{current_time}.pth")

# 该agent 随机搜索5次
wandb.agent(sweep_id, train_once, count = 5)

(309223, 21) (77306, 21)


[34m[1mwandb[0m: Agent Starting Run: tkfxqgeg with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	add_norm_dropout: 0.6040465555214707
[34m[1mwandb[0m: 	attn_dropout: 0.8416560693454203
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	ckpt_path: ./models/tabtransformer_not_balance_2024-06-03_23-50-33.pt
[34m[1mwandb[0m: 	embedding_initialization: kaiming_normal
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	ff_dropout: 0.7659779525471568
[34m[1mwandb[0m: 	initialization: xavier
[34m[1mwandb[0m: 	input_embed_dim: 64
[34m[1mwandb[0m: 	layers: 32-64-16
[34m[1mwandb[0m: 	lr: 3.875949155698934e-05
[34m[1mwandb[0m: 	num_attn_blocks: 2
[34m[1mwandb[0m: 	num_heads: 32
[34m[1mwandb[0m: 	optim_type: Adam
[34m[1mwandb[0m: 	project_name: pytorch-tabular-tabtransformer
[34m[1mwandb[0m: 	weight_decay: 3.2556295391881365e-06


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]



Output()