In [1]:
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def add_features(df):
    """Derive model features from a raw podcast-episode frame.

    The input frame is copied first, so the caller's data is never modified
    and the function can safely be re-run on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Publication_Day``, ``Publication_Time``, ``Genre`` and
        ``Episode_Title``. ``Podcast_Name`` and ``Episode_Sentiment`` are
        encoded only when present.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Publication_Day`` mapped to 0-6, the added columns
        ``Time_Period``, ``Is_Weekend``, ``Genre_Time``, ``Title_Length`` and
        ``Has_Special``, the categorical columns integer-encoded, and
        ``Publication_Time`` / ``Episode_Title`` removed.

    Notes
    -----
    A fresh ``LabelEncoder`` is fitted on every call, so the integer codes
    are only comparable between two frames when both contain exactly the same
    set of values in each categorical column.
    """
    # Work on a copy: the assignments below would otherwise overwrite columns
    # of the caller's frame (e.g. remapping Publication_Day a second time
    # turns every value into NaN).
    df = df.copy()

    # Day of week as an ordinal (Monday=0 ... Sunday=6); unknown names -> NaN.
    day_map = {'Monday':0, 'Tuesday':1, 'Wednesday':2, 'Thursday':3,
             'Friday':4, 'Saturday':5, 'Sunday':6}
    df['Publication_Day'] = df['Publication_Day'].map(day_map)

    # Time-of-day category as a numeric code; unknown labels -> NaN.
    time_map = {"Morning":7, "Afternoon":8, "Evening":9, "Night":10}
    df['Time_Period'] = df['Publication_Time'].map(time_map)

    # Interaction features. A NaN day compares False, so it counts as weekday.
    df['Is_Weekend'] = (df['Publication_Day'] >= 5).astype(int)
    df['Genre_Time'] = df['Genre'] + '_' + df['Time_Period'].astype(str)

    # Text features from the episode title.
    df['Title_Length'] = df['Episode_Title'].str.len()
    # na=False: a missing title simply has no keyword. Without it the NaN
    # produced by str.contains makes the integer cast raise.
    df['Has_Special'] = df['Episode_Title'].str.contains('特别版|直播|专访', na=False).astype(int)

    # Integer-encode the categorical columns (missing values become the
    # string produced by astype(str) and get their own code).
    cat_features = ['Podcast_Name', 'Genre', 'Episode_Sentiment', 'Genre_Time']
    for col in cat_features:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

    # Drop the raw columns that have been replaced by derived features.
    return df.drop(columns=['Publication_Time', 'Episode_Title'], errors='ignore')

# Load the competition data (both files are indexed by `id`).
LABEL = 'Listening_Time_minutes'
train_raw = pd.read_csv("../data/playground-series-s5e4/train.csv", index_col='id')
test_raw = pd.read_csv('../data/playground-series-s5e4/test.csv', index_col='id')

# Feature-engineer train and test as ONE frame. add_features fits its label
# encoders on whatever frame it receives, so encoding the two files
# separately can give the same category different integer codes whenever the
# files differ in which values occur. The `keys` level lets us split the
# rows apart again afterwards.
combined = add_features(pd.concat([train_raw, test_raw], keys=['train', 'test']))
train = combined.loc['train']
# The concat gave the test rows an all-NaN label column; remove it again.
test = combined.loc['test'].drop(columns=[LABEL])

# Hold out 20% of the labelled rows for an independent score after training.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Train the AutoGluon ensemble.
# NOTE(review): the hold-out is deliberately not passed as `tuning_data`;
# the 'best_quality' preset validates through internal bagging (the training
# log reports num_bag_folds=8, use_bag_holdout=False) — confirm against the
# AutoGluon docs before changing this.
predictor = TabularPredictor(
    label=LABEL,
    problem_type='regression',
    eval_metric='root_mean_squared_error',
    path='ag_optimized'
).fit(
    train_data=train_data,
    presets='best_quality',
    time_limit=1800,
    hyperparameters={
        'GBM': {'num_boost_round': 300},
        'CAT': {'iterations': 1500},
        'XGB': {'max_depth': 10}
    },
    verbosity=2
)

# Score the fitted predictor on the rows it never saw, so the split above
# actually yields a validation number instead of just discarding data.
val_scores = predictor.evaluate(val_data)
print(f"Hold-out scores: {val_scores}")

# Predict the target for the test set.
test_pred = predictor.predict(test)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          12
Memory Avail:       51.65 GB / 57.48 GB (89.9%)
Disk Space Avail:   3550.29 GB / 6519.49 GB (54.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 450s of the 1800s of rema

In [6]:
# Show the test-set predictions (pandas truncates the repr of a long Series).
test_pred

id
750000    55.327278
750001    18.398293
750002    50.291405
750003    74.953148
750004    46.769905
            ...    
999995    11.029882
999996    58.795971
999997     6.645740
999998    74.079834
999999    57.524734
Name: Listening_Time_minutes, Length: 250000, dtype: float32

In [7]:
# Write the predictions out as the submission file. The frame keeps the
# index of `test_pred`, and to_csv emits that index as the leading column.
# Recorded result: 12.90 (presumably the leaderboard score — confirm).
submission_path = '../output/submission.csv'
submission = pd.DataFrame(data=test_pred)
submission.to_csv(path_or_buf=submission_path)