In [4]:
from pathlib import Path

import autogluon
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def add_features(df, category_maps=None):
    """Derive model features from a raw podcast-episode frame.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Publication_Time', 'Publication_Day', 'Genre' and
        'Episode_Title'. 'Podcast_Name' and 'Episode_Sentiment' are encoded
        when present.
    category_maps : dict, optional
        Mutable ``{column: {category_string: code}}`` store. When given, the
        mapping learned for a column on the first call is saved into it and
        reused on later calls, so several frames (e.g. train and test) share
        identical integer codes; categories absent from a stored mapping are
        encoded as -1. When omitted, codes are derived from this frame alone
        (sorted unique string values -> 0..n-1), which means two frames
        encoded separately are only comparable if they contain exactly the
        same set of categories.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added, the categorical
        columns integer-encoded, and 'Publication_Time' / 'Episode_Title'
        removed.
    """
    # Work on a copy so the caller's frame is never modified.
    out = df.copy()

    # Time-of-day category -> ordinal code (unmapped values become NaN).
    time_map = {"Morning": 0, "Afternoon": 1, "Evening": 2, "Night": 3}
    out['Time_Period'] = out['Publication_Time'].map(time_map)

    # Day-of-week name -> 0 (Monday) .. 6 (Sunday).
    day_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    out['Publication_Day'] = out['Publication_Day'].map(day_map)

    # Interaction features.
    out['Is_Weekend'] = (out['Publication_Day'] >= 5).astype(int)
    out['Genre_Time'] = out['Genre'] + '_' + out['Time_Period'].astype(str)

    # Text features derived from the episode title.
    out['Title_Length'] = out['Episode_Title'].str.len()
    # na=False: a missing title counts as "no keyword" instead of producing
    # NaN, which would make the integer cast below raise.
    # NOTE(review): the keywords are Chinese; confirm the titles in this
    # dataset can actually contain them, otherwise this column is constant.
    out['Has_Special'] = (
        out['Episode_Title'].str.contains('特别版|直播|专访', na=False).astype(int)
    )

    # Integer-encode the categorical columns.
    cat_features = ['Podcast_Name', 'Genre', 'Episode_Sentiment', 'Genre_Time']
    for col in cat_features:
        if col not in out.columns:
            continue
        values = out[col].astype(str)
        if category_maps is not None and col in category_maps:
            mapping = category_maps[col]
        else:
            # Sorted unique strings -> consecutive codes starting at 0.
            mapping = {cat: code
                       for code, cat in enumerate(sorted(values.unique()))}
            if category_maps is not None:
                category_maps[col] = mapping
        # Categories missing from a reused mapping fall back to -1.
        out[col] = values.map(mapping).fillna(-1).astype('int64')

    # Drop the raw columns that have been replaced by derived features.
    return out.drop(columns=['Publication_Time', 'Episode_Title'], errors='ignore')

# Data loading and feature engineering
DATA_DIR = Path("../data/playground-series-s5e4")
SUBMISSION_PATH = Path('../output/submission_final.csv')
LABEL = 'Listening_Time_minutes'

train = pd.read_csv(DATA_DIR / "train.csv", index_col='id')
test = pd.read_csv(DATA_DIR / 'test.csv', index_col='id')

# Encode train and test in ONE add_features call: the categorical codes are
# derived from the frame being processed, so encoding the two frames
# separately can assign different integers to the same category.
n_train = len(train)
train_labels = train[LABEL].to_numpy()
combined = add_features(pd.concat([train.drop(columns=[LABEL]), test]))
train = combined.iloc[:n_train].copy()
train[LABEL] = train_labels
test = combined.iloc[n_train:].copy()

# Create the output directory up front so a missing folder cannot discard
# the result of a multi-hour fit at the very last step.
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)

# Model training configuration
predictor = TabularPredictor(
    label=LABEL,
    problem_type='regression',
    eval_metric='root_mean_squared_error',
    path='ag_optimized'
).fit(
    train_data=train,
    presets='best_quality',
    time_limit=7200,  # 2 hours
    hyperparameters={
        'GBM': {'num_boost_round': 300},
        'CAT': {'iterations': 1500},
        'XGB': {'max_depth': 10}
    },
    verbosity=3
)

# Generate predictions and write the submission file
test_pred = predictor.predict(test)
# Build from plain arrays and write without the index: the prediction Series
# is indexed by 'id', so writing the index as well would emit 'id' twice.
submission = pd.DataFrame({
    'id': test.index.to_numpy(),
    LABEL: test_pred.to_numpy(),
})
submission.to_csv(SUBMISSION_PATH, index=False)

Verbosity: 3 (Detailed Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          12
GPU Count:          1
Memory Avail:       54.69 GB / 57.48 GB (95.2%)
Disk Space Avail:   3551.40 GB / 6519.49 GB (54.5%)
Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': True, 'num_bag_sets': 1, 'verbosity': 3}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': True,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
             'holdout_data': None,
             'holdout_frac': 0.1111111111111111,
             'memory_safe_fits': True,
             'n_folds': 2,
             'n_repeats': 1,


[36m(_ray_fit pid=51079)[0m [50]	valid_set's rmse: 13.3009
[36m(_ray_fit pid=51078)[0m [100]	valid_set's rmse: 13.1496[32m [repeated 11x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=51085)[0m [200]	valid_set's rmse: 13.1019[32m [repeated 16x across cluster][0m


[36m(_ray_fit pid=51084)[0m Saving /mnt/n/code/competition/kaggle/Predict_Podcast_Listening_Time/notebook/ag_optimized/ds_sub_fit/sub_fit_ho/models/LightGBM_BAG_L1/S1F8/model.pkl
[36m(_ray_fit pid=51084)[0m 	Fitting 300 rounds... Hyperparameters: {'learning_rate': 0.05}[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=51079)[0m Saving /mnt/n/code/competition/kaggle/Predict_Podcast_Listening_Time/notebook/ag_optimized/ds_sub_fit/sub_fit_ho/models/LightGBM_BAG_L1/S1F3/model.pkl
[36m(_dystack pid=50271)[0m 	-13.0779	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=50271)[0m 	21.35s	 = Training   runtime
[36m(_dystack pid=50271)[0m 	10.45s	 = Validation runtime
[36m(_dystack pid=50271)[0m 	7971.1	 = Inference  throughput (rows/s | 83334 batch size)
[36m(_dystack pid=50271)[0m Saving /mnt/n/code/competition/kaggle/Predict_Podcast_Listening_Time/notebook/ag_optimized/ds_sub_fit/sub_fit_ho/models/trainer.pkl
[36m(_dystack pid=50271)[0m Fitting model:

[36m(_ray_fit pid=51539)[0m 0:	learn: 26.1731009	test: 26.1177815	best: 26.1177815 (0)	total: 244ms	remaining: 6m 5s
[36m(_ray_fit pid=51081)[0m [300]	valid_set's rmse: 13.1391[32m [repeated 20x across cluster][0m
[36m(_ray_fit pid=51544)[0m 40:	learn: 13.6857033	test: 13.7775108	best: 13.7775108 (40)	total: 7.22s	remaining: 4m 17s[32m [repeated 16x across cluster][0m
[36m(_ray_fit pid=51542)[0m 80:	learn: 13.2301134	test: 13.2159761	best: 13.2159761 (80)	total: 14.4s	remaining: 4m 13s[32m [repeated 16x across cluster][0m
[36m(_ray_fit pid=51544)[0m 120:	learn: 13.1727460	test: 13.2537050	best: 13.2537050 (120)	total: 21.3s	remaining: 4m 2s[32m [repeated 16x across cluster][0m
[36m(_ray_fit pid=51544)[0m 160:	learn: 13.1529136	test: 13.2355149	best: 13.2355149 (160)	total: 27.9s	remaining: 3m 51s[32m [repeated 16x across cluster][0m
[36m(_ray_fit pid=51538)[0m 180:	learn: 13.1637128	test: 13.1100385	best: 13.1100385 (180)	total: 33.2s	remaining: 4m 1s[32m [repe

KeyboardInterrupt: 