In [24]:
# Import necessary modules
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
# Load and prepare the data
train = pd.read_csv("../data/playground-series-s5e4/train.csv", index_col='id')
test = pd.read_csv('../data/playground-series-s5e4/test.csv', index_col='id')

In [26]:
# Feature engineering function
def add_features(df):

    # Convert binary features to numeric
    day_map = {
        'Monday': 0,
        'Tuesday': 1,
        'Wednesday': 2,
        'Thursday': 3,
        'Friday': 4,
        'Saturday': 5,
        'Sunday': 6,
    }
    df['Publication_Day'] = df['Publication_Day'].map(day_map)

    time_map = {
       "Morning": 0,
       "Afternoon": 1,
       "Night": 2
   }

    df['Publication_Time'] = df['Publication_Time'].map(time_map)

    return df

In [27]:
#Apply feature engineering to all datasets
train = add_features(train)
test = add_features(test)

In [28]:
# Combine train and train extra data sets
df = pd.concat([train], axis=0, ignore_index=True)
# 打印df前10行
print(df.head(10))

      Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0  Mystery Matters    Episode 98                     NaN  True Crime   
1    Joke Junction    Episode 26                  119.80      Comedy   
2   Study Sessions    Episode 16                   73.90   Education   
3   Digital Digest    Episode 45                   67.17  Technology   
4      Mind & Body    Episode 86                  110.51      Health   
5    Fitness First    Episode 19                   26.54      Health   
6   Criminal Minds    Episode 47                   69.83  True Crime   
7     News Roundup    Episode 44                   48.52        News   
8     Daily Digest    Episode 32                  105.87        News   
9    Music Matters    Episode 81                     NaN       Music   

   Host_Popularity_percentage  Publication_Day  Publication_Time  \
0                       74.81                3               2.0   
1                       66.95                5               0.0   
2  

In [29]:
# Define target column
target = 'Listening_Time_minutes'

In [30]:
# Split data into train and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

In [31]:
train_data

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
453635,True Crime Stories,Episode 81,94.30,True Crime,97.67,4,0.0,,2.0,Positive,77.27788
11651,Learning Lab,Episode 53,82.15,Education,94.78,5,2.0,,1.0,Positive,50.02839
431999,Tech Trends,Episode 21,13.72,Technology,68.60,5,0.0,65.77,3.0,Negative,10.07496
529211,Laugh Line,Episode 99,24.00,Comedy,42.14,5,0.0,41.29,0.0,Negative,17.82074
110925,Athlete's Arena,Episode 15,,Sports,34.10,4,2.0,,0.0,Neutral,94.80341
...,...,...,...,...,...,...,...,...,...,...,...
259178,Crime Chronicles,Episode 71,42.65,True Crime,80.53,4,0.0,96.31,3.0,Negative,16.77295
365838,Business Insights,Episode 66,94.50,Business,42.80,4,0.0,8.68,3.0,Positive,73.00649
131932,Melody Mix,Episode 18,61.54,Music,63.97,3,0.0,62.82,2.0,Negative,38.48631
671155,Business Briefs,Episode 27,87.86,Business,75.76,2,2.0,74.87,0.0,Positive,69.27837


In [32]:
# Initialize AutoGluon predictor with time constraints
predictor = TabularPredictor(
    label=target,
    problem_type='regression',
    eval_metric='root_mean_squared_error',
    path='ag_models_backpack'
).fit(
    train_data=train_data,
    tuning_data=val_data,
    # Use medium_quality preset instead of best_quality for faster training
    presets='medium_quality',
    # Set a strict 10-minute time limit (600 seconds)
    time_limit=600,
    # Skip hyperparameter tuning to save time
    hyperparameters='default',
    # Limit model types to faster ones
    excluded_model_types=['KNN', 'NN_TORCH', 'FASTAI'],
    verbosity=2
)

print("Done")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          12
Memory Avail:       40.79 GB / 57.48 GB (71.0%)
Disk Space Avail:   3550.88 GB / 6519.49 GB (54.5%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/mnt/n/code/competition/kaggle/Predict_Podcast_Listening_Time/notebook/ag_models_backpack"
Train Data Rows:    600000
Train Data Columns: 10
Tuning Data Rows:    150000
Tuning Data Columns: 10
Label Column:       Listening_Time_minutes
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    41960.29 MB
	Train Data (Original)  Memory Usage: 224.41 MB (0.5% of available memory)
	Inferring data type of each feature based on column valu

[1000]	valid_set's rmse: 12.992
[2000]	valid_set's rmse: 12.9697
[3000]	valid_set's rmse: 12.9633
[4000]	valid_set's rmse: 12.9629
[5000]	valid_set's rmse: 12.9695


	-12.9619	 = Validation score   (-root_mean_squared_error)
	53.69s	 = Training   runtime
	4.37s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 539.50s of the 539.50s of remaining time.


[1000]	valid_set's rmse: 12.943
[2000]	valid_set's rmse: 12.9253
[3000]	valid_set's rmse: 12.9152
[4000]	valid_set's rmse: 12.9147
[5000]	valid_set's rmse: 12.9182


	-12.9138	 = Validation score   (-root_mean_squared_error)
	41.03s	 = Training   runtime
	4.79s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 493.41s of the 493.41s of remaining time.
	-12.8146	 = Validation score   (-root_mean_squared_error)
	138.31s	 = Training   runtime
	0.75s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 349.93s of the 349.93s of remaining time.
	Ran out of time, early stopping on iteration 2692.
	-12.9827	 = Validation score   (-root_mean_squared_error)
	351.14s	 = Training   runtime
	0.42s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the -1.89s of remaining time.
	Ensemble Weights: {'RandomForestMSE': 0.583, 'LightGBM': 0.333, 'LightGBMXT': 0.083}
	-12.7393	 = Validation score   (-root_mean_squared_error)
	0.05s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 603.46s ... Best model: WeightedEnsemble_L2

Done


In [33]:
# Evaluate on validation data
performance = predictor.evaluate(val_data)

In [34]:
performance

{'root_mean_squared_error': -12.739313076530292,
 'mean_squared_error': -162.29009766185573,
 'mean_absolute_error': -9.271743871480904,
 'r2': 0.7794454082451199,
 'pearsonr': 0.8828715748100041,
 'median_absolute_error': -6.752446163330081}

In [35]:
# Generate predictions on test data
test_pred = predictor.predict(test)
val_pred = predictor.predict(val_data)
test_pred, val_pred

(id
 750000    55.659588
 750001    18.041803
 750002    50.093010
 750003    78.082108
 750004    47.846947
             ...    
 999995    11.943676
 999996    57.343903
 999997     7.280855
 999998    72.405991
 999999    57.891914
 Name: Listening_Time_minutes, Length: 250000, dtype: float32,
 404846    31.499479
 580313    33.931038
 552086    36.275009
 370876    46.668530
 239330    51.379475
             ...    
 235496    53.066738
 372040    34.793037
 695665     3.045597
 386092    82.558327
 549832    78.448334
 Name: Listening_Time_minutes, Length: 150000, dtype: float32)

In [36]:
# Create submission file
submission = pd.DataFrame({'id': test.index, 'Listening_Time_minutes': test_pred})
submission.set_index('id', inplace=True)
submission.to_csv('../output/submission.csv')
print("Submission file created")

Submission file created
