In [85]:
CUDA = False
if CUDA:
    !pip install -qq scikit-learn==1.6.1

In [86]:
##
if CUDA:
    %load_ext cudf.pandas
    import cudf
import pandas as pd
import numpy as np
import warnings

from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from tqdm import tqdm
from itertools import combinations

In [87]:
if CUDA:
    test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv', index_col='id')
    train = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv', index_col='id')
    print(train.shape)
else:
    test = pd.read_csv('data/test.csv', index_col='id')
    train = pd.read_csv('data/train.csv', index_col='id')
    print(train.shape)

(750000, 11)


In [88]:
warnings.filterwarnings('ignore')

In [89]:
pd.options.display.max_columns = None

# FE

In [90]:
y = train['Listening_Time_minutes']

In [91]:
TARGET_COL = ['Listening_Time_minutes']
CAT_COLS = ['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

In [92]:
%%time

def feature_eng(df):
    podc_dict = {'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 'Mind & Body': 4,
                 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 'Daily Digest': 8, 'Music Matters': 9,
                 'Sports Central': 10, 'Melody Mix': 11, 'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14,
                 'Tech Talks': 15, 'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19,
                 'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 'Sound Waves': 24,
                 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 'Style Guide': 28, 'World Watch': 29,
                 'Humor Hub': 30, 'Money Matters': 31, 'Healthy Living': 32, 'Home & Living': 33,
                 'Educational Nuggets': 34, 'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37,
                 'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 'Current Affairs': 41,
                 'Finance Focus': 42, 'Laugh Line': 43, 'True Crime Stories': 44, 'Business Insights': 45,
                 'Fashion Forward': 46, 'Tune Time': 47}
    genr_dict = {'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6,
                 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
    week_dict = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    time_dict = {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}
    sent_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

    df['Episode_Num'] = df['Episode_Title'].str[8:]

    df['Genre'] = df['Genre'].replace(genr_dict)
    df['Podcast_Name'] = df['Podcast_Name'].replace(podc_dict)
    df['Publication_Day'] = df['Publication_Day'].replace(week_dict)
    df['Publication_Time'] = df['Publication_Time'].replace(time_dict)
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(sent_dict)

    df['Genre'] = df['Genre']
    df['Podcast_Name'] = df['Podcast_Name']
    df['Publication_Day'] = df['Publication_Day']
    df['Publication_Time'] = df['Publication_Time']
    df['Episode_Sentiment'] = df['Episode_Sentiment']

    df = df.drop(columns=['Episode_Title'])

    df['Episode_Length_minutes'] = df['Episode_Length_minutes'].astype('float32')
    df['Host_Popularity_percentage'] = df['Host_Popularity_percentage'].astype('float32')
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].astype('float32')
    df['Number_of_Ads'] = df['Number_of_Ads'].astype('float32')

    df['LinearFeat'] = 0.728 * df['Episode_Length_minutes']
    df['Is_High_Host_Popularity'] = (df['Host_Popularity_percentage'] > 70).astype(int)
    df['Is_High_Guest_Popularity'] = (df['Guest_Popularity_percentage'] > 70).astype(int)
    df['Host_Guest_Popularity_Gap'] = df['Host_Popularity_percentage'] / df['Guest_Popularity_percentage']
    df['Host_Guest_Popularity_Gap'] = df['Host_Guest_Popularity_Gap'].replace([np.inf, -np.inf], np.nan)
    df['Ad_Density'] = df['Number_of_Ads'] / df['Episode_Length_minutes']
    df['Ad_Density'] = df['Ad_Density'].replace([np.inf, -np.inf], np.nan)
    df['Is_Long_Medium_Small_Episode'] = df['Episode_Length_minutes'].apply(
        lambda x: 2 if x > 60 else 0 if x < 20 else 1)

    return df


train = feature_eng(train)
test = feature_eng(test)

train['Listening_Time_minutes'] = train['Listening_Time_minutes'].astype('float32')
CAT_COLS.append('Episode_Num')

CPU times: user 2.33 s, sys: 104 ms, total: 2.44 s
Wall time: 2.44 s


### Combination of Categorical Columns

In [93]:
%%time

encode_columns = ['Episode_Length_minutes', 
                  'Episode_Num', 
                  'Host_Popularity_percentage', 
                  'Number_of_Ads',
                  'Episode_Sentiment',
                  'Publication_Day', 
                  'Publication_Time', 
                  'Guest_Popularity_percentage',
                  'Podcast_Name', 
                  'Genre']

pair_size = [2]
COMBO = []
if CUDA:
    train = cudf.from_pandas(train)
    test = cudf.from_pandas(test)

    for r in pair_size:
        for cols in tqdm(list(combinations(encode_columns, r))):
            new_col_name = '_'.join(cols)

            train[new_col_name] = train[cols[0]].astype(str)
            for col in cols[1:]:
                train[new_col_name] = train[new_col_name] + '_' + train[col].astype(str)

            test[new_col_name] = test[cols[0]].astype(str)
            for col in cols[1:]:
                test[new_col_name] = test[new_col_name] + '_' + test[col].astype(str)

            COMBO.append(new_col_name)

    train = train.to_pandas()
    test = test.to_pandas()
else:
    for r in pair_size:
        for cols in tqdm(list(combinations(encode_columns, r))):
            new_col_name = '_'.join(cols)
            train[new_col_name] = train[list(cols)].astype(str).agg('_'.join, axis=1)
            test[new_col_name] = test[list(cols)].astype(str).agg('_'.join, axis=1)
            COMBO.append(new_col_name)

100%|██████████| 45/45 [01:57<00:00,  2.61s/it]

CPU times: user 1min 54s, sys: 1.71 s, total: 1min 55s
Wall time: 1min 57s





## Create columns lists

In [94]:
NUM_COLS = [col for col in train.columns if col not in CAT_COLS + TARGET_COL + COMBO]
FEATURES = CAT_COLS + NUM_COLS + COMBO

In [95]:
# STATISTICS TO AGGREGATE FOR OUR FEATURE GROUPS
STATS = ['mean']

## KFOLD 

In [96]:
%%time

ParamsXGB = {'max_depth': 10, 'learning_rate': 0.00462847749422193, 'min_child_weight': 4,
             'subsample': 0.8244361720956633, 'colsample_bytree': 0.5586626138810886,
             'gamma': 1.1614500954011453, 'reg_alpha': 0.3548920754067436, 'reg_lambda': 3.9465129148897287,
             "n_estimators": 10000, 'enable_categorical': True,
             }

FOLDS = 7
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof = np.zeros((len(train)))
pred = np.zeros((len(test)))

# OUTER K FOLD
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(f"### OUTER Fold {i + 1} ###")

    X_train = train.loc[train_index, FEATURES + TARGET_COL].reset_index(drop=True).copy()
    y_train = train.loc[train_index, 'Listening_Time_minutes']

    X_valid = train.loc[test_index, FEATURES].reset_index(drop=True).copy()
    y_valid = train.loc[test_index, 'Listening_Time_minutes']

    X_test = test[FEATURES].reset_index(drop=True).copy()

    # INNER K FOLD (TO PREVENT LEAKAGE WHEN USING Listening_Time_minutes)
    kf2 = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    for j, (train_index2, test_index2) in enumerate(kf2.split(X_train)):
        print(f" ## INNER Fold {j + 1} (outer fold {i + 1}) ##")

        X_train2 = X_train.loc[train_index2, FEATURES + TARGET_COL].copy()
        X_valid2 = X_train.loc[test_index2, FEATURES].copy()

        ## FEATURE SET 2 (uses Listening_Time_minutes) ###
        for col in COMBO:
            tmp = X_train2.groupby(col).Listening_Time_minutes.agg(STATS)
            tmp.columns = [f"TE2_{col}_{s}" for s in STATS]
            X_valid2 = X_valid2.merge(tmp, on=col, how="left")
            for c in tmp.columns:
                X_train.loc[test_index2, c] = X_valid2[c].values
    ## FEATURE SET 2 (uses Listening_Time_minutes) ###
    for col in COMBO:
        tmp = X_train.groupby(col).Listening_Time_minutes.agg(STATS)
        tmp.columns = [f"TE2_{col}_{s}" for s in STATS]
        X_valid = X_valid.merge(tmp, on=col, how="left")
        X_test = X_test.merge(tmp, on=col, how="left")

    # CONVERT TO CAT_COLS SO XGBOOST RECOGNIZES THEM
    X_train[CAT_COLS] = X_train[CAT_COLS].astype("category")
    X_valid[CAT_COLS] = X_valid[CAT_COLS].astype("category")
    X_test[CAT_COLS] = X_test[CAT_COLS].astype("category")

    # DROP Listening_Time_minutes THAT WAS USED FOR TARGET ENCODING
    X_train = X_train.drop(TARGET_COL + COMBO, axis=1)
    X_valid = X_valid.drop(COMBO, axis=1)
    X_test = X_test.drop(COMBO, axis=1)

    # BUILD MODEL
    if CUDA:
        model = XGBRegressor(
            **ParamsXGB,
            tree_method='gpu_hist',
            random_state=42,
            early_stopping_rounds=100
        )
    else:
        model = XGBRegressor(
            **ParamsXGB,
            tree_method='hist',
            random_state=42,
            early_stopping_rounds=100
        )

        # TRAIN MODEL
    COLS = X_train.columns
    model.fit(
        X_train[COLS], y_train,
        eval_set=[(X_valid[COLS], y_valid)],
        verbose=100
    )

    # PREDICT OOF AND TEST
    oof[test_index] = model.predict(X_valid[COLS])
    pred += model.predict(X_test[COLS])

    fold_rmse = mean_squared_error(y_valid, oof[test_index]) ** 0.5
    print(f"✅ Fold {i + 1} RMSE: {fold_rmse:.5f}")

overall_rmse = mean_squared_error(y, oof) ** 0.5
print(f"\n🎯 Overall CV RMSE: {overall_rmse:.5f}")
pred /= FOLDS

### OUTER Fold 1 ###
 ## INNER Fold 1 (outer fold 1) ##
 ## INNER Fold 2 (outer fold 1) ##
 ## INNER Fold 3 (outer fold 1) ##
 ## INNER Fold 4 (outer fold 1) ##
 ## INNER Fold 5 (outer fold 1) ##
 ## INNER Fold 6 (outer fold 1) ##
 ## INNER Fold 7 (outer fold 1) ##
[0]	validation_0-rmse:26.98424
[100]	validation_0-rmse:19.65185
[200]	validation_0-rmse:15.75517
[300]	validation_0-rmse:13.86078
[400]	validation_0-rmse:12.99601
[500]	validation_0-rmse:12.61073
[600]	validation_0-rmse:12.43676
[700]	validation_0-rmse:12.35224
[800]	validation_0-rmse:12.30809
[900]	validation_0-rmse:12.28449
[1000]	validation_0-rmse:12.26867
[1100]	validation_0-rmse:12.25699
[1200]	validation_0-rmse:12.24816
[1300]	validation_0-rmse:12.24207
[1400]	validation_0-rmse:12.23713
[1500]	validation_0-rmse:12.23290
[1600]	validation_0-rmse:12.22934
[1700]	validation_0-rmse:12.22624
[1800]	validation_0-rmse:12.22324
[1900]	validation_0-rmse:12.22033
[2000]	validation_0-rmse:12.21737
[2100]	validation_0-rmse:12.2152

# Submission

In [99]:
df_subm = pd.read_csv('data/sample_submission.csv', index_col='id')

In [100]:
%%time

df_subm['Listening_Time_minutes'] = pred
df_subm.to_csv('submission_XGB.csv')
df_subm.head()

CPU times: user 241 ms, sys: 22.8 ms, total: 264 ms
Wall time: 299 ms


Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,54.035843
750001,23.069012
750002,45.636138
750003,78.1005
750004,49.719663
