In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
# Read the competition's train and test CSV files into DataFrames.
train_csv_path = '/kaggle/input/playground-series-s5e4/train.csv'
test_csv_path = '/kaggle/input/playground-series-s5e4/test.csv'
train, test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [3]:
# Label-valued columns; these are the columns that get one-hot encoded by `preprocess`.
categorical = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# Numeric columns. The last entry is the column later used as the training target.
numerical = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Listening_Time_minutes',
]

In [4]:
from sklearn.preprocessing import OneHotEncoder

def preprocess(df, categorical, numerical):
    """Impute missing values and one-hot encode the categorical columns.

    Missing values are filled from the next valid row (backward fill) and any
    values still missing at the end of a column are filled from the previous
    valid row (forward fill). Both fills happen *before* encoding so that a
    missing label never turns into its own indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    categorical : list of str
        Names of the columns to replace by one-hot indicator columns. Each
        must be present in ``df``.
    numerical : list of str
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by one
        float 0.0/1.0 column per observed category, named ``<column>_<value>``
        with categories in sorted order, grouped in the order given by
        ``categorical``.

    Notes
    -----
    The set of indicator columns depends on the values present in ``df``, so
    two frames encoded separately can end up with different columns.
    """
    # Non-inplace fills already return a new object, so the caller's frame is untouched.
    filled = df.bfill().ffill()

    categorical = list(categorical)
    if not categorical:
        return filled

    # A single get_dummies call replaces the per-column encode/concat loop and
    # avoids the scikit-learn `sparse` / `sparse_output` keyword rename.
    return pd.get_dummies(filled, columns=categorical, dtype=float)

In [5]:
# Replace the raw training frame with its imputed, one-hot encoded version.
# NOTE: this rebinds `train`, so the cell cannot be re-run without reloading the
# data first (the categorical columns no longer exist after the first run).
train = preprocess(df=train, categorical=categorical, numerical=numerical)



In [6]:
# Display the preprocessed training frame to inspect the one-hot encoded columns.
train

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Podcast_Name_Athlete's Arena,Podcast_Name_Brain Boost,Podcast_Name_Business Briefs,Podcast_Name_Business Insights,...,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive
0,0,119.80,74.81,75.95,0.0,31.41998,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,119.80,66.95,75.95,2.0,88.01241,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,73.90,69.97,8.97,0.0,44.92531,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,3,67.17,57.22,78.70,2.0,46.27824,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,110.51,80.07,58.68,3.0,75.61031,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,75.66,69.36,84.89,0.0,56.87058,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
749996,749996,75.75,35.21,84.89,2.0,45.46242,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
749997,749997,30.98,78.58,84.89,0.0,15.26000,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
749998,749998,108.98,45.39,93.27,0.0,100.72939,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [7]:
# Split the training frame: the target column becomes the label vector, and every
# column except the target and the row identifier becomes the feature matrix.
y_train = train['Listening_Time_minutes']
x_train = train.drop(['id', 'Listening_Time_minutes'], axis=1)

In [8]:
from xgboost import XGBRegressor

# The estimator class is imported by name so that binding the fitted model to
# `xgb` (the name later cells call `.predict` on) does not overwrite a module
# alias this cell itself depends on; with `import xgboost as xgb` followed by
# `xgb = xgb.XGBRegressor(...)` the cell raised AttributeError when re-run.
xgb = XGBRegressor(
    n_estimators=100,       # number of boosted trees to build
    learning_rate=0.1,      # shrinkage applied to each tree's contribution
    max_depth=6,            # maximum depth of each tree
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of feature columns sampled per tree
    random_state=42,        # fixed seed so results are repeatable
    n_jobs=-1,              # use all available CPU cores
    verbosity=2             # 0 = silent, 1 = warning, 2 = info, 3 = debug
)

# Fit on the training features/target; as the last expression, the fitted
# estimator is also displayed as the cell output.
xgb.fit(x_train, y_train)

In [9]:
# Replace the raw test frame with its imputed, one-hot encoded version.
# NOTE: the encoding is derived from the test data alone, so the resulting
# indicator columns are not guaranteed to match those of the training frame.
test = preprocess(df=test, categorical=categorical, numerical=numerical)



In [10]:
# Build the test feature matrix with exactly the columns, in exactly the order,
# that the model was trained on. The test frame is one-hot encoded independently
# of the training frame, so categories seen in only one of them would otherwise
# produce mismatched feature sets at prediction time. Indicator columns missing
# from the test frame are added as all-zero; columns unknown to the model
# (including a `preds` column left over from a previous run) are dropped.
x_test = (
    test
    .drop(columns=['id'])
    .reindex(columns=x_train.columns, fill_value=0.0)
)

In [11]:
# Score the test features with the fitted model and keep the predictions next
# to their rows in the test frame under the `preds` column.
preds = xgb.predict(x_test)
test = test.assign(preds=preds)

In [12]:
# Assemble the submission table: the row identifier plus the predictions,
# with the prediction column renamed to the target's column name.
submission = (
    test[['id', 'preds']]
    .rename(columns={'preds': 'Listening_Time_minutes'})
)

In [13]:
# Write the submission to the working directory, omitting the DataFrame index.
submission.to_csv(path_or_buf='submission.csv', index=False)