In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import lightgbm as lgb

In [2]:
df = pd.read_csv("playground-series-s5e4/train.csv")

In [3]:
df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [4]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [5]:
df.isnull().sum()

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [6]:
df['Episode_Length_minutes'] = df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].mean())
df['Episode_Length_minutes'].isnull().sum()
df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].mean())


In [7]:
X = df.drop(columns=['Episode_Length_minutes'])
y = df['Episode_Length_minutes']


In [8]:
from sklearn.preprocessing import OneHotEncoder
oh = OneHotEncoder()
df['Podcast_Name'] = oh.fit_transform(df[['Podcast_Name']]).toarray()
df['Episode_Title'] = oh.fit_transform(df[['Episode_Title']]).toarray()
df['Genre'] = oh.fit_transform(df[['Genre']]).toarray()
df['Publication_Day'] = oh.fit_transform(df[['Publication_Day']]).toarray()
df['Publication_Time'] = oh.fit_transform(df[['Publication_Time']]).toarray()



In [9]:
from sklearn.model_selection import train_test_split
X = df[['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage','Host_Popularity_percentage']]
y = df['Listening_Time_minutes']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
lgb_regressor = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=7,
    num_leaves=31,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42,
    n_jobs=-1,
    verbose=-1  # Suppress training output
)

# Fit the model
lgb_regressor.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]  # Early stopping with patience=50
)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[457]	valid_0's rmse: 26.8451	valid_0's l2: 720.659


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("\n=== Making Predictions ===")
y_pred_train = lgb_regressor.predict(X_train)
y_pred_test = lgb_regressor.predict(X_test)

# Calculate metrics
print("\n=== Model Performance ===")
train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_pred_train)

test_mae = mean_absolute_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred_test)
print(test_r2)


=== Making Predictions ===

=== Model Performance ===
0.02061366046427171


In [13]:
import mlflow.lightgbm
from mlflow.models import infer_signature


In [15]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment("Podcast Listening Time Prediction")

2025/09/10 20:56:22 INFO mlflow.tracking.fluent: Experiment with name 'Podcast Listening Time Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/201757001456484594', creation_time=1757552182996, experiment_id='201757001456484594', last_update_time=1757552182996, lifecycle_stage='active', name='Podcast Listening Time Prediction', tags={}>

In [16]:
with mlflow.start_run():
    mlflow.lightgbm.log_model(lgb_regressor, "model")
    signature = infer_signature(X_train, lgb_regressor.predict(X_train))
    mlflow.lightgbm.log_model(lgb_regressor, "model", signature=signature)
    mlflow.log_params(lgb_regressor.get_params())
    mlflow.log_metric("train_mae", train_mae)
    mlflow.log_metric("train_mse", train_mse)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)



🏃 View run trusting-cow-279 at: http://127.0.0.1:8080/#/experiments/201757001456484594/runs/cad5ba907960403f9e1ffde989808651
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/201757001456484594
