In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import csv
import matplotlib as plt 
import seaborn as sns


In [2]:
# Feature definitions
features = ["Episode_Length_minutes", "Number_of_Ads", "Host_Popularity_percentage",
            "Guest_Popularity_percentage", "Episode_Sentiment", "Publication_Day",
            "Publication_Time", "Genre", "Podcast_Name"]

features_to_encode = ["Episode_Sentiment", "Publication_Day", "Publication_Time", "Genre", "Podcast_Name"]

# Label encoders
encoders = {feature: LabelEncoder() for feature in features_to_encode}


In [3]:
'''
lr = 0.375
n_estimators = 1200
max_bin = 1024

common_params = {
    "objective": "reg:squarederror",
    "n_estimators": n_estimators,
    "max_depth": 6,
    "learning_rate": lr,
    "max_bin": max_bin,
    "verbosity": 0
}

models = []
for _ in range(10):
    model = xgb.XGBRegressor(**common_params)
    model.fit(x, y, eval_set=[(x, y)], verbose=True)
    models.append(model)

model1, model2, model3, model4, model5, model6, model7, model8, model9, model10 = models
'''


'\nlr = 0.375\nn_estimators = 1200\nmax_bin = 1024\n\ncommon_params = {\n    "objective": "reg:squarederror",\n    "n_estimators": n_estimators,\n    "max_depth": 6,\n    "learning_rate": lr,\n    "max_bin": max_bin,\n    "verbosity": 0\n}\n\nmodels = []\nfor _ in range(10):\n    model = xgb.XGBRegressor(**common_params)\n    model.fit(x, y, eval_set=[(x, y)], verbose=True)\n    models.append(model)\n\nmodel1, model2, model3, model4, model5, model6, model7, model8, model9, model10 = models\n'

In [4]:
def preprocess(df, is_train=True):
    df["Episode_Title"] = df["Episode_Title"].str[8:]

    # Impute missing values
    for col in ["Episode_Length_minutes", "Guest_Popularity_percentage", "Number_of_Ads"]:
        df[[col]] = SimpleImputer(strategy="median").fit_transform(df[[col]])

    # Weekend flag

    # Label encode categorical variables
    for feature in features_to_encode:
        if is_train:
            df[feature] = encoders[feature].fit_transform(df[feature])
        else:
            df[feature] = encoders[feature].transform(df[feature])

    # Feature engineering
    df["Ads_Per_Minute"] = df["Number_of_Ads"] / df["Episode_Length_minutes"]
    df["People_Popularity"] = df["Host_Popularity_percentage"] + df["Guest_Popularity_percentage"]
    df["Linear"] = df["Episode_Length_minutes"] * 0.728

    return df


In [5]:
engineered = ["Ads_Per_Minute", "People_Popularity", "Linear"]
def get_train_data(file):
    df = pd.read_csv(file)
    y = df[["Listening_Time_minutes"]].to_numpy().astype(np.float32)
    df = preprocess(df, is_train=True)
    x = df[features + engineered].to_numpy().astype(np.float32)
    return x, y

def get_test_data(file):
    df = pd.read_csv(file)
    df = preprocess(df, is_train=False)
    x = df[features + engineered].to_numpy().astype(np.float32)
    return x



In [6]:
x, y = get_train_data("train.csv")
X_test = get_test_data("test.csv")


In [7]:
common_params = {
    "objective": "regression",
    "n_estimators": 1000,
    "learning_rate": 0.03,
    "max_bin": 1024,
    "num_leaves": 1024,
    "max_depth": -1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "random_state": 42
}


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []

for train_index, val_index in kf.split(x):
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = lgb.LGBMRegressor(**common_params)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric="rmse", callbacks=[lgb.log_evaluation(100)])
    models.append(model)


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6221
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 12
[LightGBM] [Info] Start training from score 45.447808




[100]	valid_0's rmse: 13.0082	valid_0's l2: 169.212
[200]	valid_0's rmse: 12.8625	valid_0's l2: 165.443
[300]	valid_0's rmse: 12.822	valid_0's l2: 164.404
[400]	valid_0's rmse: 12.8002	valid_0's l2: 163.845
[500]	valid_0's rmse: 12.7824	valid_0's l2: 163.39
[600]	valid_0's rmse: 12.7666	valid_0's l2: 162.985
[700]	valid_0's rmse: 12.7565	valid_0's l2: 162.728
[800]	valid_0's rmse: 12.7465	valid_0's l2: 162.473
[900]	valid_0's rmse: 12.7379	valid_0's l2: 162.255
[1000]	valid_0's rmse: 12.7306	valid_0's l2: 162.068


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6221
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 12
[LightGBM] [Info] Start training from score 45.421359




[100]	valid_0's rmse: 13.0451	valid_0's l2: 170.174
[200]	valid_0's rmse: 12.9011	valid_0's l2: 166.439
[300]	valid_0's rmse: 12.8645	valid_0's l2: 165.496
[400]	valid_0's rmse: 12.842	valid_0's l2: 164.916
[500]	valid_0's rmse: 12.8208	valid_0's l2: 164.372


In [None]:
preds = sum(model.predict(X_test) for model in models) / len(models)

with open("submission_lgbm.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["id", "prediction"])
    for i, pred in enumerate(preds, start=750000):
        writer.writerow([i, pred])




In [None]:
features_to_plot = ["Episode_Length_minutes", "Host_Popularity_percentage", 
                    "Guest_Popularity_percentage", "Listening_Time_minutes", "Number_of_Ads"]
df = pd.read_csv("train.csv")

plt.figure(figsize=(15, 10))
for i, feature in enumerate(features_to_plot, 1):
    plt.subplot(2, 3, i)
    sns.histplot(df[feature], kde=True)
    plt.title(f"Distribution of {feature}")
plt.tight_layout()
plt.show()

In [None]:
correlation_matrix = df[features_to_plot].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.show()