In [26]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import xgboost as xgb

In [None]:
def get_data_loader(file, features, features_to_encode):
    df = pd.read_csv(file)
    df["Episode_Title"] = df["Episode_Title"].str[8:]
    y = df[["Listening_Time_minutes"]]
    df = df[features]
    imputer = SimpleImputer(strategy="median")
    df[["Episode_Length_minutes"]] = imputer.fit_transform(df[["Episode_Length_minutes"]])
    df[["Guest_Popularity_percentage"]] = imputer.fit_transform(df[["Guest_Popularity_percentage"]])
    df[["Number_of_Ads"]] = imputer.fit_transform(df[["Number_of_Ads"]])
    df["Is_Weekend"] = (df["Publication_Day"] == "Saturday") | (df["Publication_Day"] == "Sunday")
    def one_hot(df, feature):
        encoded = pd.get_dummies(df[[feature]])
        result = pd.concat([df, encoded], axis=1)
        result = result.drop([feature], axis=1)
        return(result) 
    for to_encode in features_to_encode:
        df = one_hot(df, to_encode)
    df["Ads_Per_Minute"] = df['Number_of_Ads'] / df["Episode_Length_minutes"]
    df["People_Popularity"] = df["Host_Popularity_percentage"] + df["Guest_Popularity_percentage"]
    df["Linear"] = df["Episode_Length_minutes"] * 0.728

    x = df
    x = x.to_numpy().astype(np.float32)
    y = y.to_numpy().astype(np.float32)
    return x, y

features=["Episode_Length_minutes", "Number_of_Ads", "Host_Popularity_percentage", "Guest_Popularity_percentage",
          "Episode_Sentiment", "Publication_Day", "Publication_Time", "Genre", "Podcast_Name"]
features_to_encode = ["Episode_Sentiment", "Publication_Day", "Publication_Time",  "Genre", "Podcast_Name"]
x, y = get_data_loader("train.csv", 
    features=features,
    features_to_encode=features_to_encode )

In [28]:
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=3200,
    max_depth=6,
    learning_rate=0.375,
    max_bin = 100,
    verbosity=0,
)


model.fit(
    x, y,
    eval_set = [(x, y)],
    verbose = True, 
) 

[0]	validation_0-rmse:19.88256
[1]	validation_0-rmse:16.16365
[2]	validation_0-rmse:14.43927
[3]	validation_0-rmse:13.69824
[4]	validation_0-rmse:13.38928
[5]	validation_0-rmse:13.25870
[6]	validation_0-rmse:13.20235
[7]	validation_0-rmse:13.17400
[8]	validation_0-rmse:13.15802
[9]	validation_0-rmse:13.14789
[10]	validation_0-rmse:13.14092
[11]	validation_0-rmse:13.13532
[12]	validation_0-rmse:13.12421
[13]	validation_0-rmse:13.11706
[14]	validation_0-rmse:13.11439
[15]	validation_0-rmse:13.10887
[16]	validation_0-rmse:13.10551
[17]	validation_0-rmse:13.10156
[18]	validation_0-rmse:13.09753
[19]	validation_0-rmse:13.09185
[20]	validation_0-rmse:13.08881
[21]	validation_0-rmse:13.08340
[22]	validation_0-rmse:13.07766
[23]	validation_0-rmse:13.07358
[24]	validation_0-rmse:13.06900
[25]	validation_0-rmse:13.06547
[26]	validation_0-rmse:13.06202
[27]	validation_0-rmse:13.05866
[28]	validation_0-rmse:13.05200
[29]	validation_0-rmse:13.04990
[30]	validation_0-rmse:13.04528
[31]	validation_0-

In [None]:
def get_test_loader(file, features, features_to_encode):
    df = pd.read_csv(file)
    def one_hot(df, feature):
        encoded = pd.get_dummies(df[[feature]])
        result = pd.concat([df, encoded], axis=1)
        result = result.drop([feature], axis=1)
        return(result) 
    df["Episode_Title"] = df["Episode_Title"].str[8:]
    df = df[features]
    imputer = SimpleImputer(strategy="median")
    df[["Episode_Length_minutes"]] = imputer.fit_transform(df[["Episode_Length_minutes"]])
    df[["Guest_Popularity_percentage"]] = imputer.fit_transform(df[["Guest_Popularity_percentage"]])
    df[["Number_of_Ads"]] = imputer.fit_transform(df[["Number_of_Ads"]])
    df["Is_Weekend"] = (df["Publication_Day"] == "Saturday") | (df["Publication_Day"] == "Sunday")
    for to_encode in features_to_encode:
        df = one_hot(df, to_encode)
    df["Ads_Per_Minute"] = df['Number_of_Ads'] / df["Episode_Length_minutes"]
    df["People_Popularity"] = df["Host_Popularity_percentage"] + df["Guest_Popularity_percentage"]
    df["Linear"] = df["Episode_Length_minutes"] * 0.728
    x = df
    x_tensor = x.to_numpy().astype(np.float32)
    return x_tensor



features=["Episode_Length_minutes", "Number_of_Ads", "Host_Popularity_percentage", "Guest_Popularity_percentage",
          "Episode_Sentiment", "Publication_Day", "Publication_Time", "Genre", "Podcast_Name" ]
features_to_encode = ["Episode_Sentiment", "Publication_Day", "Publication_Time",  "Genre", "Podcast_Name"]
test_dataloader = get_test_loader("test.csv", 
    features=features,
    features_to_encode=features_to_encode )

X_test = get_test_loader("test.csv", features, features_to_encode)

# Predict
preds = model.predict(X_test)

# Write to submission file
import csv

with open("submission_xgb.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["id", "prediction"])
    for i, pred in enumerate(preds, start=750000):
        writer.writerow([i, pred])
