In [58]:
# TODO: Use KNN imputation if the missing-value rate is high.
# TODO: Create new features that may capture underlying patterns (e.g., interaction terms, polynomial features).
# TODO: Move data preparation into a separate file.

In [59]:
##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from TargetEncoder import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [60]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/scikit-learn deprecation and
# copy-related warnings are hidden as well — consider narrowing it by category.
warnings.filterwarnings('ignore')

In [61]:
##
# Show floats with thousands separators and two decimals, and never hide
# columns when a wide frame is displayed.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', None)

In [62]:
##
# Read the three CSV files from the local data/ directory and report the
# size of the training frame as a quick sanity check.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')
print(train.shape)

(750000, 12)


In [63]:
# Column roles used throughout the notebook.
drop_cols = ['id']
target_col = ['Listening_Time_minutes']
cat_cols = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]
# Whatever is left after removing identifier, categorical and target columns
# is treated as numeric; the original column order of `train` is kept.
num_cols = [
    name for name in train.columns
    if name not in {*drop_cols, *cat_cols, *target_col}
]

In [64]:
# Display the (rows, columns) shape of the training frame.
train.shape

(750000, 12)

In [65]:
# Target series, and the feature frame made of every column except the
# identifier and the target.
y = train['Listening_Time_minutes']
X = train.drop(columns=drop_cols + target_col)

In [66]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state keeps
# the split reproducible. Note that X_test / y_test here are a hold-out taken
# from `train`, not the separately loaded `test` frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outliers
 

In [67]:
# Remove rows flagged by the outlier rule from the training split only; the
# hold-out split is left as-is. Rows where a compared value is missing do not
# satisfy the comparison and are therefore kept.
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X_train.query(outliers).index

# Rebind to the filtered result instead of dropping in place: the frames come
# straight out of train_test_split, and in-place mutation of them is a
# hidden-state hazard (and can trigger pandas copy warnings). The same index
# labels are removed from features and target so both stay aligned.
X_train = X_train.drop(index=index_drop)
y_train = y_train.drop(index=index_drop)

# Data Cleaning and Preprocessing
 

In [68]:
def target_encoder(df_train, df_test, cols, enc, y=None):
    """Target-encode ``cols`` in a train/test pair of frames.

    For each column the encoder is fitted on the training column together
    with the target, and the fitted encoder is then applied to the matching
    test column. Encoded values are stored as numeric columns; anything that
    cannot be parsed as a number becomes NaN.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing ``cols``. They are not modified; encoded copies
        are returned.
    cols : iterable of str
        Names of the columns to encode.
    enc : object
        Encoder exposing ``fit_transform(series, y)`` and
        ``transform(series)``; the results are flattened to one dimension.
        The same instance is refitted for every column.
    y : array-like, optional
        Target values aligned with ``df_train``. When omitted, the
        notebook-level ``y_train`` is used, which keeps existing
        four-argument calls working.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``df_train`` and ``df_test``.
    """
    if y is None:
        # Backward compatibility: the original implementation always read the
        # global y_train. Prefer passing the target explicitly.
        y = y_train

    # Work on copies so the caller's frames (and any frame they were sliced
    # from) are never mutated as a side effect.
    df_train = df_train.copy()
    df_test = df_test.copy()

    for col in cols:
        train_values = np.asarray(enc.fit_transform(df_train[col], y)).reshape(-1)
        # Transform the test column right away, before the encoder is
        # refitted for the next column.
        test_values = np.asarray(enc.transform(df_test[col])).reshape(-1)

        # Assigning the whole column replaces its dtype, so the encoded
        # values end up numeric rather than staying in an object column.
        df_train[col] = pd.to_numeric(train_values, errors='coerce')
        df_test[col] = pd.to_numeric(test_values, errors='coerce')

    return df_train, df_test


# Encoder instance (class imported from the `TargetEncoder` module), built with
# its default settings and handed to target_encoder().
enc = TargetEncoder()


In [69]:
# Replace the two listed text columns with their target-encoded values in both
# splits. NOTE(review): X_train / X_test are rebound to the encoded frames, so
# re-running this cell would encode already-encoded columns — re-run from the
# train/test split cell instead.
X_train, X_test = target_encoder(X_train, X_test, ['Podcast_Name', 'Episode_Title'], enc)

In [70]:
# Partition the feature columns by dtype so each group gets its own treatment.
numer_type_cols = X_train.select_dtypes(include="number").columns
cat_type_cols = X_train.select_dtypes(include="object").columns

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen while fitting are ignored, not rejected.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numer_type_cols),
    ("cat", cat_pipeline, cat_type_cols),
])



In [71]:
# X_train = preprocessor.fit_transform(X_train)
# X_test = preprocessor.transform(X_test)

# Baseline Model:

In [78]:
# Baseline model: the shared preprocessor followed by a random forest regressor.
# n_jobs=-1 asks for all available workers; verbose=100 makes the forest log
# every tree as it is built.
baseline_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1, verbose=100)),
])

In [79]:
# 5-fold cross-validation on the training split. The scorer returns negated
# MSE, so the sign is flipped before taking the square root to get RMSE.
cv_neg_mse = cross_val_score(
    baseline_pipeline, X_train, y_train,
    scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=100,
)
cv_rmse = np.sqrt(-cv_neg_mse)

# Refit on the whole training split, then predict the hold-out split.
y_pred = baseline_pipeline.fit(X_train, y_train).predict(X_test)

# RMSE on the hold-out split.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report both estimates.
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
print(f"Test Set RMSE: {test_rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.9s

building tree 10 of 100[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.7s

building tree 11 of 100
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    4.7s
building tree 12 of 100
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.8s
building tree 13 of 100[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.2s

building tree 14 of 100[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    5.6s

building tree 15 of 100
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    5.8s
building tree 16 of 100[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.8s

building tree 17 of 100[Parallel(n_jobs=-1)]: Do