In [1]:
#todo Create new features that may capture underlying patterns (e.g., interaction terms, polynomial features).
#todo create separate file for data preparation

In [2]:
##
import pandas as pd
import numpy as np
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from TargetEncoder import TargetEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so pandas chained-assignment and
# library deprecation warnings raised by later cells are hidden as well —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
##
# Display configuration: floats rendered with thousands separators and two
# decimals, and no limit on the number of columns shown for wide frames.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', None)

In [5]:
##
# Load the three competition files from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Quick sanity check on the size of the training frame.
print(train.shape)

In [6]:
# Column roles used throughout the notebook.
drop_cols = ['id']                          # identifier, not a feature
target_col = ['Listening_Time_minutes']     # regression target (kept as a list)
cat_cols = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# Everything in train that is neither dropped, categorical nor the target is
# treated as numeric; the original column order of train is preserved.
_non_numeric = set(drop_cols) | set(cat_cols) | set(target_col)
num_cols = [name for name in train.columns if name not in _non_numeric]

In [7]:
# Show the (rows, columns) shape of the training frame as the cell output.
train.shape

In [8]:
# Feature matrix: every column except the identifier and the target.
X = train.drop(columns=drop_cols + target_col)
# Target vector as a Series (target_col is a one-element list).
y = train[target_col[0]]

In [9]:
# Binary indicator feature: 1 where Episode_Length_minutes is missing, else 0.
# NOTE(review): this cell adds the column to X only; any other frame later
# passed to a model fitted on X (e.g. the competition test set) needs the same
# column added before predict.
X['isnull_Episode_Length_minutes'] = X['Episode_Length_minutes'].isnull().astype(int)

# Impute Episode_Length_minutes  

In [10]:
from sklearn.base import BaseEstimator, RegressorMixin, clone
import numpy as np
import pandas as pd

class SplitByNullPipeline(BaseEstimator, RegressorMixin):
    """Regressor that routes each row to one of two pipelines by missingness.

    Rows where ``split_column`` is present are handled by a clone of
    ``pipeline_with``; rows where it is null are handled by a clone of
    ``pipeline_without``, which never sees ``split_column`` (the column is
    dropped before fitting and predicting that branch).

    Parameters
    ----------
    pipeline_with : estimator
        Unfitted estimator used for rows whose ``split_column`` is not null.
    pipeline_without : estimator
        Unfitted estimator used for rows whose ``split_column`` is null.
    split_column : str
        Name of the DataFrame column whose missingness selects the branch.

    Attributes
    ----------
    pipeline_with_, pipeline_without_ : fitted estimator or None
        Fitted clones of the two templates. A branch is ``None`` when the
        training data contained no rows for it.
    mask : pandas.Series
        Boolean null-mask of ``split_column`` computed on the training data.
    """

    def __init__(self, pipeline_with, pipeline_without, split_column):
        self.pipeline_with = pipeline_with
        self.pipeline_without = pipeline_without
        self.split_column = split_column

    @staticmethod
    def _fit_branch(template, X_part, y_part):
        """Fit a fresh clone of ``template``; return None if there are no rows."""
        if len(X_part) == 0:
            return None
        return clone(template).fit(X_part, y_part)

    @staticmethod
    def _predict_branch(fitted, X_part, description):
        """Predict with a fitted branch, failing clearly if it was never fitted."""
        if fitted is None:
            raise ValueError(
                f"Cannot predict rows with {description}: "
                "no such rows were present when fit was called"
            )
        return fitted.predict(X_part)

    def fit(self, X, y):
        """Fit both branches on their respective subsets of ``X`` and ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing ``split_column``.
        y : array-like of length ``len(X)``
            Targets, paired with the rows of ``X`` by position.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame or ``y`` has a different length.
        """
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame")

        # Keep pandas targets as they are; turn lists/tuples into an array so
        # boolean masking works for them too.
        targets = y if isinstance(y, (pd.Series, pd.DataFrame)) else np.asarray(y)
        if len(targets) != len(X):
            raise ValueError("X and y must have the same number of rows")

        # Public attribute kept under its original name for compatibility.
        self.mask = X[self.split_column].isnull()
        # A plain boolean array selects rows by position, so X and y do not
        # need to share an index.
        is_null = self.mask.to_numpy()

        self.pipeline_with_ = self._fit_branch(
            self.pipeline_with, X[~is_null], targets[~is_null]
        )
        self.pipeline_without_ = self._fit_branch(
            self.pipeline_without,
            X[is_null].drop(columns=self.split_column),
            targets[is_null],
        )
        return self

    def predict(self, X):
        """Predict each row with the branch matching its missingness.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing ``split_column``.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``, in the row order of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame, or if it contains rows for a branch
            that had no training rows.
        """
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame")

        is_null = X[self.split_column].isnull().to_numpy()
        y_pred = np.empty(X.shape[0])

        # Only call a branch when it has rows: scikit-learn estimators reject
        # zero-row input, which would otherwise break batches that are all
        # null or all non-null.
        if (~is_null).any():
            y_pred[~is_null] = self._predict_branch(
                self.pipeline_with_, X[~is_null], f"non-null {self.split_column}"
            )
        if is_null.any():
            y_pred[is_null] = self._predict_branch(
                self.pipeline_without_,
                X[is_null].drop(columns=self.split_column),
                f"null {self.split_column}",
            )

        return y_pred


In [12]:
# Hold out 20% of the rows as a local evaluation set; random_state fixes the
# shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outliers
 

In [13]:
# Rows considered outliers: very long episodes or an implausible ad count.
# Comparisons against NaN are False, so rows with a missing length are kept.
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X_train.query(outliers).index

# Remove the same rows from features and target of the training split only;
# the hold-out split is left untouched.
X_train = X_train.drop(index=index_drop)
y_train = y_train.drop(index=index_drop)

# Data Cleaning and Preprocessing
 

In [14]:
def target_encoder(df_train, y_train, df_test, cols, enc):
    """Replace the given columns with their encoded numeric values.

    For every column in ``cols`` the encoder is re-fitted on the training
    column together with ``y_train`` and then applied to the matching test
    column. The inputs are left untouched: encoded copies are returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features; each column in ``cols`` is used to fit ``enc``.
    y_train : array-like
        Training target passed to ``enc.fit_transform``.
    df_test : pandas.DataFrame
        Frame encoded with the encoder fitted on the training column.
    cols : iterable of str
        Names of the columns to encode; may be empty.
    enc : object
        Encoder exposing ``fit_transform(column, y)`` and ``transform(column)``
        whose results support ``.reshape(-1)`` (e.g. NumPy arrays).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_train, encoded_test)`` with the columns in ``cols`` numeric;
        values that cannot be parsed as numbers become NaN.
    """
    # Work on copies so the caller's frames (e.g. the raw test set) are not
    # silently modified.
    encoded_train = df_train.copy()
    encoded_test = df_test.copy()

    for col in cols:
        # Fit on the original training column, then reuse that fit for the
        # test column before the encoder is re-fitted for the next column.
        train_values = enc.fit_transform(df_train[col], y_train).reshape(-1)
        test_values = enc.transform(df_test[col]).reshape(-1)

        # Assigning a whole column replaces its dtype, so the result is
        # numeric rather than an object column holding floats.
        encoded_train[col] = pd.to_numeric(train_values, errors='coerce')
        encoded_test[col] = pd.to_numeric(test_values, errors='coerce')

    return encoded_train, encoded_test

In [15]:
def pipe(numer_type_cols, cat_type_cols):
    """Build an unfitted preprocessing ColumnTransformer.

    Numeric columns are median-imputed and standard-scaled; categorical
    columns are imputed with the most frequent value and one-hot encoded,
    with categories unseen during fit ignored at transform time.

    Parameters
    ----------
    numer_type_cols : list-like of str
        Columns routed through the numeric branch.
    cat_type_cols : list-like of str
        Columns routed through the categorical branch.

    Returns
    -------
    ColumnTransformer
        Transformer with a "num" and a "cat" branch.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy='median')),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer([
        ("num", Pipeline(numeric_steps), numer_type_cols),
        ("cat", Pipeline(categorical_steps), cat_type_cols),
    ])


In [16]:
# Encode the two high-cardinality text columns: the encoder is fitted on the
# training split (with y_train) and applied to the hold-out split.
# NOTE(review): the encoder is fitted on all of X_train/y_train before the
# cross-validation cell splits that same X_train into folds, so the CV score
# may be optimistic. TargetEncoder is a project-local class — confirm whether
# it performs out-of-fold encoding internally.
enc = TargetEncoder()
X_train, X_test = target_encoder(X_train, y_train, X_test, ['Podcast_Name', 'Episode_Title'], enc)

In [17]:
# Column names by dtype in the (already target-encoded) training split.
numer_type_cols = X_train.select_dtypes(include="number").columns
cat_type_cols = X_train.select_dtypes(include="object").columns

# Numeric columns for the branch that handles rows with a missing episode
# length: that branch never receives the column itself.
numer_type_cols_without = [
    name for name in numer_type_cols if name != 'Episode_Length_minutes'
]

In [18]:
# Preprocessor for rows where Episode_Length_minutes is present (all numeric columns).
preprocessor_with = pipe(numer_type_cols, cat_type_cols)
# Preprocessor for rows where it is missing (same columns minus Episode_Length_minutes).
preprocessor_without = pipe(numer_type_cols_without, cat_type_cols)

# Baseline Model:

In [19]:

# Model for rows that have an episode length: full preprocessing + XGBoost.
pipeline_with = Pipeline(steps=[
    ("preprocessor", preprocessor_with),
    ("model", XGBRegressor(random_state=42)),
])

# Model for rows without an episode length: same recipe, but its preprocessor
# does not reference the missing column.
pipeline_without = Pipeline(steps=[
    ("preprocessor", preprocessor_without),
    ("model", XGBRegressor(random_state=42)),
])

# Combined estimator that sends each row to one of the two pipelines depending
# on whether Episode_Length_minutes is null.
baseline_pipeline = SplitByNullPipeline(
    pipeline_with,
    pipeline_without,
    'Episode_Length_minutes',
)

In [20]:
# Fit both branches on the training split; fit returns the estimator, so the
# notebook displays it as the cell output.
# NOTE(review): the evaluation cell below refits on the same data, so this fit
# is redundant for the reported metrics.
baseline_pipeline.fit(X_train, y_train)

In [21]:
# 5-fold cross-validation on the training split. The scorer returns negated
# MSE per fold, so flip the sign and take the square root to get RMSE.
neg_mse_per_fold = cross_val_score(
    baseline_pipeline,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=100,
)
cv_rmse = np.sqrt(-neg_mse_per_fold)

# Refit on the whole training split and score the untouched hold-out split.
baseline_pipeline.fit(X_train, y_train)
y_pred = baseline_pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report mean ± standard deviation across folds, then the hold-out RMSE.
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
print(f"Test Set RMSE: {test_rmse:.4f}")

# Final model for submission

In [22]:
# Apply the same outlier rule to the full training data before the final fit.
# Comparisons against NaN are False, so rows with a missing length are kept.
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X.query(outliers).index

# Drop the flagged rows from both the features and the target.
X = X.drop(index=index_drop)
y = y.drop(index=index_drop)

In [23]:
enc = TargetEncoder()

# Give the competition test set the same feature schema as X before encoding:
# drop the identifier and add the missing-length indicator that X received
# earlier. Without the indicator, the fitted preprocessor cannot find one of
# its numeric columns at predict time. Working on a derived frame also keeps
# the raw `test` frame unmodified.
test_features = test.drop(columns=drop_cols)
test_features['isnull_Episode_Length_minutes'] = (
    test_features['Episode_Length_minutes'].isnull().astype(int)
)

# Fit the encoder on all remaining training rows and apply it to the
# competition test features. From here on X_test holds the competition test
# set, not the earlier hold-out split.
X, X_test = target_encoder(X, y, test_features, ['Podcast_Name', 'Episode_Title'], enc)

In [24]:
# Refit the split-by-null model on the full training data (X, y as prepared by
# the preceding cells) to produce the model used for the submission.
baseline_pipeline.fit(X, y)

In [25]:
# Predict listening time for the frame currently bound to X_test (reassigned by
# the encoding cell of this section) and store it in the submission frame.
# Values are assigned by position, so the rows of X_test are expected to be in
# the same order as the rows of the sample submission.
submission['Listening_Time_minutes'] = baseline_pipeline.predict(X_test)


In [None]:
# Write the submission file to the working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)