In [47]:
# TODO: Use KNN imputation if the missing rate is high.
# TODO: Create new features that may capture underlying patterns (e.g., interaction terms, polynomial features).
# TODO: Move data preparation into a separate file.

In [48]:
##
import pandas as pd
import numpy as np
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from TargetEncoder import TargetEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [49]:
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings raised by the
# in-place DataFrame edits further down — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [50]:
##
# Display settings: floats rendered with thousands separators and two decimals,
# and no cap on the number of columns shown for wide frames.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', None)

In [51]:
##
# Read the three competition CSVs (paths are relative to the notebook's working directory).
csv_paths = ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
train, test, submission = map(pd.read_csv, csv_paths)
print(train.shape)

(750000, 12)


In [52]:
# Column roles: identifier to discard, prediction target, and categorical features.
drop_cols = ['id']
target_col = ['Listening_Time_minutes']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# Every remaining column of `train` is treated as numeric.
non_numeric_cols = set(drop_cols + cat_cols + target_col)
num_cols = [col for col in train.columns if col not in non_numeric_cols]

In [53]:
# Display the training frame's (rows, columns) shape as the cell output.
train.shape

(750000, 12)

In [54]:
# Feature matrix: everything except the identifier and the target column.
X = train.drop(columns=drop_cols + target_col)
# Target vector: listening time in minutes.
y = train['Listening_Time_minutes']

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outliers
 

In [56]:
# Rows matching this filter are treated as outliers and removed from the training
# split only; the hold-out split (X_test / y_test) is left untouched.
# NOTE(review): the thresholds 140 and 3 are presumably taken from EDA not shown here.
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X_train.query(outliers).index

# Reassign rather than drop in place: X_train / y_train were derived from other
# frames, and re-running the cell stays safe because nothing is mutated.
X_train = X_train.drop(index=index_drop)
y_train = y_train.drop(index=index_drop)

# Data Cleaning and Preprocessing
 

In [57]:
def target_encoder(df_train, y_train, df_test, cols, enc):
    """Target-encode ``cols`` in a train frame and a test frame.

    For each column, ``enc`` is fitted on the training column against ``y_train``
    and the fitted encoder is immediately applied to the same column of
    ``df_test``. The encoded columns are then coerced to numeric dtype; values
    that cannot be converted become NaN.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features containing every column in ``cols``.
    y_train : array-like
        Target values aligned row-wise with ``df_train``.
    df_test : pandas.DataFrame
        Frame to encode with the mapping learned from the training data.
    cols : sequence of str
        Names of the columns to encode.
    enc : object
        Encoder exposing ``fit_transform(column, y)`` and ``transform(column)``;
        both results must support ``.reshape(-1)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``. These are new frames; the inputs are
        not modified.
    """
    cols = list(cols)

    # Work on copies so the caller's frames (which may be slices of other frames)
    # are never mutated behind their back.
    train_encoded = df_train.copy()
    test_encoded = df_test.copy()

    for col in cols:
        # The same encoder instance is refitted per column, so the test column
        # must be transformed before moving on to the next one.
        train_encoded[col] = enc.fit_transform(df_train[col], y_train).reshape(-1)
        test_encoded[col] = enc.transform(df_test[col]).reshape(-1)

    if cols:
        # Guarantee numeric dtype even if the encoder returned object arrays.
        train_encoded[cols] = train_encoded[cols].apply(pd.to_numeric, errors='coerce')
        test_encoded[cols] = test_encoded[cols].apply(pd.to_numeric, errors='coerce')

    return train_encoded, test_encoded

In [58]:
def pipe(numer_type_cols, cat_type_cols):
    """Build the unfitted preprocessing transformer for the model pipeline.

    Numeric columns are imputed with the column median and then standardized.
    Categorical columns are imputed with the most frequent value and then
    one-hot encoded, ignoring categories unseen during fitting.

    Parameters
    ----------
    numer_type_cols : sequence of str
        Names of the numeric columns.
    cat_type_cols : sequence of str
        Names of the categorical columns.

    Returns
    -------
    ColumnTransformer
        Transformer with a "num" branch and a "cat" branch.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),  # median imputation
        ("scaler", StandardScaler()),  # zero mean, unit variance
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numer_type_cols),
            ("cat", Pipeline(steps=categorical_steps), cat_type_cols),
        ]
    )


In [59]:
# Replace Podcast_Name and Episode_Title with target-encoded numeric values,
# fitting the encoder on the training split and applying it to the hold-out split.
enc = TargetEncoder()
target_encoded_cols = ['Podcast_Name', 'Episode_Title']
X_train, X_test = target_encoder(
    df_train=X_train,
    y_train=y_train,
    df_test=X_test,
    cols=target_encoded_cols,
    enc=enc,
)

In [60]:
# Derive the column groups from the dtypes of the encoded training split:
# numeric columns go to the "num" branch, object columns to the "cat" branch.
numer_type_cols = X_train.select_dtypes(include="number").columns
cat_type_cols = X_train.select_dtypes(include="object").columns

In [61]:
# Build the (unfitted) preprocessing transformer from the column groups detected on X_train.
preprocessor = pipe(numer_type_cols, cat_type_cols)

# Baseline Model:

In [62]:
# Full baseline pipeline: shared preprocessing followed by a CatBoost regressor
# (seeded so repeated runs give the same model).
baseline_steps = [
    ("preprocessor", preprocessor),
    ("model", CatBoostRegressor(random_state=42)),
]
baseline_pipeline = Pipeline(steps=baseline_steps)

In [63]:
# 5-fold cross-validation on the training split. The scorer returns negated MSE,
# so flip the sign and take the square root to get a per-fold RMSE.
# NOTE(review): the target encoding was fitted on all of X_train before this
# point, so each validation fold has already influenced its own encoded
# features — the CV estimate may be optimistic. Confirm whether that is acceptable.
neg_mse_per_fold = cross_val_score(
    baseline_pipeline,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=100,
)
cv_rmse = np.sqrt(-neg_mse_per_fold)

# Refit on the whole training split, then score the 20% hold-out split.
baseline_pipeline.fit(X_train, y_train)
y_pred = baseline_pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report both estimates side by side.
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
print(f"Test Set RMSE: {test_rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[CV] START .....................................................................
Learning rate set to 0.108596
0:	learn: 25.0201513	total: 159ms	remaining: 2m 38s
1:	learn: 23.1890075	total: 211ms	remaining: 1m 45s
2:	learn: 21.6023213	total: 301ms	remaining: 1m 40s
3:	learn: 20.2433067	total: 372ms	remaining: 1m 32s
4:	learn: 19.0812137	total: 433ms	remaining: 1m 26s
5:	learn: 18.0961276	total: 515ms	remaining: 1m 25s
6:	learn: 17.2706390	total: 557ms	remaining: 1m 19s
7:	learn: 16.5793899	total: 616ms	remaining: 1m 16s
8:	learn: 15.9943997	total: 691ms	remaining: 1m 16s
9:	learn: 15.5073793	total: 752ms	remaining: 1m 14s
10:	learn: 15.1053396	total: 796ms	remaining: 1m 11s
11:	learn: 14.7715271	total: 837ms	remaining: 1m 8s
12:	learn: 14.4947178	total: 870ms	remaining: 1m 6s
13:	learn: 14.2613208	total: 911ms	remaining: 1m 4s
14:	learn: 14.0759436	total: 945ms	remaining: 1m 2s
15:	learn: 13.9275167	total: 972

# Final model for submission

In [64]:
# Apply the same outlier filter to the full training data before the final fit.
# NOTE(review): the thresholds 140 and 3 are presumably taken from EDA not shown here.
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X.query(outliers).index

# Reassign rather than drop in place: `y` was taken directly from a column of
# `train`, so an in-place drop would mutate an object tied to the raw frame.
X = X.drop(index=index_drop)
y = y.drop(index=index_drop)

In [65]:
# Refit a fresh target encoder on the full training data and apply it to the
# competition test set. From here on `X_test` refers to the encoded competition
# test frame, no longer to the hold-out split used for evaluation above.
enc = TargetEncoder()
final_encoded_cols = ['Podcast_Name', 'Episode_Title']
X, X_test = target_encoder(
    df_train=X,
    y_train=y,
    df_test=test,
    cols=final_encoded_cols,
    enc=enc,
)

In [66]:
# Refit the whole pipeline (preprocessing + CatBoost) on the full, outlier-filtered,
# target-encoded training data.
baseline_pipeline.fit(X, y)

Learning rate set to 0.11653
0:	learn: 24.8562214	total: 32ms	remaining: 32s
1:	learn: 22.9128621	total: 49.5ms	remaining: 24.7s
2:	learn: 21.2539748	total: 66.5ms	remaining: 22.1s
3:	learn: 19.8583993	total: 83.8ms	remaining: 20.9s
4:	learn: 18.6865553	total: 102ms	remaining: 20.4s
5:	learn: 17.6883907	total: 121ms	remaining: 20s
6:	learn: 16.8688830	total: 152ms	remaining: 21.6s
7:	learn: 16.1985093	total: 172ms	remaining: 21.4s
8:	learn: 15.6402402	total: 194ms	remaining: 21.3s
9:	learn: 15.1832547	total: 217ms	remaining: 21.5s
10:	learn: 14.8117741	total: 236ms	remaining: 21.2s
11:	learn: 14.5055558	total: 253ms	remaining: 20.8s
12:	learn: 14.2595795	total: 269ms	remaining: 20.4s
13:	learn: 14.0595672	total: 286ms	remaining: 20.1s
14:	learn: 13.8983550	total: 306ms	remaining: 20.1s
15:	learn: 13.7717043	total: 323ms	remaining: 19.9s
16:	learn: 13.6692070	total: 340ms	remaining: 19.7s
17:	learn: 13.5876555	total: 358ms	remaining: 19.5s
18:	learn: 13.5191113	total: 376ms	remaining: 1

In [67]:
# Predict listening time for the competition test set with the refitted pipeline
# and store the result in the submission frame's target column.
final_predictions = baseline_pipeline.predict(X_test)
submission['Listening_Time_minutes'] = final_predictions


In [68]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)