In [1]:
#todo KNN imputation if the missing rate is high.
#todo Create new features that may capture underlying patterns (e.g., interaction terms, polynomial features).
#todo create separate file for data preparation

In [2]:
##
import pandas as pd
import numpy as np
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from TargetEncoder import TargetEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [3]:
warnings.filterwarnings('ignore')

In [4]:
##
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = None

In [5]:
##
submission = pd.read_csv('data/sample_submission.csv')
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')
print(train.shape)

(750000, 12)


In [6]:
drop_cols = ['id']
target_col = ['Listening_Time_minutes']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
num_cols = [col for col in train.columns if col not in drop_cols + cat_cols + target_col]

In [7]:
train.shape

(750000, 12)

In [75]:
X = train.drop(drop_cols + target_col, axis=1)
y = train['Listening_Time_minutes']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outliers
 

In [10]:
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X_train.query(outliers).index
X_train.drop(index_drop, inplace=True)
y_train.drop(index_drop, inplace=True)

# Baseline Model:

In [79]:
model = CatBoostRegressor(random_state=42, cat_features = cat_cols)

In [80]:
summary = model.select_features(X_train, y_train, 
                      eval_set=(X_test, y_test),
                      features_for_select='0-9',
                      num_features_to_select=5,
                      steps=2,
                      train_final_model=False,                            
                      logging_level='Silent')

In [81]:
# Посмотрим на список отобранных фичей (не отранжирован по важности)
print(summary['selected_features_names'])
# И на лучшее значение лосса
print(f"Best loss: {summary['loss_graph']['loss_values'][-1]}")

['Episode_Title', 'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
Best loss: 13.026719406495692


In [73]:
# В summary сохраняется полный отчет работы алгоритма
summary

{'selected_features': [0, 1, 2, 3, 4, 5, 20, 29],
 'eliminated_features_names': ['Genre_News',
  'Publication_Day_Sunday',
  'Genre_Health',
  'Genre_Comedy',
  'Publication_Day_Tuesday',
  'Genre_Music',
  'Episode_Sentiment_Negative',
  'Publication_Day_Friday',
  'Publication_Day_Saturday',
  'Genre_Business',
  'Publication_Time_Morning',
  'Publication_Time_Afternoon',
  'Publication_Day_Wednesday',
  'Genre_Lifestyle',
  'Genre_Sports',
  'Episode_Sentiment_Neutral',
  'Genre_True Crime',
  'Genre_Education',
  'Publication_Day_Monday',
  'Publication_Time_Night',
  'Publication_Time_Evening',
  'Genre_Technology'],
 'loss_graph': {'main_indices': [0, 15],
  'removed_features_count': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22],
  'loss_values': [12.984754239775102,
   12.984321046140481,
   12.983997079217131,
   12.98402684389806,
   12.984085474105473,
   12.984152002249521,

In [84]:
# Perform Cross-Validation (5-fold)
X_train = X_train[summary['selected_features_names']]
X_test = X_test[summary['selected_features_names']]
model = CatBoostRegressor(random_state=42, cat_features = ['Episode_Title'])

cv_rmse = np.sqrt(-cross_val_score(model, X_train, y_train,
                                   scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=100))

# Train on full training set
model.fit(X_train, y_train)

# Make predictions on test set
y_pred = model.predict(X_test)

# Calculate RMSE on test set
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Results
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
print(f"Test Set RMSE: {test_rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[CV] START .....................................................................
[CV] END .................................. score: (test=nan) total time=   0.0s
[CV] START .....................................................................
[CV] END .................................. score: (test=nan) total time=   0.1s
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.8min
[CV] START .....................................................................
[CV] END .................................. score: (test=nan) total time=   0.0s
[CV] START .....................................................................
Learning rate set to 0.108596
0:	learn: 25.0406067	total: 157ms	remaining: 2m 36s
1:	learn: 23.2233455	total: 247ms	remaining: 2m 3s
2:	learn: 21.6553406	total: 395ms	remaining: 2m 11s
3:	learn: 20.3107529	total: 468ms	remaining: 1m 56s
4:	learn: 19.1631181	total: 556ms	remaining: 1m 50s
5:	lea

# Final model for submission

In [None]:
# outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
# index_drop = X.query(outliers).index
# X.drop(index_drop, inplace=True)
# y.drop(index_drop, inplace=True)

In [None]:
# enc = TargetEncoder()
# X, X_test = target_encoder(X, y, test, ['Podcast_Name', 'Episode_Title'], enc)

In [None]:
# baseline_pipeline.fit(X, y)

In [None]:
# # Make predictions on test set
# submission['Listening_Time_minutes'] = baseline_pipeline.predict(X_test)


In [None]:
# submission.to_csv('submission.csv', index=False)