In [1]:
#todo KNN imputation if the missing rate is high.
#todo Create new features that may capture underlying patterns (e.g., interaction terms, polynomial features).
#todo create separate file for data preparation

In [2]:
##
import pandas as pd
import numpy as np
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from TargetEncoder import TargetEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [3]:
warnings.filterwarnings('ignore')

In [4]:
##
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = None

In [5]:
##
submission = pd.read_csv('data/sample_submission.csv')
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')
print(train.shape)

(750000, 12)


In [6]:
drop_cols = ['id']
target_col = ['Listening_Time_minutes']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
num_cols = [col for col in train.columns if col not in drop_cols + cat_cols + target_col]

In [7]:
train.shape

(750000, 12)

In [8]:
X = train.drop(drop_cols + target_col, axis=1)
y = train['Listening_Time_minutes']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outliers
 

In [10]:
outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
index_drop = X_train.query(outliers).index
X_train.drop(index_drop, inplace=True)
y_train.drop(index_drop, inplace=True)

# Data Cleaning and Preprocessing
 

In [11]:
def target_encoder(df_train, y_train, df_test, cols, enc):
    for col in cols:
        df_train.loc[:, col] = enc.fit_transform(df_train.loc[:, col], y_train).reshape(-1)
        df_test.loc[:, col] = enc.transform(df_test.loc[:, col]).reshape(-1)
    df_train[cols] = df_train[cols].apply(pd.to_numeric, errors='coerce')
    df_test[cols] = df_test[cols].apply(pd.to_numeric, errors='coerce')

    return df_train, df_test

In [12]:
def pipe(numer_type_cols, cat_type_cols):
    
    # Define transformers
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # Mean imputation
        ("scaler", StandardScaler())  # Standard scaling
    ])
    
    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])
    
    # Combine preprocessing pipelines
    preprocessor = ColumnTransformer([
        ("num", num_pipeline, numer_type_cols),
        ("cat", cat_pipeline, cat_type_cols)
    ])
    return preprocessor


# Baseline Model:

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

enc = TargetEncoder()
X_train, X_test = target_encoder(X_train, y_train, X_test, ['Podcast_Name', 'Episode_Title'], enc)

numer_type_cols = X_train.select_dtypes(include=["number"]).columns
cat_type_cols = X_train.select_dtypes(include=["object"]).columns

preprocessor = pipe(numer_type_cols, cat_type_cols)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [68]:
model = CatBoostRegressor(random_state=42)

In [69]:
not_transformed = preprocessor.transformers_[0][2]
transformed_columns = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(input_features=cat_type_cols)
all_cols = np.concatenate([not_transformed, transformed_columns])

In [70]:
X_train = pd.DataFrame(X_train, columns=all_cols)
X_test = pd.DataFrame(X_test, columns=all_cols)

In [64]:
summary = model.select_features(X_train, y_train, 
                      eval_set=(X_test, y_test),
                      features_for_select='0-29',
                      num_features_to_select=8,
                      steps=2,
                      train_final_model=False,
                      logging_level='Silent')

In [65]:
# Посмотрим на список отобранных фичей (не отранжирован по важности)
print(summary['selected_features_names'])
# И на лучшее значение лосса
print(f"Best loss: {summary['loss_graph']['loss_values'][-1]}")

['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre_Sports', 'Genre_Technology']
Best loss: 12.987376461193623


In [66]:
# В summary сохраняется полный отчет работы алгоритма
summary

{'selected_features': [0, 1, 2, 3, 4, 5, 13, 14],
 'eliminated_features_names': ['Genre_News',
  'Genre_Health',
  'Genre_Comedy',
  'Genre_Music',
  'Genre_Business',
  'Genre_Lifestyle',
  'Genre_True Crime',
  'Genre_Education'],
 'loss_graph': {'main_indices': [0, 5],
  'removed_features_count': [0, 1, 2, 3, 4, 5, 6, 7, 8],
  'loss_values': [12.984754239775102,
   12.983841122571281,
   12.983899103517807,
   12.984017882815612,
   12.984677280696673,
   12.985948673330189,
   12.986328436519159,
   12.986792619260147,
   12.987376461193623]},
 'eliminated_features': [12, 9, 7, 11, 6, 10, 15, 8],
 'selected_features_names': ['Podcast_Name',
  'Episode_Title',
  'Episode_Length_minutes',
  'Host_Popularity_percentage',
  'Guest_Popularity_percentage',
  'Number_of_Ads',
  'Genre_Sports',
  'Genre_Technology']}

In [None]:
# # Perform Cross-Validation (5-fold)
# cv_rmse = np.sqrt(-cross_val_score(baseline_pipeline, X_train, y_train,
#                                    scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=100))
# 
# # Train on full training set
# baseline_pipeline.fit(X_train, y_train)
# 
# # Make predictions on test set
# y_pred = baseline_pipeline.predict(X_test)
# 
# # Calculate RMSE on test set
# test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# 
# # Print Results
# print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
# print(f"Test Set RMSE: {test_rmse:.4f}")

# Final model for submission

In [None]:
# outliers = 'Episode_Length_minutes > 140 or Number_of_Ads > 3'
# index_drop = X.query(outliers).index
# X.drop(index_drop, inplace=True)
# y.drop(index_drop, inplace=True)

In [None]:
# enc = TargetEncoder()
# X, X_test = target_encoder(X, y, test, ['Podcast_Name', 'Episode_Title'], enc)

In [None]:
# baseline_pipeline.fit(X, y)

In [None]:
# # Make predictions on test set
# submission['Listening_Time_minutes'] = baseline_pipeline.predict(X_test)


In [None]:
# submission.to_csv('submission.csv', index=False)