In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from pandas.api.types import CategoricalDtype

from sklearn.impute import SimpleImputer
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pathlib import Path
from scipy.stats import skew 
import optuna


In [None]:
# The Episode_Number feature is created here.
def clean(df):
    """Replace the free-text ``Episode_Title`` column with a numeric ``Episode_Number``.

    The first run of digits found in each title is parsed as a float; titles
    without digits (or missing titles) yield NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Episode_Title`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Episode_Title`` and with ``Episode_Number``
        appended as the last column.
    """
    # expand=False returns a Series, so a column is assigned from a Series
    # instead of a one-column DataFrame.
    episode_number = (
        df['Episode_Title'].str.extract(r'(\d+)', expand=False).astype(float)
    )
    # drop() returns a new frame, so the caller's frame is left untouched.
    return df.drop('Episode_Title', axis=1).assign(Episode_Number=episode_number)

In [None]:
def encode(df):
    """Cast the categorical features to pandas categorical dtypes.

    Nominal (unordered) features become plain ``category`` columns with an
    extra "None" category available for missing values. Ordinal features are
    cast to an ordered ``CategoricalDtype`` whose first level is "None";
    values outside the listed levels become missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the nominal and ordinal columns listed below.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the categorical columns re-typed.
    """
    # The nominative (unordered) categorical features
    features_nom = ['Podcast_Name', 'Genre', 'Publication_Day']

    # Ordinal features and their levels, lowest first
    ordered_levels = {
        'Episode_Sentiment': ['Negative', 'Neutral', 'Positive'],
        'Publication_Time': ['Morning', 'Afternoon', 'Evening', 'Night'],
    }

    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()

    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        # Add a None category for missing values
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")

    # Ordinal categories, with a leading "None" level for missing values
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(
            CategoricalDtype(["None"] + levels, ordered=True)
        )

    return df

In [None]:
def impute(df):
    """Fill missing values with constants: 0 for numbers, "None" for categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient chaining.
    """
    for name in df.select_dtypes("number").columns:
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category").columns:
        column = df[name]
        # fillna on a categorical raises unless the fill value is already a
        # category, so register "None" first when it is missing.
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column.fillna("None")
    return df

def impute_upgraded(df):
    """Fill numeric gaps with the column median and categorical gaps with the mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. A categorical column with no observed value
        at all has no mode and is left unchanged.
    """
    for name in df.select_dtypes("number").columns:
        df[name] = df[name].fillna(df[name].median())
    for name in df.select_dtypes("category").columns:
        # mode() can return several values (ties): take the first one.
        # It is empty when every entry is missing, so guard before indexing.
        modes = df[name].mode()
        if not modes.empty:
            df[name] = df[name].fillna(modes.iloc[0])
    return df

def impute_fillna_mean(df):
    """Fill numeric gaps with the column mean and categorical gaps with the mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Categorical columns that contain no observed
        value are left unchanged because they have no mode.
    """
    # numerical features
    for name in df.select_dtypes("number").columns:
        df[name] = df[name].fillna(df[name].mean())

    # categorical features
    for name in df.select_dtypes("category").columns:
        most_common = df[name].mode()
        # An all-missing column yields an empty mode; indexing it would raise.
        if most_common.empty:
            continue
        df[name] = df[name].fillna(most_common.iloc[0])

    return df


def impute_fillna_median(df):
    """Fill numeric gaps with the column median and categorical gaps with the mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Categorical columns that contain no observed
        value are left unchanged because they have no mode.
    """
    # numerical features
    for name in df.select_dtypes("number").columns:
        df[name] = df[name].fillna(df[name].median())

    # categorical features
    for name in df.select_dtypes("category").columns:
        candidates = df[name].mode()
        # Only fill when a mode exists (the column has at least one value).
        if len(candidates) > 0:
            df[name] = df[name].fillna(candidates.iloc[0])

    return df


def impute_simple_mean(df):
    """Impute with scikit-learn's ``SimpleImputer`` and integer-encode categoricals.

    int64/float64 columns are filled with the column mean; object/category
    columns are filled with their most frequent value and then replaced by
    integer category codes. ``df`` is modified in place and returned.

    NOTE(review): the codes come from a fresh ``astype('category')`` on the
    imputed values, so any level order defined on an original categorical
    dtype is presumably not carried over -- confirm before relying on it.
    """
    # numerical features
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_cols] = SimpleImputer(strategy='mean').fit_transform(df[numeric_cols])

    # categorical features
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    df[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(
        df[categorical_cols]
    )

    # replace each categorical column by its integer codes
    for column in categorical_cols:
        df[column] = df[column].astype('category').cat.codes

    return df

def impute_simple_median(df):
    """Median/most-frequent imputation via ``SimpleImputer``, then integer codes.

    Same flow as the mean-based variant, except that int64/float64 columns are
    filled with the column median. Object/category columns are filled with
    their most frequent value and converted to integer category codes.
    ``df`` is modified in place and returned.
    """
    # (dtype selection, imputation strategy) applied in this order
    numeric_kinds = ['int64', 'float64']
    categorical_kinds = ['object', 'category']

    # numerical features
    num_features = df.select_dtypes(include=numeric_kinds).columns
    median_filler = SimpleImputer(strategy='median')
    df[num_features] = median_filler.fit_transform(df[num_features])

    # categorical features
    cat_features = df.select_dtypes(include=categorical_kinds).columns
    mode_filler = SimpleImputer(strategy='most_frequent')
    df[cat_features] = mode_filler.fit_transform(df[cat_features])

    # integer-encode every categorical column
    for feature in cat_features:
        as_category = df[feature].astype('category')
        df[feature] = as_category.cat.codes
    return df


In [None]:
# Directory holding the CSV files (local layout); the Kaggle path is kept below for reference.
data_dir = 'input/'
# data_dir = Path("/kaggle/input/playground-series-s5e4/")

df_train = pd.read_csv(data_dir + 'train.csv', index_col="id")
df_test = pd.read_csv(data_dir + 'test.csv', index_col="id")
# NOTE: org.csv is read without index_col, unlike train/test.
df_org = pd.read_csv(data_dir + 'org.csv')

# Combine train with the original dataset
original_clean = df_org.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

# Concatenate train and original, then assign a fresh sequential index
df_train = pd.concat([df_train, original_clean], ignore_index=True)
df_train.index.name = 'id'

# Reset the test index and renumber it after the train rows so the indices do not clash
df_test = df_test.reset_index(drop=True)
df_test.index = range(len(df_train), len(df_train) + len(df_test))
df_test.index.name = 'id'

# Stack everything so the same preprocessing is applied to train and test
df = pd.concat([df_train, df_test])

# Preprocessing
df = clean(df)
df = encode(df)
df = impute_upgraded(df)

# Split back into train and test. The second slice reads len(df_train) after
# the rebinding on the previous line; that length is unchanged because the
# first slice keeps exactly len(df_train) rows.
df_train = df.iloc[:len(df_train)].copy()
df_test = df.iloc[len(df_train):].copy()


In [None]:
# Display the preprocessed test frame for a quick visual check.
df_test

In [None]:

def load_data_for_baseLine(data_dir=None):
    """Load train/test CSVs and run the baseline preprocessing pipeline.

    Train and test are stacked so that ``clean``, ``encode`` and ``impute``
    are applied identically to both, then split back by row position.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Directory containing ``train.csv`` and ``test.csv``. When omitted, the
        Kaggle input directory is used if it exists, otherwise the local
        ``input/`` directory.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Preprocessed ``df_train`` and ``df_test`` (independent copies).

    Notes
    -----
    Any column present in only one of the two files is NaN for the other
    file's rows after concatenation, and numeric NaNs are then filled with 0
    by ``impute``. Presumably this applies to the target column of the test
    rows; callers drop that column from the test features.
    """
    if data_dir is None:
        kaggle_dir = Path("/kaggle/input/playground-series-s5e4/")
        data_dir = kaggle_dir if kaggle_dir.exists() else Path('input/')
    # Normalise to Path and join with "/": "Path + str" raises TypeError.
    data_dir = Path(data_dir)

    df_train = pd.read_csv(data_dir / 'train.csv', index_col="id")
    df_test = pd.read_csv(data_dir / 'test.csv', index_col="id")

    # Remember the original train length so the frames can be split again
    train_len = len(df_train)

    # Stack train and test
    df = pd.concat([df_train, df_test])

    # Preprocessing (alternatives tried: impute_fillna_median, impute_upgraded)
    df = clean(df)
    df = encode(df)
    df = impute(df)

    # Disabled experiments, kept for reference:
    #   - is_weekend flag derived from Publication_Day (Saturday/Sunday)
    #   - log1p versions of Number_of_Ads, Guest_Popularity_percentage,
    #     Host_Popularity_percentage and Episode_Length_minutes, replacing
    #     the raw columns
    #   - outlier filtering on Episode_Length_minutes for the train rows

    # Split back by position; copy so later column assignments on the
    # results do not operate on slices of the combined frame.
    df_train = df.iloc[:train_len, :].copy()
    df_test = df.iloc[train_len:, :].copy()

    return df_train, df_test

In [None]:
# Reload train/test through the baseline pipeline; this rebinds df_train and df_test.
df_train, df_test = load_data_for_baseLine()

In [None]:
# Separate the target from the training features; work on copies so df_train/df_test stay intact.
X = df_train.copy()
y = X.pop("Listening_Time_minutes")
X_test = df_test.copy()
X_test = X_test.drop(columns=["Listening_Time_minutes"], errors='ignore')  # drop it if present

In [None]:
# Swap every categorical column for its integer category codes,
# first in the training features and then in the test features.
for frame in (X, X_test):
    for colname in frame.select_dtypes("category"):
        frame[colname] = frame[colname].cat.codes

In [None]:
# Baseline scores:
# XGB : 13.207 RMSE
# Random Forest : 12.68481
# LGB : 13.207 RMSE

# RMSE : 12.83734
xgb_params = {
    # The original literal listed 'random_state' twice (0, then 42); the last
    # entry silently won, so 42 is the value that was actually in effect.
    'random_state': 42,
    'n_estimators': 565,
    'max_depth': 14,
    'learning_rate': 0.04222221,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    # 'tree_method': 'gpu_hist',
    'n_jobs': -1,
}

# RMSE : 12.798993843624153
xgb_params_2 = {
    'max_depth': 12,
    'learning_rate': 0.05858702616823876,
    'subsample': 0.9356075676850377,
    'colsample_bytree': 0.7895819265828284,
    'gamma': 1.7903575391246762,
}

# RMSE : 12.65171
random_forest_params = {
    'random_state': 0,
    'n_estimators': 500,
    'max_depth': 25,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'n_jobs': -1,
}



# XGBoost model used as a stacking base learner
xgb_model = XGBRegressor(
    random_state=0,
    n_estimators=565,
    max_depth=14,
    learning_rate=0.04222221,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
)

# LightGBM model used as a stacking base learner
lgbm_model = LGBMRegressor(
    random_state=0,
    # n_estimators is the scikit-learn-API name for the number of boosting
    # rounds; the original passed the LightGBM alias `n_iter` instead.
    n_estimators=1000,
    max_depth=-1,
    num_leaves=1024,
    colsample_bytree=0.7,
    learning_rate=0.03,
    objective='l2',
    verbosity=-1,
    max_bin=1024,
)

In [None]:
def score_dataset(X, y, model=XGBRegressor(random_state=0)):
    """Return the 5-fold cross-validated RMSE of ``model`` on ``(X, y)``.

    The per-fold negative MSE values from ``cross_val_score`` are averaged
    first; the square root of that mean MSE is returned (the metric is RMSE).
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        cv=5,
        scoring="neg_mean_squared_error",
    )
    # flip the sign of the averaged negative MSE, then take the root
    mean_mse = -1 * neg_mse_per_fold.mean()
    return np.sqrt(mean_mse)

In [None]:

# Rebuild the training matrix from df_train. This starts again from the
# categorical dtypes, so the categorical columns must be converted to integer
# codes here as well before the frame is handed to the model.
X = df_train.copy()
y = X.pop("Listening_Time_minutes")
for colname in X.select_dtypes("category"):
    X[colname] = X[colname].cat.codes

baseline_score = score_dataset(X, y)
print(f"Baseline score: {baseline_score:.5f} RMSE")


In [None]:
#Baseline With KFold

from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def score_dataset_Kfold(X, y, X_test):
    """Score a default ``XGBRegressor`` with shuffled 5-fold CV and predict the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Categorical columns are converted to integer codes
        on an internal copy.
    y : pandas.Series
        Training target, positionally aligned with ``X``.
    X_test : pandas.DataFrame
        Test features; categorical columns are converted the same way.

    Returns
    -------
    numpy.ndarray
        Test predictions averaged over the fold models. Per-fold and summary
        RMSE values are printed.
    """
    X = X.copy()
    X_test = X_test.copy()

    # Encode categoricals in BOTH frames. The original only encoded X, so a
    # still-categorical X_test would fail at predict time. Frames that are
    # already numeric are left untouched.
    for frame in (X, X_test):
        for colname in frame.select_dtypes("category"):
            frame[colname] = frame[colname].cat.codes

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor()
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

        val_pred = model.predict(X_val)
        score = rmse(y_val, val_pred)
        scores.append(score)

        # Each fold model contributes 1/n_splits of the final test prediction.
        test_preds += model.predict(X_test) / n_splits
        print(f"Fold {fold + 1} RMSE: {score:.4f}")

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    return test_preds

In [None]:
# Run the K-fold baseline; `pred` holds the fold-averaged test predictions.
pred =score_dataset_Kfold(X, y, X_test)

In [None]:
import optuna
import xgboost as xgb

# Build the DMatrix once; every trial reuses it.
dtrain = xgb.DMatrix(X, label=y)

def objective(trial):
    """Optuna objective: 3-fold CV RMSE of XGBoost for one sampled configuration."""
    # The number of trees is governed by num_boost_round plus early stopping
    # in xgb.cv below; 'n_estimators' is a scikit-learn-wrapper parameter that
    # the native API does not use, so it is not part of this dict.
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }

    # XGBoost CV with early stopping; the round limit is deliberately large
    # so that early stopping decides when to halt.
    cv_result = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=10000,
        nfold=3,
        early_stopping_rounds=50,
        metrics="rmse",
        seed=0,
        verbose_eval=False
    )

    # Lowest fold-averaged test RMSE across boosting rounds (the best round,
    # not the best single fold).
    best_rmse = cv_result['test-rmse-mean'].min()
    return best_rmse

# Create the Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Results
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)


### Random Forest

- baseline score :  12.68481 RMSE

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Optuna objective for the random forest search
def objective(trial):
    """Return the mean 3-fold CV RMSE of a random forest with sampled hyperparameters."""
    # Hyperparameters are sampled in a fixed order: number of trees, tree
    # depth, then the minimum samples needed to split a node / form a leaf.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }
    forest = RandomForestRegressor(random_state=42, **sampled)

    # The scorer reports negated RMSE, so flip the sign of the fold average.
    fold_scores = cross_val_score(forest, X, y, cv=3, scoring='neg_root_mean_squared_error')
    return -np.mean(fold_scores)

# Minimise RMSE over 50 trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Report the best hyperparameters and the best RMSE found
print("Random FOrest Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)


In [None]:
from sklearn.ensemble import RandomForestRegressor


# Cross-validated RMSE of a default random forest (only the seed is fixed).
baseline_score = score_dataset(X, y, RandomForestRegressor(random_state=0))
print(f"RandomForest Baseline score: {baseline_score:.5f} RMSE")

In [None]:

# Cross-validated RMSE of the random forest configured by random_forest_params.
score = score_dataset(X, y, RandomForestRegressor(**random_forest_params))
print(f"RandomForest params score: {score:.5f} RMSE")

In [None]:
def score_stacking_Kfold(X, y, X_test, base_models, meta_model):
    """Evaluate a stacked ensemble with shuffled 5-fold CV and predict the test set.

    Within each fold every base model is fitted on the fold's training rows;
    its predictions form the meta-features on which ``meta_model`` is fitted
    and evaluated.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target, positionally aligned with ``X``.
    X_test : pandas.DataFrame
        Test features.
    base_models : list
        Estimators exposing ``fit``/``predict``; refitted in every fold.
    meta_model : estimator
        Estimator exposing ``fit``/``predict``; refitted in every fold.

    Returns
    -------
    numpy.ndarray
        Test predictions averaged over the folds. Per-fold and summary RMSE
        values are printed.

    Notes
    -----
    The meta-model is trained on the base models' predictions for the same
    rows those base models were fitted on, so its training inputs are
    in-sample predictions rather than out-of-fold ones.
    """
    X = X.copy()
    X_test = X_test.copy()

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # 1. Build the meta-features for train, validation and test
        X_train_meta = np.zeros((len(X_train), len(base_models)))
        X_val_meta = np.zeros((len(X_val), len(base_models)))
        X_test_meta = np.zeros((len(X_test), len(base_models)))

        for i, model in enumerate(base_models):
            model.fit(X_train, y_train)
            X_train_meta[:, i] = model.predict(X_train)
            X_val_meta[:, i] = model.predict(X_val)
            # Keep test meta-features on the same scale as the train ones.
            # The original divided them by n_splits here, which fed the
            # meta-model inputs shrunk to 1/n_splits of their real value.
            X_test_meta[:, i] = model.predict(X_test)

        # 2. Train the meta-model
        meta_model.fit(X_train_meta, y_train)

        # 3. Predict and score on the validation rows
        val_pred = meta_model.predict(X_val_meta)
        score = rmse(y_val, val_pred)
        scores.append(score)

        # 4. Accumulate the test prediction; averaging across folds happens here
        test_preds += meta_model.predict(X_test_meta) / n_splits

        print(f"Fold {fold + 1} RMSE: {score:.4f}")

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    return test_preds


In [None]:
def stacking_regression_with_rmse(base_models, meta_model, X_train, y_train, X_test, n_splits=5):
    """Stack ``base_models`` under ``meta_model`` using out-of-fold predictions.

    Each base model is fitted on every training fold; its predictions on the
    held-out rows fill one column of the meta training matrix, and its test
    predictions are averaged over the folds. ``meta_model`` is then fitted on
    the out-of-fold matrix and used to predict the test set.

    Parameters
    ----------
    base_models : list
        Estimators exposing ``fit``/``predict``; refitted in every fold.
    meta_model : estimator
        Estimator exposing ``fit``/``predict``; fitted on all out-of-fold
        predictions before the function returns.
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, positionally aligned with ``X_train``.
    X_test : pandas.DataFrame
        Test features.
    n_splits : int, default 5
        Number of CV folds.

    Returns
    -------
    numpy.ndarray
        Meta-model predictions for ``X_test``. Per-fold and summary RMSE
        values are printed.
    """
    import copy  # local import: only needed for the per-fold meta-model copies

    # Prepare arrays
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    n_models = len(base_models)

    base_predictions_train = np.zeros((n_train, n_models))
    base_predictions_test = np.zeros((n_test, n_models))

    # Materialise the splits once so the base models and the meta-model
    # evaluation use exactly the same folds.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X_train))

    for i, model in enumerate(base_models):
        test_fold_predictions = np.zeros((n_test, n_splits))

        for j, (train_idx, valid_idx) in enumerate(folds):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
            y_tr = y_train.iloc[train_idx]

            model.fit(X_tr, y_tr)
            base_predictions_train[valid_idx, i] = model.predict(X_val)
            test_fold_predictions[:, j] = model.predict(X_test)

        base_predictions_test[:, i] = test_fold_predictions.mean(axis=1)

    # Evaluate RMSE. Each fold is scored by a copy of the meta-model fitted on
    # the OTHER folds only. The original scored a meta-model that had already
    # been fitted on the validation rows, so the printed value was in-sample.
    scores = []
    for fold, (train_idx, valid_idx) in enumerate(folds):
        fold_meta = copy.deepcopy(meta_model)
        fold_meta.fit(base_predictions_train[train_idx], y_train.iloc[train_idx])
        val_pred = fold_meta.predict(base_predictions_train[valid_idx])
        score = rmse(y_train.iloc[valid_idx], val_pred)
        scores.append(score)
        print(f"Fold {fold+1} RMSE: {score:.4f}")

    # Train the final meta-model on all out-of-fold predictions
    meta_model.fit(base_predictions_train, y_train)
    final_prediction = meta_model.predict(base_predictions_test)

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    return final_prediction

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

# Base learners for stacking: the XGBoost and LightGBM instances configured
# earlier plus a default random forest (only the seed is fixed).
base_models = [
    xgb_model,
    RandomForestRegressor(random_state=0),
    lgbm_model
]

# Meta learner that combines the base-model predictions.
meta_model = Ridge(random_state=0)



In [None]:
# final_predictions_2 = stacking_regression_with_rmse(
#     base_models, 
#     meta_model, 
#     X, 
#     y, 
#     X_test,
#     n_splits=5
# )


In [None]:
# final_test_prediction = score_stacking_Kfold(
#     X, 
#     y, 
#     X_test, 
#     base_models, 
#     meta_model)


## Upcoming Experiments


1. Manipulasi fitur teks "Podcast_Name" dengan regex atau teknik lainnya

2. Stacking model


3. Interaksi Genre x TimeOfDay → genre tertentu perform lebih bagus pagi/malam?

4. Coba k-means clustering untuk Podcast_Name → beri cluster ID sebagai feature