# drugie podejscie do czyszczenia danych

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import mstats
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# Ścieżki do plików
TRAIN_PATH = Path("data-task/train.csv")
TEST_PATH = Path("data-task/test.csv")

# Wczytaj dane
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain columns: {train.columns.tolist()}")

Train shape: (750000, 12)
Test shape: (250000, 11)

Train columns: ['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Listening_Time_minutes']


Analiza braków danych

In [2]:
print("=== BRAKI W TRAIN ===")
print(train.isnull().sum())
print(f"\nProcent braków:")
print((train.isnull().sum() / len(train) * 100).round(2))

print("\n=== BRAKI W TEST ===")
print(test.isnull().sum())
print(f"\nProcent braków:")
print((test.isnull().sum() / len(test) * 100).round(2))

=== BRAKI W TRAIN ===
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

Procent braków:
id                              0.00
Podcast_Name                    0.00
Episode_Title                   0.00
Episode_Length_minutes         11.61
Genre                           0.00
Host_Popularity_percentage      0.00
Publication_Day                 0.00
Publication_Time                0.00
Guest_Popularity_percentage    19.47
Number_of_Ads                   0.00
Episode_Sentiment               0.00
Listening_Time_minutes          0.00
dtype: float64

=== BRAKI W TEST ===
id         

Dodanie flag dla brakujących danych

In [3]:
# Ponieważ test ma braki w tych samych kolumnach co train, flagi będą informatywne!

# Flagi dla Episode_Length_minutes
train["Episode_Length_missing"] = train["Episode_Length_minutes"].isnull().astype(int)
test["Episode_Length_missing"] = test["Episode_Length_minutes"].isnull().astype(int)

# Flagi dla Guest_Popularity_percentage
train["Guest_Pop_missing"] = train["Guest_Popularity_percentage"].isnull().astype(int)
test["Guest_Pop_missing"] = test["Guest_Popularity_percentage"].isnull().astype(int)

print("Flagi dodane!")
print(f"Train - Episode_Length_missing: {train['Episode_Length_missing'].sum()}")
print(f"Test - Episode_Length_missing: {test['Episode_Length_missing'].sum()}")
print(f"Train - Guest_Pop_missing: {train['Guest_Pop_missing'].sum()}")
print(f"Test - Guest_Pop_missing: {test['Guest_Pop_missing'].sum()}")

Flagi dodane!
Train - Episode_Length_missing: 87093
Test - Episode_Length_missing: 28736
Train - Guest_Pop_missing: 146030
Test - Guest_Pop_missing: 48832


Usunięcie braków w Number_of_Ads (tylko train)

In [4]:
print(f"Train przed usunięciem: {len(train)}")
train = train.dropna(subset=["Number_of_Ads"])
print(f"Train po usunięciu: {len(train)}")
print(f"Usunięto: {87093 + 146030 + 1 - len(train)} wierszy")

Train przed usunięciem: 750000
Train po usunięciu: 749999
Usunięto: -516875 wierszy


Imputation brakujących wartości

In [5]:
# Obliczamy statystyki TYLKO na train, potem aplikujemy do obu zbiorów.


# Oblicz statystyki na train
ep_len_median = train["Episode_Length_minutes"].median()
guest_pop_mean = train["Guest_Popularity_percentage"].mean()

print(f"Episode_Length median: {ep_len_median}")
print(f"Guest_Popularity mean: {guest_pop_mean}")

# Wypełnij braki
train["Episode_Length_minutes"].fillna(ep_len_median, inplace=True)
test["Episode_Length_minutes"].fillna(ep_len_median, inplace=True)

train["Guest_Popularity_percentage"].fillna(guest_pop_mean, inplace=True)
test["Guest_Popularity_percentage"].fillna(guest_pop_mean, inplace=True)

print("\nImputation zakończony!")
print(f"Train braki: {train.isnull().sum().sum()}")
print(f"Test braki: {test.isnull().sum().sum()}")

Episode_Length median: 63.84
Guest_Popularity mean: 52.23645285195598

Imputation zakończony!
Train braki: 0
Test braki: 0


Winsorization outlierów (zamiast usuwania!)

In [6]:
# Winsorization dla Episode_Length_minutes


ep_len_winsorized = mstats.winsorize(train["Episode_Length_minutes"], limits=[0.01, 0.01])
ep_lower, ep_upper = ep_len_winsorized.min(), ep_len_winsorized.max()

print(f"Episode_Length bounds: [{ep_lower:.2f}, {ep_upper:.2f}]")

train["Episode_Length_minutes"] = train["Episode_Length_minutes"].clip(ep_lower, ep_upper)
test["Episode_Length_minutes"] = test["Episode_Length_minutes"].clip(ep_lower, ep_upper)

# Winsorization dla Number_of_Ads
ads_winsorized = mstats.winsorize(train["Number_of_Ads"], limits=[0.01, 0.01])
ads_lower, ads_upper = ads_winsorized.min(), ads_winsorized.max()

print(f"Number_of_Ads bounds: [{ads_lower:.2f}, {ads_upper:.2f}]")

train["Number_of_Ads"] = train["Number_of_Ads"].clip(ads_lower, ads_upper)
test["Number_of_Ads"] = test["Number_of_Ads"].clip(ads_lower, ads_upper)

print("\nWinsorization zakończony!")

Episode_Length bounds: [6.78, 118.89]
Number_of_Ads bounds: [0.00, 3.00]

Winsorization zakończony!


Usunięcie duplikatów (tylko train!)

In [7]:
print(f"Train przed usunięciem duplikatów: {len(train)}")
train = train.drop_duplicates()
print(f"Train po usunięciu duplikatów: {len(train)}")

Train przed usunięciem duplikatów: 749999
Train po usunięciu duplikatów: 749999


Parsowanie czasu publikacji

In [8]:
for df in [train, test]:
    # Połącz dzień i czas w datetime
    df["pub_datetime"] = pd.to_datetime(
        df["Publication_Day"] + " " + df["Publication_Time"],
        errors='coerce'
    )
    
    # Wyciągnij komponenty
    df["day_of_week"] = df["pub_datetime"].dt.dayofweek  # 0=Mon, 6=Sun
    df["hour"] = df["pub_datetime"].dt.hour
    df["month"] = df["pub_datetime"].dt.month
    df["day_of_month"] = df["pub_datetime"].dt.day

print("Czas sparsowany!")
print(f"Przykładowe godziny: {train['hour'].head()}")

Czas sparsowany!
Przykładowe godziny: 0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: hour, dtype: float64


Feature Engineering - Podstawowe features

In [None]:
for df in [train, test]:
    # === Czas publikacji ===
    df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
    df["is_primetime"] = ((df["hour"] >= 17) & (df["hour"] <= 21)).astype(int)
    df["is_morning"] = ((df["hour"] >= 6) & (df["hour"] <= 12)).astype(int)
    df["is_night"] = ((df["hour"] >= 22) | (df["hour"] <= 5)).astype(int)

    # zobrazowac te numery na wykresie i potem spraedzic np jaki jest prime time

    # === Interakcje numeryczne ===
    df["ads_per_minute"] = df["Number_of_Ads"] / (df["Episode_Length_minutes"] + 1)
    df["total_popularity"] = df["Host_Popularity_percentage"] + df["Guest_Popularity_percentage"]
    df["popularity_ratio"] = df["Host_Popularity_percentage"] / (df["Guest_Popularity_percentage"] + 1)
    df["popularity_diff"] = df["Host_Popularity_percentage"] - df["Guest_Popularity_percentage"]
    
    # === Interakcje z flagami missing ===
    df["missing_guest_x_host_pop"] = df["Guest_Pop_missing"] * df["Host_Popularity_percentage"]
    df["missing_length_x_ads"] = df["Episode_Length_missing"] * df["Number_of_Ads"]
    df["missing_guest_x_episode_length"] = df["Guest_Pop_missing"] * df["Episode_Length_minutes"]
    
    # === Interakcje czasowe ===
    df["weekend_x_primetime"] = df["is_weekend"] * df["is_primetime"]
    df["weekend_x_ads"] = df["is_weekend"] * df["Number_of_Ads"]
    
    # === Sentiment jako numeric ===
    sentiment_map = {"positive": 1, "neutral": 0, "negative": -1}
    df["sentiment_numeric"] = df["Episode_Sentiment"].map(sentiment_map).fillna(0)
    
    df["sentiment_x_guest_pop"] = df["sentiment_numeric"] * df["Guest_Popularity_percentage"]
    df["sentiment_x_host_pop"] = df["sentiment_numeric"] * df["Host_Popularity_percentage"]
    df["negative_sentiment_x_ads"] = (df["sentiment_numeric"] == -1).astype(int) * df["Number_of_Ads"]

print(f"Podstawowe features dodane! Train shape: {train.shape}")

Podstawowe features dodane! Train shape: (749999, 36)


Text Features z tytułów

In [None]:
for df in [train, test]:
    # Długość stringów
    df["episode_title_len"] = df["Episode_Title"].astype(str).str.len()
    df["podcast_name_len"] = df["Podcast_Name"].astype(str).str.len()
    
    # Liczba słów
    df["episode_title_words"] = df["Episode_Title"].astype(str).str.split().str.len()
    df["podcast_name_words"] = df["Podcast_Name"].astype(str).str.split().str.len()
    
    # Czy zawiera cyfry/znaki specjalne
    df["title_has_numbers"] = df["Episode_Title"].astype(str).str.contains(r'\d', na=False).astype(int)
    df["title_has_special"] = df["Episode_Title"].astype(str).str.contains(r'[!?#]', na=False).astype(int)
    
    # Czy to odcinek specjalny?
    df["is_special_episode"] = df["Episode_Title"].astype(str).str.contains(
        r'special|bonus|exclusive|interview|live', case=False, na=False
    ).astype(int)
    
    # Uppercase ratio (clickbait?)
    df["title_uppercase_ratio"] = df["Episode_Title"].astype(str).apply(
        lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1)
    )

print("Text features dodane!")

# do wywalenia, zamiast tego embeddingi

Text features dodane!


Frequency Encoding dla high-cardinality features

In [None]:
# Podcast frequency
podcast_freq = train["Podcast_Name"].value_counts()
train["podcast_frequency"] = train["Podcast_Name"].map(podcast_freq).fillna(0)
test["podcast_frequency"] = test["Podcast_Name"].map(podcast_freq).fillna(0)

# Genre frequency
genre_freq = train["Genre"].value_counts()
train["genre_frequency"] = train["Genre"].map(genre_freq).fillna(0)
test["genre_frequency"] = test["Genre"].map(genre_freq).fillna(0)

# Normalizacja (0-1)
train["podcast_frequency_norm"] = train["podcast_frequency"] / len(train)
test["podcast_frequency_norm"] = test["podcast_frequency"] / len(train)

train["genre_frequency_norm"] = train["genre_frequency"] / len(train)
test["genre_frequency_norm"] = test["genre_frequency"] / len(train)

print("Frequency encoding zakończony!")
print(f"Top 5 podcasts by frequency:\n{podcast_freq.head()}")

Frequency encoding zakończony!
Top 5 podcasts by frequency:
Podcast_Name
Tech Talks       22847
Sports Weekly    20053
Funny Folks      19635
Tech Trends      19549
Fitness First    19488
Name: count, dtype: int64


Agregacje per Podcast

In [12]:
# Statystyki per Podcast (obliczane TYLKO na train!)
podcast_stats = train.groupby("Podcast_Name").agg({
    "Listening_Time_minutes": ["mean", "median", "std", "min", "max"],
    "Episode_Length_minutes": ["mean", "median"],
    "Number_of_Ads": ["mean", "median"],
    "Host_Popularity_percentage": "first",
    "Guest_Popularity_percentage": "mean"
}).reset_index()

# Spłaszcz kolumny
podcast_stats.columns = [
    "Podcast_Name",
    "podcast_avg_listening", "podcast_med_listening", "podcast_std_listening",
    "podcast_min_listening", "podcast_max_listening",
    "podcast_avg_length", "podcast_med_length",
    "podcast_avg_ads", "podcast_med_ads",
    "podcast_host_pop", "podcast_avg_guest_pop"
]

# Wypełnij std NaN (podcasty z 1 odcinkiem)
podcast_stats["podcast_std_listening"].fillna(0, inplace=True)

# Merguj do train i test
train = train.merge(podcast_stats, on="Podcast_Name", how="left")
test = test.merge(podcast_stats, on="Podcast_Name", how="left")

print(f"Podcast stats dodane! Train shape: {train.shape}")

Podcast stats dodane! Train shape: (749999, 59)


Agregacje per Genre

In [13]:
# Statystyki per Genre
genre_stats = train.groupby("Genre").agg({
    "Listening_Time_minutes": ["mean", "median", "std"],
    "Guest_Popularity_percentage": "mean",
    "Episode_Length_minutes": "mean"
}).reset_index()

genre_stats.columns = [
    "Genre",
    "genre_avg_listening", "genre_med_listening", "genre_std_listening",
    "genre_avg_guest_pop", "genre_avg_length"
]

genre_stats["genre_std_listening"].fillna(0, inplace=True)

# Merguj
train = train.merge(genre_stats, on="Genre", how="left")
test = test.merge(genre_stats, on="Genre", how="left")

print(f"Genre stats dodane! Train shape: {train.shape}")

Genre stats dodane! Train shape: (749999, 64)


Relative Features (odcinek vs średnia podcastu/gatunku)

In [14]:
for df in [train, test]:
    # Porównanie z podcast
    df["length_vs_podcast_avg"] = df["Episode_Length_minutes"] / (df["podcast_avg_length"] + 1)
    df["ads_vs_podcast_avg"] = df["Number_of_Ads"] / (df["podcast_avg_ads"] + 1)
    df["guest_pop_vs_podcast_avg"] = df["Guest_Popularity_percentage"] / (df["podcast_avg_guest_pop"] + 1)
    
    # Porównanie z genre
    df["length_vs_genre_avg"] = df["Episode_Length_minutes"] / (df["genre_avg_length"] + 1)
    
    # Czy ten odcinek jest powyżej/poniżej średniej?
    df["above_podcast_avg_length"] = (df["Episode_Length_minutes"] > df["podcast_avg_length"]).astype(int)
    df["above_genre_avg_length"] = (df["Episode_Length_minutes"] > df["genre_avg_length"]).astype(int)

print("Relative features dodane!")

Relative features dodane!


Target Encoding dla kategorycznych (z CV leak protection!)

In [None]:
def target_encode_with_cv(train_df, test_df, cat_col, target_col, n_splits=5, smoothing=10):
    """
    Target encoding z CV żeby uniknąć leakage
    """
    global_mean = train_df[target_col].mean()
    
    # Inicjalizacja
    train_df[f"{cat_col}_target_enc"] = global_mean
    
    # K-Fold CV dla train
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    for train_idx, val_idx in kf.split(train_df):
        train_fold = train_df.iloc[train_idx]
        
        # Oblicz statystyki na foldzie treningowym
        agg = train_fold.groupby(cat_col)[target_col].agg(['mean', 'count'])
        
        # Smoothing (Bayesian average)
        smoothed = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
        
        # Mapuj na fold walidacyjny
        train_df.loc[val_idx, f"{cat_col}_target_enc"] = train_df.loc[val_idx, cat_col].map(smoothed).fillna(global_mean)
    
    # Dla test użyj całego train
    agg_full = train_df.groupby(cat_col)[target_col].agg(['mean', 'count'])
    smoothed_full = (agg_full['count'] * agg_full['mean'] + smoothing * global_mean) / (agg_full['count'] + smoothing)
    test_df[f"{cat_col}_target_enc"] = test_df[cat_col].map(smoothed_full).fillna(global_mean)
    
    return train_df, test_df

# Zastosuj target encoding
print("Target encoding w toku (może potrwać ~1min)...")

for col in ["Podcast_Name", "Genre"]:
    train, test = target_encode_with_cv(train, test, col, "Listening_Time_minutes", smoothing=10)
    print(f"✓ {col} zakończony")

print(f"\nTarget encoding zakończony! Train shape: {train.shape}")

# po co to?


Target encoding w toku (może potrwać ~1min)...
✓ Podcast_Name zakończony
✓ Genre zakończony

Target encoding zakończony! Train shape: (749999, 72)


Konwersja object -> string (dla AutoGluon)

In [16]:
for col in train.select_dtypes(include=["object"]).columns:
    if col in train.columns:
        train[col] = train[col].astype("string")
    if col in test.columns:
        test[col] = test[col].astype("string")

print("Konwersja typów zakończona!")
print(f"\nTrain dtypes:\n{train.dtypes.value_counts()}")

Konwersja typów zakończona!

Train dtypes:
float64           46
int64             19
string[python]     6
datetime64[ns]     1
Name: count, dtype: int64


Wypełnienie ewentualnych NaN powstałych przy mergowaniu

In [None]:
# Dla nowych podcastów/gatunków w test, które nie były w train
fill_cols = [col for col in train.columns if 'podcast_' in col or 'genre_' in col]

for col in fill_cols:
    if col in train.columns and col in test.columns:
        train[col].fillna(0, inplace=True)
        test[col].fillna(0, inplace=True)

print(f"Train final missing: {train.isnull().sum().sum()}")
print(f"Test final missing: {test.isnull().sum().sum()}")

# d;aczego tak duzo?


Train final missing: 3749995
Test final missing: 1250000


Podsumowanie finalnych features

In [18]:
print(f"FINALNE STATYSTYKI:")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nLiczba features: {train.shape[1] - 1}")  # -1 bo target
print(f"\nNowe features (przykłady):")
new_features = [col for col in train.columns if col not in pd.read_csv(TRAIN_PATH).columns]
print(f"Dodano {len(new_features)} nowych features")
print("\nPrzykładowe nowe features:")
for feat in new_features[:15]:
    print(f"  - {feat}")

FINALNE STATYSTYKI:
Train shape: (749999, 72)
Test shape: (250000, 71)

Liczba features: 71

Nowe features (przykłady):
Dodano 60 nowych features

Przykładowe nowe features:
  - Episode_Length_missing
  - Guest_Pop_missing
  - pub_datetime
  - day_of_week
  - hour
  - month
  - day_of_month
  - is_weekend
  - is_primetime
  - is_morning
  - is_night
  - ads_per_minute
  - total_popularity
  - popularity_ratio
  - popularity_diff


Zapisanie przetworzonych danych

In [20]:
# Zapisz
output_train = Path("data-generated/train_advanced_features.csv")
output_test = Path("data-generated/test_advanced_features.csv")

train.to_csv(output_train, index=False)
test.to_csv(output_test, index=False)

print(f"Pliki zapisane:")
print(f"Train: {output_train}")
print(f"Test: {output_test}")

Pliki zapisane:
Train: data-generated/train_advanced_features.csv
Test: data-generated/test_advanced_features.csv


Quick check - preview finalnych danych

In [22]:
print("=== TRAIN PREVIEW ===")
print(train.head(3))
print("\n=== TRAIN INFO ===")
print(train.info())
print("\n=== BASIC STATS ===")
print(train.describe())

=== TRAIN PREVIEW ===
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                   63.84  True Crime   
1   1    Joke Junction    Episode 26                  118.89      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   

   Guest_Popularity_percentage  Number_of_Ads  ... genre_avg_guest_pop  \
0                    52.236453            0.0  ...           52.626684   
1                    75.950000            2.0  ...           53.156679   
2                     8.970000            0.0  ...           52.298693   

   genre_avg_length  length_vs_podcast_avg  ads_vs_podcast_avg  \
0         64.459676               0.977233   

Trening w AutoGluon z optymalizacją

In [24]:
from autogluon.tabular import TabularPredictor
import pandas as pd

# Wczytaj dane
train = pd.read_csv("data-generated/train_advanced_features.csv")
test = pd.read_csv("data-generated/test_advanced_features.csv")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Usuń kolumny, które nie powinny być w treningu
drop_cols = ["id", "pub_datetime", "Publication_Day", "Publication_Time"]
train_features = train.drop(columns=[col for col in drop_cols if col in train.columns])
test_features = test.drop(columns=[col for col in drop_cols if col in test.columns])

print(f"\nTrain po usunięciu: {train_features.shape}")
print(f"Test po usunięciu: {test_features.shape}")

Train shape: (749999, 72)
Test shape: (250000, 71)

Train po usunięciu: (749999, 68)
Test po usunięciu: (250000, 67)


Konfiguracja AutoGluon z zaawansowanymi parametrami

In [None]:
predictor = TabularPredictor(
    label="Listening_Time_minutes",
    eval_metric="root_mean_squared_error",
    problem_type="regression",
    path="models/autogluon_advanced"  # Zapisz modele
).fit(
    train_data=train_features,
    # time_limit=3600,  # 1 godzina
    presets="best_quality",
    num_bag_folds=5,  # K-fold bagging dla stabilności
    num_bag_sets=1,
    num_stack_levels=1,  # Stacking dla lepszych predykcji
    hyperparameters={
        'GBM': [
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
            {},  # Default LightGBM
            {'learning_rate': 0.03, 'num_leaves': 128, 'ag_args': {'name_suffix': 'Custom'}},
        ],
        'CAT': {},  # CatBoost
        'XGB': {},  # XGBoost
        'RF': [
            {'criterion': 'squared_error', 'max_depth': 20, 'ag_args': {'name_suffix': 'Deep'}},
        ],
        'XT': [  # Extra Trees
            {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE'}},
        ],
    },
    excluded_model_types=['KNN', 'NN_TORCH'],  # Usuń słabe modele dla regresji
    verbosity=2
)

# moze da sie jakos lepiej dobrac parametry?

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #32~20.04.1-Ubuntu SMP Mon Jan 9 18:02:08 UTC 2023
CPU Count:          4
Memory Avail:       11.09 GB / 15.44 GB (71.8%)
Disk Space Avail:   89.56 GB / 131.62 GB (68.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of th

[36m(_ray_fit pid=5224)[0m [1000]	valid_set's rmse: 13.0175
[36m(_ray_fit pid=5222)[0m [1000]	valid_set's rmse: 13.1137[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m


[36m(_ray_fit pid=5221)[0m 	Ran out of time, early stopping on iteration 1611. Best iteration is:
[36m(_ray_fit pid=5221)[0m 	[1594]	valid_set's rmse: 13.0182


[36m(_ray_fit pid=5438)[0m [1000]	valid_set's rmse: 13.0658[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=5438)[0m [2000]	valid_set's rmse: 13.0602


[36m(_ray_fit pid=5438)[0m 	Ran out of time, early stopping on iteration 2561. Best iteration is:[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=5438)[0m 	[2280]	valid_set's rmse: 13.0586[32m [repeated 4x across cluster][0m
[36m(_dystack pid=4889)[0m 	-13.0526	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4889)[0m 	474.57s	 = Training   runtime
[36m(_dystack pid=4889)[0m 	157.29s	 = Validation runtime
[36m(_dystack pid=4889)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 33.38s of the 325.01s of remaining time.
[36m(_dystack pid=4889)[0m 	Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=1, gpus=0, memory=13.33%)
[36m(_ray_fit pid=5665)[0m 	Ran out of time, early stopping on iteration 28. Best iteration is:
[36m(_ray_fit pid=5665)[0m 	[28]	valid_set's rmse: 14.3565
[36m(_ray_fit pid=5799)[0m 	Ran out of time, early stopping on iteration 50. Best iteration is:[32

Trening zakończony (EZZZZ????)


Analiza modelu - Feature Importance

In [26]:
# Feature importance
importance = predictor.feature_importance(train_features)
print("\n=== TOP 20 NAJWAŻNIEJSZYCH FEATURES ===")
print(importance.head(20))

# Zapisz importance
importance.to_csv("feature_importance.csv")
print("Feature importance zapisane do: feature_importance.csv")

These features in provided data are not utilized by the predictor and will be ignored: ['day_of_week', 'hour', 'month', 'day_of_month', 'is_weekend', 'is_primetime', 'is_morning', 'is_night', 'weekend_x_primetime', 'weekend_x_ads', 'sentiment_numeric', 'sentiment_x_guest_pop', 'sentiment_x_host_pop', 'negative_sentiment_x_ads', 'episode_title_words', 'title_has_numbers', 'title_has_special', 'is_special_episode', 'podcast_min_listening', 'podcast_med_length', 'podcast_med_ads']
Computing feature importance via permutation shuffling for 46 features using 5000 rows with 5 shuffle sets...
	1606.22s	= Expected runtime (321.24s per shuffle set)
	1368.28s	= Actual runtime (Completed 5 of 5 shuffle sets)



=== TOP 20 NAJWAŻNIEJSZYCH FEATURES ===
                                importance    stddev       p_value  n  \
Episode_Length_minutes           15.413678  0.224082  5.358714e-09  5   
length_vs_genre_avg               1.551974  0.062060  3.061643e-07  5   
Episode_Title                     1.129664  0.076337  2.487102e-06  5   
Podcast_Name                      0.822746  0.069175  5.940585e-06  5   
ads_per_minute                    0.710582  0.100078  4.599167e-05  5   
Host_Popularity_percentage        0.549408  0.042835  4.398192e-06  5   
Episode_Length_missing            0.521614  0.050632  1.052093e-05  5   
length_vs_podcast_avg             0.383378  0.028407  3.590806e-06  5   
Guest_Popularity_percentage       0.231754  0.023376  1.225415e-05  5   
Episode_Sentiment                 0.131353  0.024774  1.449097e-04  5   
guest_pop_vs_podcast_avg          0.126991  0.011756  8.712472e-06  5   
popularity_diff                   0.104842  0.006421  1.680308e-06  5   
total_popu

24. Leaderboard modeli

In [27]:
leaderboard = predictor.leaderboard(train_features, silent=True)
print("\n=== LEADERBOARD MODELI ===")
print(leaderboard)

# print(f"\nNajlepszy model: {predictor.get_model_best()}")
print(f"RMSE najlepszego modelu: {leaderboard['score_val'].iloc[0]:.4f}")


=== LEADERBOARD MODELI ===
                     model  score_test  score_val              eval_metric  \
0          LightGBM_BAG_L1  -11.569711 -12.990736  root_mean_squared_error   
1  RandomForestDeep_BAG_L2  -11.867125 -13.067457  root_mean_squared_error   
2      WeightedEnsemble_L3  -11.910536 -12.944424  root_mean_squared_error   
3    LightGBMCustom_BAG_L2  -11.934366 -12.970425  root_mean_squared_error   
4      WeightedEnsemble_L2  -11.939326 -12.955695  root_mean_squared_error   
5          LightGBM_BAG_L2  -11.994666 -12.960717  root_mean_squared_error   
6        LightGBMXT_BAG_L1  -12.075988 -13.030786  root_mean_squared_error   
7        LightGBMXT_BAG_L2  -12.176237 -12.995737  root_mean_squared_error   
8    LightGBMCustom_BAG_L1  -12.443026 -13.003249  root_mean_squared_error   

   pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  \
0      367.475880     238.353383   711.494490               367.475880   
1      803.108906     485.183036  2183.4444

Predykcja na test

In [28]:
# Predykcja
predictions = predictor.predict(test_features)

# Sprawdź statystyki predykcji
print("\n=== STATYSTYKI PREDYKCJI ===")
print(f"Min: {predictions.min():.2f}")
print(f"Max: {predictions.max():.2f}")
print(f"Mean: {predictions.mean():.2f}")
print(f"Median: {predictions.median():.2f}")

# Sprawdź czy są wartości ujemne (błąd!)
if (predictions < 0).any():
    print(f"UWAGA: {(predictions < 0).sum()} predykcji ujemnych! Clipowanie do 0...")
    predictions = predictions.clip(lower=0)


=== STATYSTYKI PREDYKCJI ===
Min: 0.51
Max: 113.51
Mean: 45.46
Median: 44.62


Zapisanie submission

In [29]:
# Stwórz submission
submission = test[["id"]].copy()
submission["Listening_Time_minutes"] = predictions

# Zapisz
submission.to_csv("submission.csv", index=False)

print("\nSubmission zapisany do: submission.csv")
print(f"Shape: {submission.shape}")
print("\nPierwsze 5 wierszy:")
print(submission.head())

#hopefully malutki rmse (plsplspls)


Submission zapisany do: submission.csv
Shape: (250000, 2)

Pierwsze 5 wierszy:
       id  Listening_Time_minutes
0  750000               53.896751
1  750001               18.823473
2  750002               49.512531
3  750003               81.293648
4  750004               47.252350


In [None]:
# sprawdz:
# jaki jest rozklad danych?
# w sensie jak sie rozkladaja wszystkie waerosci w kolumnach

# dodalbym zamiane tytułu podcastu na jakies embeddingi, zeby model mial wiecej informacji o tresci
# proces embeddingow:
# 1. SentenceTransformers zeby zamienic tkst na wektor
# 2. sprawdzic czy są w jakims stopniu podobne (np cosine similarity)
# 3. cluterowanie na podstawie similarity threshhold np: pary z similarity > 0.8 sa w tym samym clusterze
# 4. dodanie do modelu jako feature, np cluster i potem caly embedding