In [1]:
import os
import pandas as pd
from caafe import CAAFEClassifier

import warnings
# Silence every warning for the rest of the kernel session: no category,
# module or message filter is given, so the "ignore" action applies to all.
# NOTE(review): this also hides warnings raised by any library used later in
# the notebook - consider narrowing the filter to the categories that are
# actually noisy.
warnings.filterwarnings("ignore")

In [2]:
# The OpenAI key must come from the kernel's environment (e.g. exported in the
# shell that launched Jupyter, or injected by the platform's secrets manager).
# It is intentionally NOT assigned here: hardcoding it would commit a
# credential to the notebook, and assigning a placeholder would overwrite a
# real key that is already configured.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before starting the kernel "
        "instead of hardcoding it in this notebook."
    )

In [3]:
# Dataset locations: every file lives directly under one base directory.
base_path = "base-path"

# Join each file name onto the base directory in a single pass; the unpacking
# order matches the order of the file names in the tuple.
train_path, test_path, submission_path, description_path = (
    os.path.join(base_path, file_name)
    for file_name in (
        "train.csv",
        "test.csv",
        "sample_submission.csv",
        "description.md",
    )
)

In [4]:
# Read the train and test CSVs (in that order), using the "id" column as the
# row index of both frames.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="id") for csv_path in (train_path, test_path)
)

In [5]:
# Show the training frame as the cell output; the recorded output is a
# truncated preview (first and last rows with "..." in between), indexed by id
# and including the Listening_Time_minutes target column.
df_train

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031
...,...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,Episode 25,75.66,Education,69.36,Saturday,Morning,,0.0,Negative,56.87058
749996,Business Briefs,Episode 21,75.75,Business,35.21,Saturday,Night,,2.0,Neutral,45.46242
749997,Lifestyle Lounge,Episode 51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative,15.26000
749998,Style Guide,Episode 47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative,100.72939


In [6]:
# Column that the model has to predict.
target_column_name = "Listening_Time_minutes"

# Read the whole competition description (UTF-8 text) into memory; it is
# passed to CAAFE later as the dataset description.
with open(description_path, mode='r', encoding='utf-8') as description_file:
    dataset_description = description_file.read()

In [7]:
# Print the raw description text so the reader can see exactly what will be
# handed to CAAFE as `dataset_description`.
print(dataset_description)

# Predict Podcast Listening Time

## Overview
Your Goal: Your task it to predict listening time of a podcast episode.

## Evaluation
Submissions are scored on the root mean squared error. RMSE is defined as:

### Root Mean Squared Error (RMSE)

Submissions are scored on the root mean squared error. RMSE is defined as:

\[
\text{RMSE} = \left( \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2 \right)^{\frac{1}{2}}
\]

where \(\hat{y}_i\) is the predicted value and \(y_i\) is the original value for each instance \(i\).

## Submission File
For each id in the test set, you must predict the Listening_Time_minutes of the podcast. The file should contain a header and have the following format:
```
id,Listening_Time_minutes
750000,45.437
750001,45.437
750002,45.437
etc.
```


## Dataset Description
The dataset for this competition (both train and test) was generated from a deep learning model trained on the [Podcast Listening Time Prediction](https://www.kaggle.com/datasets/ysthehurricane/podcast

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Base model handed to CAAFE: a small forest (10 trees, depth 2).
# random_state is fixed so the forest's bootstrap/feature sampling is
# repeatable across kernel restarts; without it the "before" and "after"
# scores are not reproducible from run to run.
clf_no_feat_eng = RandomForestRegressor(
    n_estimators=10,
    max_depth=2,
    random_state=42,
)

# CAAFE wrapper around the base model. The arguments select the LLM, the
# number of feature-generation iterations, and the n_splits/n_repeats values.
# NOTE(review): n_splits/n_repeats presumably configure the repeated k-fold
# evaluation whose per-fold scores appear in the fit log - confirm against the
# CAAFE version in use.
caafe_clf = CAAFEClassifier(
    base_classifier=clf_no_feat_eng,
    llm_model="gpt-4.1",
    iterations=2,
    n_splits=5,
    n_repeats=1,
)

In [9]:
# Fit on the full training frame as a regression task scored with RMSE.
# The recorded output logs, for each iteration, the generated feature code
# together with per-fold and overall RMSE before and after adding the feature.
caafe_clf.fit_pandas(
    df_train,
    target_column_name=target_column_name,
    dataset_description=dataset_description,
    metric_to_use="rmse",
    task="regression",
)

# *Dataset description:*
 # Predict Podcast Listening Time

## Overview
Your Goal: Your task it to predict listening time of a podcast episode.

## Evaluation
Submissions are scored on the root mean squared error. RMSE is defined as:

### Root Mean Squared Error (RMSE)

Submissions are scored on the root mean squared error. RMSE is defined as:

\[
\text{RMSE} = \left( \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2 \right)^{\frac{1}{2}}
\]

where \(\hat{y}_i\) is the predicted value and \(y_i\) is the original value for each instance \(i\).

## Submission File
For each id in the test set, you must predict the Listening_Time_minutes of the podcast. The file should contain a header and have the following format:
```
id,Listening_Time_minutes
750000,45.437
750001,45.437
750002,45.437
etc.
```


## Dataset Description
The dataset for this competition (both train and test) was generated from a deep learning model trained on the [Podcast Listening Time Prediction](https://www.kaggle.com/datasets/ysthehurricane/podcast-listening-time-prediction-dataset) dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.

## Files
- train.csv - the training dataset; Listening_Time_minutes is the target
- test.csv - the test dataset; your objective is to predict the Listening_Time_minutes for each row
- sample_submission.csv - a sample submission file in the correct format.

> Fold-1:- Old: 14.846556240648015, New: 14.84655648339926

> Fold-2:- Old: 14.869972722103373, New: 14.869971863210957

> Fold-3:- Old: 14.800105867829267, New: 14.800105599824452

> Fold-4:- Old: 14.79527971106997, New: 14.79527927336654

> Fold-5:- Old: 14.85005614142956, New: 14.850056485383428


*Iteration 1*
```python

# (Episode_Length_minutes_filled: Episode length with missing values imputed by the median per Podcast_Name.)
# Usefulness: Imputing missing episode lengths with the median length for each podcast adds real world knowledge, as podcasts t to have consistent episode lengths and missing values can otherwise introduce noise.
# Input samples: 'Podcast_Name': ['Mystery Matters', 'Joke Junction', 'Study Sessions'], 'Episode_Length_minutes': [np.nan, 73.9, np.nan]
df['Episode_Length_minutes_filled'] = df.groupby('Podcast_Name')['Episode_Length_minutes'].transform(
    lambda x: pd.to_numeric(x, errors='coerce').fillna(pd.to_numeric(x, errors='coerce').median())
)

```


- Performance before adding features rmse : 14.832394136616037
- Performance after adding features rmse  : 14.832393941036926
- Improvement in rmse: 1.9557911024037367e-07
- The code was executed and changes to ´df´ were kept.

> Fold-1:- Old: 14.84655648339926, New: 14.846556241764686

> Fold-2:- Old: 14.869971863210957, New: 14.869971332010904

> Fold-3:- Old: 14.800105599824452, New: 14.800105752855881

> Fold-4:- Old: 14.79527927336654, New: 14.795279351317872

> Fold-5:- Old: 14.850056485383428, New: 14.850056662278943


*Iteration 2*
```python

# (Host_Guest_Popularity_Ratio: Ratio between Host and Guest popularity percentages)
# Usefulness: The relative popularity between host and guest may influence listening time, e.g., more popular hosts relative to guests may retain listeners better, or vice versa. This captures their interaction.
# Input samples: 'Host_Popularity_percentage': [74.81, 66.95, 69.97], 'Guest_Popularity_percentage': [75.95, 8.97, 78.7]
df['Host_Guest_Popularity_Ratio'] = pd.to_numeric(df['Host_Popularity_percentage'], errors='coerce') / pd.to_numeric(df['Guest_Popularity_percentage'], errors='coerce')

```


- Performance before adding features rmse : 14.832393941036926
- Performance after adding features rmse  : 14.832393868045656
- Improvement in rmse: 7.299126991711091e-08
- The code was executed and changes to ´df´ were kept.

In [10]:
# Predict the target for the test frame with the fitted CAAFE wrapper.
# NOTE(review): presumably predict() re-applies the generated feature code
# (see caafe_clf.code) to df_test before calling the base model - confirm.
pred = caafe_clf.predict(df_test)

In [11]:
# Sanity check: number of predictions produced (recorded output: 250000).
# This should equal the number of rows in df_test.
len(pred)

250000

In [12]:
# Start from the sample submission, which already has the id column and the
# Listening_Time_minutes column in the expected layout.
submission = pd.read_csv(submission_path)

# The predictions are written into the sample file row by row, so the sample
# file must list the same ids in the same order as the test frame that was
# passed to predict(). Fail loudly instead of writing a misaligned submission.
# NOTE(review): assumes `pred` is in df_test row order - confirm.
assert pd.Index(submission["id"]).equals(df_test.index), (
    "sample_submission ids do not match the test set ids/order"
)

submission['Listening_Time_minutes'] = pred

# Plain string literal: the previous f-string had no placeholders.
submission.to_csv("caffe_submission.csv", index=False)

# Display the frame that was just written.
submission

Unnamed: 0,id,Listening_Time_minutes
0,750000,57.908991
1,750001,14.630490
2,750002,39.729210
3,750003,77.685618
4,750004,57.908991
...,...,...
249995,999995,14.630490
249996,999996,57.908991
249997,999997,14.630490
249998,999998,77.685618


In [13]:
# Print the feature-engineering code stored on the fitted wrapper; the
# recorded output contains the snippets that the fit log reported as kept.
print(caafe_clf.code)


# (Episode_Length_minutes_filled: Episode length with missing values imputed by the median per Podcast_Name.)
# Usefulness: Imputing missing episode lengths with the median length for each podcast adds real world knowledge, as podcasts t to have consistent episode lengths and missing values can otherwise introduce noise.
# Input samples: 'Podcast_Name': ['Mystery Matters', 'Joke Junction', 'Study Sessions'], 'Episode_Length_minutes': [np.nan, 73.9, np.nan]
df['Episode_Length_minutes_filled'] = df.groupby('Podcast_Name')['Episode_Length_minutes'].transform(
    lambda x: pd.to_numeric(x, errors='coerce').fillna(pd.to_numeric(x, errors='coerce').median())
)

# (Host_Guest_Popularity_Ratio: Ratio between Host and Guest popularity percentages)
# Usefulness: The relative popularity between host and guest may influence listening time, e.g., more popular hosts relative to guests may retain listeners better, or vice versa. This captures their interaction.
# Input samples: 'Host_Popularity_per