In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e4/train.csv',
        '/kaggle/input/playground-series-s5e4/test.csv',
    )
)

In [None]:
# Display the test DataFrame loaded above.
test_data

In [None]:
# Display the training DataFrame loaded above.
train_data

In [None]:
# Summary statistics of the training data.
train_data.describe()

In [None]:
# Column dtypes and non-null counts of the training data.
train_data.info()

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Percentage of missing values in each training column (mean of the null mask x 100).
train_data.isnull().mean()*100

**Several columns have many missing values, so filling them with the mean/median would distort their distributions (e.g., shrink the variance) and may not be ideal.**

* **For Guest_Popularity_percentage, nearly 1 in 5 values are missing. That’s a red flag for simple imputation.**

In [None]:
# Work on a copy so the raw training frame stays available unchanged
# (it is used later as the "Before" series in the imputation histograms).
train_new=train_data.copy()

# **IterativeImputer from scikit-learn:**

In [None]:
# IterativeImputer is experimental in scikit-learn: importing
# `enable_iterative_imputer` is what makes it importable from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Select numeric columns only for MICE (avoid strings/categoricals)
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage',
            'Guest_Popularity_percentage', 'Number_of_Ads',]

# View of the training data restricted to the numeric columns
numeric_data = train_new[num_cols]

# Fit the iterative imputer on the training data and fill its missing values.
# `imp` stays fitted so the same imputation model can be applied to other data.
imp = IterativeImputer(random_state=42)
imputed_data = imp.fit_transform(numeric_data)

# Rebuild a DataFrame that carries train_new's own index. Without it the frame
# gets a fresh 0..n-1 index and the column assignments below would silently
# misalign (or produce NaN) whenever train_new's index is not 0..n-1.
train_nouse = pd.DataFrame(imputed_data, columns=num_cols, index=train_new.index)

# Only these two columns are written back; the popularity columns keep their
# missing values, which the popularity-bucketing function handles itself.
train_new['Episode_Length_minutes'] = train_nouse['Episode_Length_minutes']
train_new['Number_of_Ads'] = train_nouse['Number_of_Ads']

In [None]:
# Define the function to categorize popularity
def popular(row):
    """Bucket an episode into a popularity level from its host/guest scores.

    The level is based on the mean of whichever of the two popularity scores
    are present. Previously a missing host score produced a NaN average, every
    threshold comparison was False, and the row was silently labelled
    'Very Popular'.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide 'Host_Popularity_percentage' and
        'Guest_Popularity_percentage'; either value may be missing (NaN/None).

    Returns
    -------
    str
        'Not Very Popular' (avg <= 20), 'Not Popular' (<= 40),
        'Average' (<= 60), 'Popular' (<= 85), 'Very Popular' (> 85),
        or 'Unknown' when both scores are missing.
    """
    available = [
        score
        for score in (row['Host_Popularity_percentage'],
                      row['Guest_Popularity_percentage'])
        if not pd.isna(score)
    ]
    if not available:
        # No information at all: do not pretend the episode is popular.
        return 'Unknown'

    avg_popularity = sum(available) / len(available)

    # Thresholds are checked in ascending order, so each branch only needs
    # its upper bound.
    if avg_popularity <= 20:
        return 'Not Very Popular'
    elif avg_popularity <= 40:
        return 'Not Popular'
    elif avg_popularity <= 60:
        return 'Average'
    elif avg_popularity <= 85:
        return 'Popular'
    else:
        return 'Very Popular'

# Label every training row with its popularity bucket, then discard the two
# raw popularity columns the bucket was derived from.
train_new['Popular_Level'] = train_new.apply(popular, axis=1)
train_new = train_new.drop(
    columns=['Host_Popularity_percentage', 'Guest_Popularity_percentage']
)

In [None]:
# Percentage of missing values per column in the working copy, after the
# imputation and popularity-bucketing steps.
train_new.isnull().mean()*100

In [None]:
# Numeric (float64/int64) columns of the working frame, leaving out the
# target, which is not imputed.
numeric_cols = [
    col
    for col in train_new.select_dtypes(include=['float64', 'int64']).columns
    if col != 'Listening_Time_minutes'
]

# Overlay each column's distribution from the raw frame ("Before") and from
# the working copy ("After") on the same axes.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    for frame, tag, shade in ((train_data, 'Before', 'red'),
                              (train_new, 'After', 'green')):
        sns.histplot(frame[col], bins=50, kde=True, label=tag,
                     color=shade, alpha=0.5, ax=ax)
    ax.set_title(f'Histogram of {col} (Before vs After Imputation)')
    ax.legend()
    plt.show()


# **Target Variable Distribution** 

In [None]:
# Histogram with a KDE overlay of the target variable.
ax = sns.histplot(train_new['Listening_Time_minutes'], bins=50, kde=True)
ax.set_title('Listening Time Distribution')
ax.set_xlabel('Listening Time (minutes)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
categorical_cols = ['Podcast_Name', 'Genre', 'Publication_Day', 'Episode_Sentiment', 'Popular_Level']

# For each categorical column, print its level counts and draw them as
# horizontal bars in the same (most-frequent-first) order.
for col in categorical_cols:
    level_counts = train_new[col].value_counts()
    print(level_counts)
    ax = sns.countplot(data=train_new, y=col, order=level_counts.index)
    ax.set_title(f'Distribution of {col}')
    plt.show()


In [None]:
numerical_cols = ['Episode_Length_minutes', 'Number_of_Ads']

# For each numeric feature: a raw scatter against the target, then a boxplot
# of the target within 10 equal-width bins of the feature.
for col in numerical_cols:
    ax = sns.scatterplot(x=train_new[col], y=train_new['Listening_Time_minutes'])
    ax.set_title(f'{col} vs Listening Time')
    plt.show()

    binned_feature = pd.cut(train_new[col], bins=10)
    ax = sns.boxplot(x=binned_feature, y=train_new['Listening_Time_minutes'])
    plt.xticks(rotation=45)
    ax.set_title(f'{col} (binned) vs Listening Time')
    plt.show()


**Check for Outliers**

In [None]:

# Boxplot of episode length to eyeball outliers.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=train_new['Episode_Length_minutes'], ax=ax)
ax.set_title("Boxplot of Episode_Length_minutes")
plt.show()

In [None]:
# Keep only rows whose episode length lies within 1.5 * IQR of the quartiles
# (bounds inclusive).
episode_length = train_new['Episode_Length_minutes']
Q1, Q3 = episode_length.quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

train_new = train_new[episode_length.between(lower_bound, upper_bound)]


In [None]:
# Boxplot of the target for every level of each categorical column.
for col in categorical_cols:
    ax = sns.boxplot(data=train_new, x=col, y='Listening_Time_minutes')
    plt.xticks(rotation=45)
    ax.set_title(f'{col} vs Listening Time')
    plt.show()


In [None]:
# Fixed left-to-right order for the time-of-day categories on the x axis.
time_order = ['Morning', 'Afternoon', 'Evening', 'Night']

ax = sns.boxplot(
    data=train_new,
    x='Publication_Time',
    y='Listening_Time_minutes',
    order=time_order,
)
ax.set_title('Listening Time by Time of Day')
ax.set_xlabel('Time of Day')
ax.set_ylabel('Listening Time (minutes)')
plt.show()


In [None]:
# Interaction feature: publication day and time joined as "<day>_<time>".
train_new['Day_Time'] = train_new['Publication_Day'] + '_' + train_new['Publication_Time']

# Restrict the plot to the 10 most frequent day/time combinations.
top_combos = train_new['Day_Time'].value_counts().index[:10]  # Optional limit
in_top_combos = train_new['Day_Time'].isin(top_combos)

ax = sns.boxplot(data=train_new[in_top_combos], x='Day_Time', y='Listening_Time_minutes')
plt.xticks(rotation=45)
ax.set_title('Listening Time by Day and Time Combo')
ax.set_xlabel('Day_Time')
ax.set_ylabel('Listening Time (minutes)')
plt.show()


# **One-Hot Encoding**

In [None]:
# Object-dtype columns to one-hot encode, skipping 'Podcast_Name' and
# 'Episode_Title' (left un-encoded here).
categorical_cols = [
    col
    for col in train_new.select_dtypes(include='object').columns.tolist()
    if col not in ['Podcast_Name', 'Episode_Title']
]

# One-hot encode, dropping the first level of each column.
df_encoded = pd.get_dummies(
    train_new,
    columns=categorical_cols,
    drop_first=True,
)


In [None]:
# Drop the un-encoded text columns and the row identifier so that only
# the model features and the target remain.
df_cleand = df_encoded.drop(columns=['Podcast_Name', 'Episode_Title', 'id'])

## **Correlation**

In [None]:
# Correlation of every column with the target, largest value first.
target_corr = df_cleand.corr()['Listening_Time_minutes']
correlation = target_corr.sort_values(ascending=False)
print(correlation)


# **Feature Importance from a Tree-based Model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Feature matrix (everything except the target) and the target itself.
y = df_cleand['Listening_Time_minutes']
X = df_cleand.drop(columns=['Listening_Time_minutes'])

# Fit a random forest purely to read off its impurity-based importances.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Importances labelled by feature name, ranked from most to least important.
importances = pd.Series(model.feature_importances_, index=X.columns)
importances_sorted = importances.sort_values(ascending=False)

# Horizontal bars for the 20 highest-ranked features, top-ranked at the top.
ax = importances_sorted.head(20).plot(kind='barh', figsize=(10, 8), title='Top 20 Feature Importances')
ax.invert_yaxis()
plt.show()


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selection: keep the 20 features with the highest f_regression score.
selector = SelectKBest(score_func=f_regression, k=20)
selector.fit(X, y)
X_new = selector.transform(X)

# Names of the columns the selector kept.
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
print("Selected features:", selected_features.tolist())


# **Training on Just Top Features:**

**Train-Test Split**

In [None]:
# Hand-picked feature subset used for all models below.
top_features = [
    'Episode_Length_minutes',
    'Number_of_Ads',
    'Genre_Comedy',
    'Genre_Music',
    'Genre_News',
    'Genre_True Crime',
    'Publication_Day_Monday',
    'Publication_Day_Sunday',
    'Publication_Day_Thursday',
    'Publication_Day_Tuesday',
    'Publication_Time_Evening',
    'Publication_Time_Morning',
]
X_top = df_cleand[top_features]
y = df_cleand['Listening_Time_minutes']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X_top,
    y,
    test_size=0.2,
    random_state=42,
)

**Scale the Data**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both the training and the validation split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_val)

**Define Models**

In [None]:
# Settings shared by every tree-ensemble candidate.
ensemble_kwargs = {'n_estimators': 100, 'random_state': 42}

# Candidate regressors keyed by display name; all other settings are defaults.
# NOTE(review): scikit-learn's docs warn that SVR fit time grows more than
# quadratically with the number of samples -- confirm it is feasible on this
# training split before running the comparison.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Random Forest": RandomForestRegressor(**ensemble_kwargs),
    "Gradient Boosting": GradientBoostingRegressor(**ensemble_kwargs),
    "XGBoost": xgb.XGBRegressor(**ensemble_kwargs),
    "LightGBM": lgb.LGBMRegressor(**ensemble_kwargs),
    "SVM": SVR(),
}

In [None]:
# Fit each candidate on the scaled training split and report its RMSE on the
# scaled validation split.
for name, model in models.items():
    preds = model.fit(X_train_scaled, y_train).predict(X_valid_scaled)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(f"{name} RMSE: {rmse:.4f}")

**Train & Evaluate Models**

# **Gradient Boosting, with an RMSE of 9.9727, is currently the best-performing model**

In [None]:
# Base model. The seed is fixed because the grid below includes subsample < 1,
# which makes the fit itself stochastic; RandomizedSearchCV's own random_state
# only controls which parameter combinations are sampled, so without this the
# search results would differ from run to run.
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0]
}

# Randomized search: 20 sampled combinations, 3-fold CV each, scored by
# negative RMSE (higher is better), using all available cores.
random_search = RandomizedSearchCV(
    gbr,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    random_state=42,
    n_jobs=-1
)


In [None]:
# Run the search on the scaled training split and keep the best estimator
# it found.
best_gbr = random_search.fit(X_train_scaled, y_train).best_estimator_

print("✅ Best Parameters:", random_search.best_params_)



In [None]:
# Track RMSE after every boosting stage on both splits.
train_errors = []
valid_errors = []

# best_gbr was fitted on the *scaled* matrices, so the staged predictions must
# be computed on those same matrices. Feeding the raw X_train / X_val frames
# would evaluate the trees on inputs in a different scale than they were
# trained on and give meaningless error curves.
staged_pairs = zip(
    best_gbr.staged_predict(X_train_scaled),
    best_gbr.staged_predict(X_valid_scaled),
)

for y_pred_train, y_pred_valid in staged_pairs:
    # np.sqrt(MSE) matches the model-comparison cell and avoids the
    # `squared=False` keyword, which newer scikit-learn releases reject.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    valid_rmse = np.sqrt(mean_squared_error(y_val, y_pred_valid))
    train_errors.append(train_rmse)
    valid_errors.append(valid_rmse)
    print('train_rmse:',train_rmse,' valid_rmse:',valid_rmse)

In [None]:
# Train vs validation RMSE as a function of the boosting iteration.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_errors, label='Train RMSE', color='blue')
ax.plot(valid_errors, label='Validation RMSE', color='orange')
ax.set_xlabel("Boosting Iterations")
ax.set_ylabel("RMSE")
ax.set_title("Gradient Boosting RMSE Over Iterations")
ax.legend()
ax.grid(True)
plt.show()

# Validation RMSE after the last boosting stage.
print(f"📉 Final Validation RMSE: {valid_errors[-1]:.4f}")

In [None]:
# Final gradient-boosting model with hard-coded hyperparameters.
# NOTE(review): these values are presumably copied from the randomized
# search's best parameters -- confirm they still match after any re-run.
final_gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=0.8,
    random_state=42,
)

In [None]:

# Fit the final model on the scaled training split.
final_gbr.fit(X_train_scaled, y_train)

# Predict on the held-out validation split. The variable keeps its original
# name, but these are validation predictions, not test-set predictions.
y_pred_test = final_gbr.predict(X_valid_scaled)

# Score against the validation targets. The earlier version compared against
# an undefined `y_test` inside a bare `except:`, so the resulting NameError was
# swallowed and the RMSE was never actually reported.
rmse_test = np.sqrt(mean_squared_error(y_val, y_pred_test))
print(f"✅ Final Validation RMSE: {rmse_test:.4f}")

In [None]:
# Display the test DataFrame before preprocessing it.
test_data

In [None]:
# Percentage of missing values in each test column (mean of the null mask x 100).
test_data.isnull().mean()*100

In [None]:
# Select numeric columns only for MICE (avoid strings/categoricals)
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage',
            'Guest_Popularity_percentage', 'Number_of_Ads',]

# View of the test data restricted to the numeric columns
numeric_data = test_data[num_cols]

# Reuse `imp`, the imputer already fitted on the training data, and only
# transform here. Refitting a fresh imputer on the test set would fill test
# values with a different model than the one used for the training features.
imputed_data = imp.transform(numeric_data)

# Carry test_data's own index so the column assignments align row-for-row.
test_nouse = pd.DataFrame(imputed_data, columns=num_cols, index=test_data.index)

# Mirror the training step: only these two columns are written back.
test_data['Episode_Length_minutes'] = test_nouse['Episode_Length_minutes']
test_data['Number_of_Ads'] = test_nouse['Number_of_Ads']

In [None]:
# Same popularity bucketing as for the training data, followed by removal of
# the two raw popularity columns.
test_data['Popular_Level'] = test_data.apply(popular, axis=1)
test_data = test_data.drop(
    columns=['Host_Popularity_percentage', 'Guest_Popularity_percentage']
)

In [None]:
# Interaction feature: publication day and time joined as "<day>_<time>",
# built the same way as for the training data.
test_data['Day_Time'] = test_data['Publication_Day'] + '_' + test_data['Publication_Time']


In [None]:
# Object-dtype columns to one-hot encode, skipping 'Podcast_Name' and
# 'Episode_Title' (left un-encoded here).
categorical_cols = [
    col
    for col in test_data.select_dtypes(include='object').columns.tolist()
    if col not in ['Podcast_Name', 'Episode_Title']
]

# One-hot encode, dropping the first level of each column.
test_encoded = pd.get_dummies(
    test_data,
    columns=categorical_cols,
    drop_first=True,
)


In [None]:
# Drop the un-encoded text columns and the row identifier from the encoded
# test frame.
test_cleand = test_encoded.drop(columns=['Podcast_Name', 'Episode_Title', 'id'])

In [None]:
# Same hand-picked feature subset, in the same order, as used for training.
top_features = [
    'Episode_Length_minutes',
    'Number_of_Ads',
    'Genre_Comedy',
    'Genre_Music',
    'Genre_News',
    'Genre_True Crime',
    'Publication_Day_Monday',
    'Publication_Day_Sunday',
    'Publication_Day_Thursday',
    'Publication_Day_Tuesday',
    'Publication_Time_Evening',
    'Publication_Time_Morning',
]
test_df = test_cleand[top_features]

In [None]:
# Reuse `scaler`, which was fitted on the training split, and only transform
# here. Fitting a new StandardScaler on the test set would put the test
# features on a different scale than the one final_gbr was trained on.
test_df_scaled = scaler.transform(test_df)

In [None]:
# Predict listening time for every test row with the final gradient-boosting model.
pred_test = final_gbr.predict(test_df_scaled)

In [None]:
# 6. Build the submission frame: the test ids plus the predicted target.
submission = test_data[['id']].assign(Listening_Time_minutes=pred_test)

# 7. Write it out without the index column.
submission.to_csv('submission.csv', index=False)
print("✅ Submission file 'submission.csv' created.")