In [None]:
# ==============================================
# RANDOM FOREST PERFORMANCE & WEATHER IMPORTANCE BY PRODUCT FAMILY
# Two analyses:
# 1. Controlled for promotion and holiday effects
# 2. Without controlling for promotion and holiday
# ==============================================

# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# 2. LOAD DATA
# Read the combined sales/weather table from the working directory.
df = pd.read_csv("ecuador_weather_data.csv")

# --- Average daily temperature: midpoint of the daily max and min ---
# A missing max or min propagates to a missing average.
df['temperature_2m_avg'] = (
    df['temperature_2m_max']
    .add(df['temperature_2m_min'])
    .div(2)
)

# ============================
# ANALYSIS 1: CONTROLLED FOR PROMOTION + HOLIDAY
# ============================
# For every product family:
#   1. pick an 80/20 train/test partition of the family's rows,
#   2. fit sales ~ const + onpromotion + is_holiday (OLS) on the TRAINING rows only,
#   3. subtract that fit from sales to obtain residual sales for every row,
#   4. train a Random Forest on the training residuals using the weather features
#      and score it (R2, RMSE) on the held-out residuals.
# The control regression is fitted on the training rows only so that no held-out
# sales value influences the target the forest is evaluated against.
# NOTE(review): only the target is residualised; the weather features are used as-is,
# so any correlation between weather and promotion/holiday is not partialled out.

control_features = ['onpromotion', 'is_holiday']
weather_features = ['temperature_2m_avg', 'precipitation_sum']

# Drop rows with missing values in critical columns
df_ctrl = df.dropna(subset=['sales', 'family'] + weather_features + control_features)

# Storage for Random Forest results
results_ctrl = []

# Loop through each product family
for family in df_ctrl['family'].unique():
    fam_data = df_ctrl[df_ctrl['family'] == family]

    # Split row positions (not labels) so the partition does not rely on the
    # DataFrame index being unique.
    train_pos, test_pos = train_test_split(
        np.arange(len(fam_data)), test_size=0.2, random_state=42
    )

    # --- Step 1: Remove effects of promotion and holiday ---
    # Cast to float so a boolean-typed control is treated as 0/1, and force the
    # intercept column to be added even if a control happens to be constant.
    X_ctrl_features = sm.add_constant(
        fam_data[control_features].astype(float), has_constant='add'
    )
    y_sales = fam_data['sales']

    # Fit on training rows only, then residualise every row with that fit.
    model_ctrl = sm.OLS(y_sales.iloc[train_pos], X_ctrl_features.iloc[train_pos]).fit()
    # np.asarray avoids index alignment when subtracting the predictions.
    sales_residuals = y_sales - np.asarray(model_ctrl.predict(X_ctrl_features))

    # --- Step 2: Train Random Forest on residuals using weather features ---
    X_weather = fam_data[weather_features]
    X_train, X_test = X_weather.iloc[train_pos], X_weather.iloc[test_pos]
    y_train, y_test = sales_residuals.iloc[train_pos], sales_residuals.iloc[test_pos]

    # Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)

    # Performance metrics on the held-out residuals
    rf_r2 = r2_score(y_test, rf_pred)
    rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

    # Store results
    results_ctrl.append({
        'family': family,
        'rf_r2': rf_r2,
        'rf_rmse': rf_rmse
    })

# Create DataFrame of results sorted by R2 (best family first)
results_ctrl_df = pd.DataFrame(results_ctrl).sort_values(by='rf_r2', ascending=False)

print("\n=== RANDOM FOREST PERFORMANCE BY FAMILY (CONTROLLED FOR PROMOTION + HOLIDAY) ===")
print(results_ctrl_df)

# ============================
# ANALYSIS 2: WITHOUT CONTROL FOR PROMOTION + HOLIDAY
# ============================
# Raw sales are modelled directly from the weather variables, one Random Forest
# per product family, scored on a 20% hold-out.

# Features: only weather variables
weather_features = ['temperature_2m_avg', 'precipitation_sum']

# Keep only rows where the target, the family label and both weather features are present
df_no_ctrl = df.dropna(subset=['sales', 'family'] + weather_features)

# One metrics record per family is collected here
results_no_ctrl = []

# sort=False keeps families in order of first appearance
for family, fam_data in df_no_ctrl.groupby('family', sort=False):
    X_weather = fam_data[weather_features]
    y_sales = fam_data['sales']

    # 80/20 hold-out with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X_weather,
        y_sales,
        test_size=0.2,
        random_state=42,
    )

    # Fit the forest and predict the held-out rows
    rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)

    # Hold-out R2 and RMSE
    rf_r2 = r2_score(y_test, rf_pred)
    rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

    results_no_ctrl.append(dict(family=family, rf_r2=rf_r2, rf_rmse=rf_rmse))

# Tabulate, best R2 first
results_no_ctrl_df = pd.DataFrame(results_no_ctrl).sort_values('rf_r2', ascending=False)

print("\n=== RANDOM FOREST PERFORMANCE BY FAMILY (WITHOUT CONTROL FOR PROMOTION + HOLIDAY) ===")
print(results_no_ctrl_df.head(15))



=== RANDOM FOREST PERFORMANCE BY FAMILY (CONTROLLED FOR PROMOTION + HOLIDAY) ===
            family     rf_r2      rf_rmse
5     FROZEN FOODS  0.303325   260.257037
3             DELI  0.275618   176.988243
9          POULTRY  0.233941   331.833622
0        BEVERAGES  0.217741  1855.743009
6        GROCERY I  0.204480  2508.526234
2            DAIRY  0.200584   585.468652
11         PRODUCE  0.175958  1795.757796
12         SEAFOOD  0.155175    28.181722
10  PREPARED FOODS  0.132513    95.892261
7       GROCERY II  0.102474    32.375283
1     BREAD/BAKERY  0.102469   340.278265
4             EGGS  0.045845   156.247329
8            MEATS -0.176491   351.115960

=== RANDOM FOREST PERFORMANCE BY FAMILY (WITHOUT CONTROL FOR PROMOTION + HOLIDAY) ===
            family     rf_r2      rf_rmse
5     FROZEN FOODS  0.324982   261.769228
3             DELI  0.273531   179.142843
0        BEVERAGES  0.259336  1939.355299
6        GROCERY I  0.229168  2562.535824
9          POULTRY  0.228043   34