In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv(r"..\synthetic data\nights_features.csv",index_col=0)

In [3]:
# --- 1. Columns to exclude from the model ---
cols_to_drop = ['REGON','D8R1', 'WON', 'POW', 'GMN', 'D8R01', 'D8R02','date']

In [4]:
# --- 2. Summary results list ---
results = []


In [5]:

# --- 3. Loop over each KKR ---
for kkr_value in df['KKR'].unique():
    print(f"\n{'='*40}")
    print(f"RANDOM FOREST MODEL FOR KKR = {kkr_value}")
    print(f"{'='*40}")

    # Filter KKR-specific data
    df_kkr = df[df['KKR'] == kkr_value].copy()

    # Remove missing target values
    df_kkr = df_kkr.dropna(subset=['D8R2'])

    if len(df_kkr) < 10:
        print("Too few observations – skipping.")
        continue

    # Define features and target
    X = df_kkr.drop(columns=cols_to_drop, errors='ignore')
    y = df_kkr['D8R2']

    # Remove missing values in features
    X = X.dropna(axis=0)
    y = y.loc[X.index]

    if len(X) < 5:
        print("Too few data points after cleaning – skipping.")
        continue

    # --- 4. Split train/test sets ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # --- 5. Train Random Forest ---
    rf_model = RandomForestRegressor(
        n_estimators=100,  # number of trees
        max_depth=10,      # can tune this
        random_state=42,
        n_jobs=-1          # use all cores
    )
    rf_model.fit(X_train, y_train)

    # --- 6. Predict and evaluate ---
    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse:.2f}")
    print(f"R²: {r2:.3f}")

    # --- 7. Feature importance ---
    feat_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
    print("\nTop 10 features:")
    print(feat_importance.sort_values(ascending=False).head(10))

    # --- 8. Store summary results ---
    results.append({
        'KKR': kkr_value,
        'MSE': round(mse, 2),
        'R2': round(r2, 3)
    })

# --- 9. Summary table for all KKRs ---
results_df = pd.DataFrame(results)
print("\n=== RANDOM FOREST SUMMARY BY KKR ===")
print(results_df)



RANDOM FOREST MODEL FOR KKR = 276
MSE: 494.71
R²: 0.996

Top 10 features:
D8R2             0.997365
lag_5            0.000399
lag_6            0.000294
D8R2_mean_pow    0.000210
RO               0.000200
lag_4            0.000190
lag_12           0.000168
lag_9            0.000152
lag_11           0.000116
D8R2_mean_won    0.000116
dtype: float64

RANDOM FOREST MODEL FOR KKR = 804
MSE: 509.50
R²: 0.991

Top 10 features:
D8R2             0.981565
mean_9m          0.003415
lag_8            0.002394
D8R2_mean_pow    0.001842
lag_10           0.001778
lag_3            0.001771
mean_6m          0.000880
lag_11           0.000824
RO               0.000518
quarter          0.000507
dtype: float64

RANDOM FOREST MODEL FOR KKR = 380
MSE: 315.79
R²: 0.977

Top 10 features:
D8R2             0.969093
lag_3            0.004246
mean_9m          0.003667
lag_4            0.003028
mean_3m          0.002977
mean_6m          0.002433
D8R2_mean_gmn    0.002001
lag_7            0.001832
mean_12m         