In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [16]:
# Load the feature table; the first CSV column is used as the DataFrame index.
# The relative path is written with forward slashes so it resolves on Windows,
# Linux and macOS alike (a backslash-separated path only works on Windows).
df = pd.read_csv("../synthetic data/tourists_features.csv", index_col=0)

In [17]:
# --- 1. Columns to exclude from the model ---
# Everything listed here is removed from the feature matrix before fitting.
# 'D8R1' is the regression target, so it must not remain among the features.
cols_to_drop = [
    'REGON',
    'D8R1',
    'WON',
    'POW',
    'GMN',
    'D8R01',
    'D8R02',
    'date',
]

In [18]:
# --- 2. Summary results list ---
# Accumulator for one metrics record (a dict) per modelled KKR.
results = list()


In [19]:

# --- 3. Loop over each KKR ---
# Fit one Random Forest regressor per distinct KKR value, print its hold-out
# MSE / R² and top feature importances, and record the metrics in `results`.
#
# The summary list is re-created here so that re-running this cell starts from
# a clean slate instead of appending a duplicate row for every KKR.
results = []

for kkr_value in df['KKR'].unique():
    print(f"\n{'='*40}")
    print(f"RANDOM FOREST MODEL FOR KKR = {kkr_value}")
    print(f"{'='*40}")

    # Filter KKR-specific data (copied so nothing below can touch `df`)
    df_kkr = df[df['KKR'] == kkr_value].copy()

    # Remove missing target values
    df_kkr = df_kkr.dropna(subset=['D8R1'])

    if len(df_kkr) < 10:
        print("Too few observations – skipping.")
        continue

    # Define features and target
    X = df_kkr.drop(columns=cols_to_drop, errors='ignore')
    y = df_kkr['D8R1']

    # Remove rows with any missing feature value. A single positional boolean
    # mask is applied to both X and y so they stay row-for-row aligned even if
    # the index read from the CSV contains repeated labels (label-based
    # realignment would duplicate rows of y in that case).
    complete_rows = X.notna().all(axis=1).to_numpy()
    X = X.loc[complete_rows]
    y = y.loc[complete_rows]

    if len(X) < 5:
        print("Too few data points after cleaning – skipping.")
        continue

    # --- 4. Split train/test sets ---
    # NOTE(review): this is a shuffled 70/30 split. The features look like
    # lags / rolling means of a time series, in which case a chronological
    # split may give a more honest error estimate — confirm before relying
    # on the reported R².
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # --- 5. Train Random Forest ---
    rf_model = RandomForestRegressor(
        n_estimators=100,  # number of trees
        max_depth=10,      # can tune this
        random_state=42,   # fixed seed for reproducible forests
        n_jobs=-1          # use all cores
    )
    rf_model.fit(X_train, y_train)

    # --- 6. Predict and evaluate ---
    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse:.2f}")
    print(f"R²: {r2:.3f}")

    # --- 7. Feature importance ---
    # Importances are labelled with the feature column names; only the ten
    # largest are printed.
    feat_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
    print("\nTop 10 features:")
    print(feat_importance.sort_values(ascending=False).head(10))

    # --- 8. Store summary results ---
    # Skipped KKRs never reach this point, so they get no summary record.
    results.append({
        'KKR': kkr_value,
        'MSE': round(mse, 2),
        'R2': round(r2, 3)
    })

# --- 9. Summary table for all KKRs ---
# Build one row per record collected in `results` and print it under a header.
results_df = pd.DataFrame(data=results)
print("\n=== RANDOM FOREST SUMMARY BY KKR ===", results_df, sep="\n")



RANDOM FOREST MODEL FOR KKR = 643
MSE: 3205.36
R²: 0.008

Top 10 features:
mean_3m          0.191540
lag_4            0.070784
mean_9m          0.063227
lag_5            0.056106
lag_6            0.052490
mean_12m         0.050134
lag_3            0.049654
mean_6m          0.043294
lag_10           0.041779
D8R1_mean_won    0.039201
dtype: float64

RANDOM FOREST MODEL FOR KKR = 276
MSE: 8333.37
R²: 0.605

Top 10 features:
mean_3m          0.510387
lag_8            0.074693
lag_4            0.074626
D8R1_mean_won    0.053391
lag_1            0.051121
D8R1_mean_pow    0.034707
lag_2            0.029528
lag_3            0.020336
lag_6            0.019127
mean_6m          0.015052
dtype: float64

RANDOM FOREST MODEL FOR KKR = 804
MSE: 5143.24
R²: -0.001

Top 10 features:
mean_3m              0.125900
D8R1_mean_pow        0.096428
lag_8                0.081721
lag_9                0.079947
D8R1_mean_gmn        0.070699
lag_11               0.068532
lag_12               0.036722
lag_5      