In [9]:

import shap_select                
from shap_select import shap_select
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("Final Dataset.csv")
df = df[df['Region'].isin([2,4,5,7])].reset_index(drop=True)


df = df.sort_values(['Year', 'Month']).reset_index(drop=True)
df['Solar Radiation Lag1'] = df['Solar Radiation'].shift(1)
df = df.dropna()


In [2]:
df.head(10)

Unnamed: 0,Region,Year,Month,SoilTemperature-10,SoilTemperature-30,SoilTemperature-50,Rainfall,Temperature,Min Temperature,Max Temperature,Humidity,Sunshine,Wind Speed,Surface Pressure,Solar Radiation,SoilTemp30_SMA,Solar Radiation Lag1
1,5,2001,1,17.41,18.75,19.37,0.0,15.94,9.5,24.18,74.48,7.87,2.18,101137.01,163.63,25.944167,172.76
2,7,2001,1,17.33,19.99,19.6,5.0,15.91,10.05,23.95,78.23,7.15,2.19,101277.11,171.44,28.371667,163.63
4,4,2001,2,21.43,20.03,21.55,11.0,22.61,15.04,27.32,74.43,6.05,1.8,100945.35,188.91,26.710833,199.25
5,5,2001,2,20.85,20.91,20.67,0.0,19.78,12.71,28.49,71.96,8.3525,1.89,100872.92,193.47,26.098333,188.91
6,7,2001,2,20.71,22.11,20.8,0.0,19.97,13.93,28.07,75.46,6.03,1.88,101036.7,194.44,28.485,193.47
8,4,2001,3,25.46,23.47,24.84,16.0,26.62,18.3,31.09,70.19,7.82,1.88,100793.77,203.81,26.616667,210.73
9,5,2001,3,25.94,25.28,24.66,9.0,24.69,17.25,33.55,63.33,8.3525,2.17,100688.82,218.12,26.179167,203.81
10,7,2001,3,25.52,26.33,24.52,21.0,24.25,17.7,32.39,65.77,8.04,2.07,100872.3,212.7,28.388333,218.12
12,4,2001,4,29.3,27.13,28.22,156.0,29.06,22.41,32.56,74.9,7.21,2.34,100573.68,245.04,26.671667,251.75
13,5,2001,4,31.86,30.76,29.61,13.0,29.32,22.81,35.988,63.33,8.3525,2.63,100398.76,253.16,26.43,245.04


In [12]:
X = df[['Temperature','Min Temperature','Max Temperature',
        'Wind Speed','Sunshine','Humidity','Rainfall','Year','Month','Surface Pressure',
        'Solar Radiation','Solar Radiation Lag1']]
y = df['SoilTemperature-50']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

base_model = RandomForestRegressor(
    n_estimators=300, max_depth=15, max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, bootstrap=False,
    random_state=42
)

base_model.fit(X_train, y_train)

selected_df = shap_select(
    base_model,
    X_val,
    y_val,
    task="regression",
    threshold=0.10,
    alpha=1e-6
)

print(selected_df[['feature name', 'coefficient', 'stat.significance', 'selected']])

chosen_feats = selected_df[selected_df["selected"] == 1]["feature name"].tolist()
print("\nSelected features:", chosen_feats)

final_model = RandomForestRegressor(
    n_estimators=300, max_depth=15, max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, bootstrap=False,
    random_state=42
)
final_model.fit(X_train[chosen_feats], y_train)

y_pred_test = final_model.predict(X_test[chosen_feats])
y_pred_train = final_model.predict(X_train[chosen_feats])

print(f"\nFinal Model Evaluation (with SHAP-SELECT):")
print(f"RMSE (test):  {mean_squared_error(y_test, y_pred_test):.3f}")
print(f"R²   (test):  {r2_score(y_test, y_pred_test):.3f}")
print(f"R²   (train): {r2_score(y_train, y_pred_train):.3f}")

            feature name  coefficient  stat.significance  selected
0        Solar Radiation   120.318285       4.800637e-09         1
1   Solar Radiation Lag1    76.120666       1.410331e-03         1
2                   Year    46.199599       1.212038e-01         0
3                  Month     3.345877       2.762047e-01         0
4               Humidity     2.717876       8.190374e-01         0
5            Temperature    -0.724153       7.496391e-01        -1
6        Max Temperature    -2.693713       6.920639e-01        -1
7               Rainfall    -7.853907       5.460437e-01        -1
8       Surface Pressure    -8.740305       1.067461e-01        -1
9        Min Temperature   -15.292686       4.876905e-04        -1
10            Wind Speed   -73.862803       1.244854e-04        -1
11              Sunshine  -192.419231       1.512568e-05        -1

Selected features: ['Solar Radiation', 'Solar Radiation Lag1']

Final Model Evaluation (with SHAP-SELECT):
RMSE (test):  12.123


In [6]:
print(selected_df.columns)


Index(['feature name', 't-value', 'stat.significance', 'coefficient',
       'selected'],
      dtype='object')
