In [None]:
# ------------------------------------------------------------
# 0) Install (only needs to be done once)
# !pip install shap-select shap shapely statsmodels -q
# ------------------------------------------------------------
import shap_select                 # shap-select লাইব্রেরি
from shap_select import shap_select
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the dataset and keep only the regions under study.
df = pd.read_csv("Final Dataset.csv")
df = df[df['Region'].isin([2, 4, 5, 7])].reset_index(drop=True)

# Order rows chronologically so that "previous row" means "previous period".
# NOTE(review): if the data is finer than monthly, the day column should be
# added to the sort keys — TODO confirm against the dataset.
df = df.sort_values(['Year', 'Month']).reset_index(drop=True)

# Previous-period solar radiation, computed separately for each region.
# A plain shift(1) on the pooled frame takes the lag from whichever region
# happens to sit on the preceding row, because the rows of the selected
# regions are interleaved once the frame is sorted by Year/Month only.
df['Solar Radiation Lag1'] = df.groupby('Region')['Solar Radiation'].shift(1)

# Drop every row that has a missing value in any column; this includes the
# first row of each region, for which no lag value exists.
df = df.dropna()


# Predictor columns used by the models (the lagged solar radiation included)
# and the regression target.
feature_columns = [
    'Year', 'Month', 'Temperature', 'Min Temperature', 'Max Temperature',
    'Wind Speed', 'Sunshine', 'Humidity', 'Rainfall', 'Surface Pressure',
    'Solar Radiation', 'Solar Radiation Lag1',
]
X = df[feature_columns]
y = df['SoilTemperature-50']

# First split: 60 % training, 40 % held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Second split: the held-out 40 % is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


# Random forest trained on every candidate feature; this fitted model is the
# one handed to shap_select further down.
base_rf_settings = {
    "n_estimators": 300,
    "max_depth": 15,
    "max_features": 'sqrt',
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "bootstrap": False,
    "random_state": 42,
}
base_model = RandomForestRegressor(**base_rf_settings)
base_model.fit(X_train, y_train)


# Run shap-select with the fitted base model on the validation split.
selected_df = shap_select(
    base_model,
    X_val,
    y_val,
    task="regression",     # soil temperature = continuous
    threshold=0.05,        # 5 % significance (paper default)
    alpha=1e-6,            # L1 penalty to handle collinearity
)

# shap-select names its result columns "feature name", "coefficient",
# "stat.significance" and "selected" (per the library README); the shorter
# names "feature", "coef" and "p_value" do not exist and raise a KeyError.
print(selected_df[['feature name', 'coefficient', 'stat.significance', 'selected']])

# Keep only the features flagged with selected == 1.
chosen_feats = selected_df.loc[
    selected_df["selected"] == 1, "feature name"
].tolist()
print("\nChosen features:", chosen_feats)

# Fail here with a clear message rather than later, when a model would be
# fitted on a frame with zero columns.
if not chosen_feats:
    raise ValueError(
        "shap_select did not select any feature at threshold=0.05; "
        "cannot train a model on an empty feature set."
    )

# Refit a random forest with the same hyper-parameters as the base model,
# this time restricted to the features kept by shap-select.
final_rf_settings = dict(
    n_estimators=300,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=False,
    random_state=42,
)
final_model = RandomForestRegressor(**final_rf_settings)
train_selected = X_train[chosen_feats]
final_model.fit(train_selected, y_train)


from sklearn.metrics import mean_squared_error, r2_score

# Predictions of the reduced-feature model on the test and training splits.
pred_test  = final_model.predict(X_test[chosen_feats])
pred_train = final_model.predict(X_train[chosen_feats])

# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it raises a TypeError on current releases.
rmse_test = mean_squared_error(y_test, pred_test) ** 0.5

print(f"RMSE  (test): {rmse_test:.3f}")
print(f"R²    (test): {r2_score(y_test,  pred_test):.3f}")
# Train R² is printed next to test R² to expose over-fitting.
print(f"R²  (train): {r2_score(y_train, pred_train):.3f}")
