In [14]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# -------------------- 1) โหลดข้อมูล --------------------
path = Path("new data.xlsx")
df = pd.read_excel(path, sheet_name="Sheet2")
df = df.replace('-', np.nan)

if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    df = df.drop(columns=["Date"])

for c in df.columns:
    if c != "dBZ":
        df[c] = pd.to_numeric(df[c], errors="ignore")

df = df[~df["dBZ"].isna()].copy()
y = df["dBZ"]
X = df.drop(columns=["dBZ"])
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
X = X[numeric_cols]

# -------------------- 2) Pipeline --------------------
numeric_features = X.columns.tolist()
preprocess = ColumnTransformer(
    transformers=[("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                                     ("scaler", StandardScaler())]), numeric_features)],
    remainder="drop"
)

# -------------------- 3) Split Train/Test --------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------- 4) Hyperparameter search space --------------------
rf = Pipeline([("prep", preprocess),
               ("model", RandomForestRegressor(random_state=42))])

xgb = Pipeline([("prep", preprocess),
                ("model", XGBRegressor(random_state=42, n_jobs=2, objective="reg:squarederror"))])

param_dist_rf = {
    "model__n_estimators": [100, 200, 400, 600],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["auto", "sqrt", "log2"]
}

param_dist_xgb = {
    "model__n_estimators": [100, 200, 400],
    "model__max_depth": [3, 4, 5, 6],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "model__subsample": [0.7, 0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "model__reg_lambda": [0.5, 1.0, 1.5, 2.0]
}

# -------------------- 5) RandomizedSearchCV --------------------
cv = KFold(n_splits=3, shuffle=True, random_state=42)

search_rf = RandomizedSearchCV(rf, param_dist_rf, n_iter=20, scoring="neg_root_mean_squared_error",
                               n_jobs=-1, cv=cv, verbose=1, random_state=42)
search_xgb = RandomizedSearchCV(xgb, param_dist_xgb, n_iter=20, scoring="neg_root_mean_squared_error",
                                n_jobs=-1, cv=cv, verbose=1, random_state=42)

print("🔎 กำลังค้นหา Hyperparameters ของ Random Forest...")
search_rf.fit(X_train, y_train)
print("RF best params:", search_rf.best_params_)
print("RF best CV score (RMSE):", -search_rf.best_score_)

print("\n🔎 กำลังค้นหา Hyperparameters ของ XGBoost...")
search_xgb.fit(X_train, y_train)
print("XGB best params:", search_xgb.best_params_)
print("XGB best CV score (RMSE):", -search_xgb.best_score_)

# -------------------- 6) ประเมินบน Test set --------------------
best_rf = search_rf.best_estimator_
best_xgb = search_xgb.best_estimator_

for name, model in [("Random Forest", best_rf), ("XGBoost", best_xgb)]:
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n{name} Test Metrics")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 :", r2)

# -------------------- 7) เลือกโมเดลที่ดีที่สุด --------------------
final_model = best_rf if r2_score(y_test, best_rf.predict(X_test)) > r2_score(y_test, best_xgb.predict(X_test)) else best_xgb
print("\n✅ โมเดลที่เลือก:", "Random Forest" if final_model == best_rf else "XGBoost")

# -------------------- 8) บันทึกโมเดล --------------------
import joblib
Path("model_tuned").mkdir(exist_ok=True)
joblib.dump(final_model, "model_tuned/best_model.pkl")


  df = df.replace('-', np.nan)
  df[c] = pd.to_numeric(df[c], errors="ignore")


🔎 กำลังค้นหา Hyperparameters ของ Random Forest...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


9 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Local\Package

RF best params: {'model__n_estimators': 600, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_features': 'log2', 'model__max_depth': None}
RF best CV score (RMSE): 16.492593165811712

🔎 กำลังค้นหา Hyperparameters ของ XGBoost...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
XGB best params: {'model__subsample': 0.9, 'model__reg_lambda': 0.5, 'model__n_estimators': 400, 'model__max_depth': 5, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}
XGB best CV score (RMSE): 15.52607254452583

Random Forest Test Metrics
RMSE: 14.045687215364431
MAE: 11.855822222222223
R2 : 0.4518200776624157

XGBoost Test Metrics
RMSE: 12.167189548327931
MAE: 9.639799466133118
R2 : 0.5886441414176657

✅ โมเดลที่เลือก: XGBoost


['model_tuned/best_model.pkl']