In [1]:
import os

In [2]:
%pwd

'c:\\Users\\NARINDER\\Desktop\\new project\\research'

In [3]:
os.chdir('..')

In [4]:
%pwd

'c:\\Users\\NARINDER\\Desktop\\new project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class BaseModelConfig:
    root_dir: Path
    base_model_path: Path

In [6]:
import yaml
from pathlib import Path
from typing import Any, Dict
from src.floodClassifier.constants import *

class ConfigurationManager:
    def __init__(self, config_path = CONFIG_FILE_PATH, params_path = PARAMS_FILE_PATH):
        self.config_path = Path(config_path)
        self.params_path = Path(params_path)
        if not self.config_path.exists():
            raise FileNotFoundError(f"Config file not found: {self.config_path}")
        if not self.params_path.exists():
            raise FileNotFoundError(f"Params file not found: {self.params_path}")
        self._config = yaml.safe_load(self.config_path.read_text())
        self._params = yaml.safe_load(self.params_path.read_text())

    def get_prepare_base_model_config(self) -> Dict[str, Any]:
        pb = self._config.get("prepare_base_model", {})
        return {
            "root_dir": pb.get("root_dir", "artifacts/prepareBaseModel"),
            "base_model_path": pb.get("base_model_path", "artifacts/prepareBaseModel/base_model.pkl")
        }

    def get_arima_params(self) -> Dict[str, Any]:
        model_cfg = self._params.get("MODEL", {})
        tuning_cfg = self._params.get("TUNING", {})
        forecast_cfg = self._params.get("FORECAST", {})
        metrics = self._params.get("METRICS", [])
        return {
            "type": model_cfg.get("TYPE", "arima"),
            "p": int(model_cfg.get("P", 1)),
            "d": int(model_cfg.get("D", 0)),
            "q": int(model_cfg.get("Q", 0)),
            "seasonal": bool(model_cfg.get("SEASONAL", False)),
            "m": int(model_cfg.get("M", 0)),
            "enforce_stationarity": bool(model_cfg.get("ENFORCE_STATIONARITY", True)),
            "enforce_invertibility": bool(model_cfg.get("ENFORCE_INVERTIBILITY", True)),
            "tuning_enabled": bool(tuning_cfg.get("ENABLED", False)),
            "tuning_search": tuning_cfg.get("SEARCH", "grid"),
            "p_values": tuning_cfg.get("P_VALUES", []),
            "d_values": tuning_cfg.get("D_VALUES", []),
            "q_values": tuning_cfg.get("Q_VALUES", []),
            "horizon": int(forecast_cfg.get("HORIZON", 30)),
            "conf_int": float(forecast_cfg.get("CONF_INT", 0.95)),
            "metrics": metrics
        }
    
    def get_xgboost_config(self) -> Dict[str, any]:
            xcfg = self._config.get("xgboost", {})
            xp = self._params.get("XGBOOST", {})
            tuning_cfg = self._params.get("TUNING", {}).get("XGBOOST", {})
            train_cfg = self._params.get("TRAINING", {}) or {}

            root_dir = Path(xcfg.get("root_dir", "artifacts/xgboost"))
            model_file = xcfg.get("model_file", "xgb_model.pkl")
            model_path = root_dir / model_file

            params = {
                "objective": xp.get("OBJECTIVE", xp.get("objective", "binary:logistic")),
                "n_estimators": int(xp.get("N_ESTIMATORS", xp.get("n_estimators", 100))),
                "learning_rate": float(xp.get("LEARNING_RATE", xp.get("learning_rate", 0.1))),
                "max_depth": int(xp.get("MAX_DEPTH", xp.get("max_depth", 6))),
                "subsample": float(xp.get("SUBSAMPLE", xp.get("subsample", 1.0))),
                "colsample_bytree": float(xp.get("COLSAMPLE_BYTREE", xp.get("colsample_bytree", 1.0))),
                "reg_alpha": float(xp.get("REG_ALPHA", xp.get("reg_alpha", 0.0))),
                "reg_lambda": float(xp.get("REG_LAMBDA", xp.get("reg_lambda", 1.0))),
                "seed": int(xp.get("SEED", xp.get("seed", 42))),
                "n_jobs": int(xp.get("N_JOBS", xp.get("n_jobs", -1))),
                "verbosity": int(xp.get("VERBOSITY", xp.get("verbosity", 1)))
            }

            train = {
                "test_size": float(train_cfg.get("TEST_SIZE", 0.2)),
                "num_boost_round": int(train_cfg.get("NUM_BOOST_ROUND", params["n_estimators"])),
                "early_stopping_rounds": int(train_cfg.get("EARLY_STOPPING_ROUNDS", 50)),
                "eval_metric": train_cfg.get("EVAL_METRIC", "logloss"),
                "fit_kwargs": train_cfg.get("FIT_KWARGS", {})
            }

            tuning = {
                "enabled": bool(tuning_cfg.get("ENABLED", False)),
                "search": tuning_cfg.get("SEARCH", "grid"),
                "param_grid": tuning_cfg.get("PARAM_GRID", tuning_cfg.get("param_grid", {})),
                "cv": int(tuning_cfg.get("CV", 3))
            }

            return {
                "root_dir": str(root_dir),
                "model_file": model_file,
                "model_path": str(model_path),
                "params": params,
                "train": train,
                "tuning": tuning
            }


In [7]:
import os
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from typing import Optional, Dict
import xgboost as xgb
from src.floodClassifier import logger

class PrepareBaseModel:
    def __init__(self, prepare_cfg: Dict[str, str], arima_params: Dict[str, any]):
        self.root_dir = Path(prepare_cfg["root_dir"])
        self.base_model_path = Path(prepare_cfg["base_model_path"])
        self.arima_params = arima_params
        os.makedirs(self.root_dir, exist_ok=True)

    def _ensure_series(self, y: pd.Series) -> pd.Series:
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        if y.index.dtype == object:
            try:
                y.index = pd.to_datetime(y.index)
            except Exception:
                pass
        return y.astype(float)

    def build_arima(self, y: pd.Series, exog: Optional[pd.DataFrame] = None):
        y = self._ensure_series(y)
        p = self.arima_params["p"]
        d = self.arima_params["d"]
        q = self.arima_params["q"]
        seasonal = self.arima_params["seasonal"]
        m = self.arima_params["m"] if seasonal else 0

        model = SARIMAX(
            endog=y,
            exog=exog,
            order=(p, d, q),
            seasonal_order=(0, 0, 0, 0) if not seasonal else (p, d, q, m),
            enforce_stationarity=self.arima_params.get("enforce_stationarity", True),
            enforce_invertibility=self.arima_params.get("enforce_invertibility", True),
            simple_differencing=False
        )
        return model

    def fit_and_save(self, y: pd.Series, exog: Optional[pd.DataFrame] = None, save_overwrite: bool = True):
        model = self.build_arima(y, exog=exog)
        fitted = model.fit(disp=False)
        if self.base_model_path.exists() and not save_overwrite:
            raise FileExistsError(f"Base model already exists at {self.base_model_path}")
        with open(self.base_model_path, "wb") as f:
            pickle.dump(fitted, f)
        return fitted

    def load(self):
        if not self.base_model_path.exists():
            raise FileNotFoundError(f"No base model at {self.base_model_path}")
        with open(self.base_model_path, "rb") as f:
            return pickle.load(f)

    def run_from_df(self, df: pd.DataFrame, target_col: str = "y", exog_cols: Optional[list] = None):
        if target_col not in df.columns:
            raise KeyError(f"target_col '{target_col}' not found in dataframe")
        y = df[target_col]
        exog = df[exog_cols] if exog_cols else None
        fitted = self.fit_and_save(y, exog=exog)
        return fitted
    
    def prepare_xgboost_model(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        params_override: Optional[Dict] = None,
        fit_kwargs: Optional[Dict] = None,
        save_overwrite: bool = True,):
        """
        Train an XGBoost model using configuration from ConfigurationManager.get_xgboost_config()
        and save the fitted model to the configured path.

        Returns the fitted model instance.
        """
        # load config for xgboost
        cfg = ConfigurationManager().get_xgboost_config()
        model_cfg = cfg.get("params", {}).copy()
        train_cfg = cfg.get("train", {}) or {}

        # allow runtime overrides
        if params_override:
            model_cfg.update(params_override)

        fit_opts = train_cfg.get("fit_kwargs", {}) or {}
        if fit_kwargs:
            fit_opts.update(fit_kwargs)

        # choose classifier vs regressor by objective string
        objective = str(model_cfg.get("objective", "")).lower()
        if objective.startswith("reg:") or "reg" in objective or "squarederror" in objective:
            ModelClass = xgb.XGBRegressor
        else:
            ModelClass = xgb.XGBClassifier

        # instantiate and fit
        model = ModelClass(**model_cfg)
        model.fit(X, y, **fit_opts)

        # determine save location from config and persist
        model_path = Path(cfg.get("model_path", self.root_dir / "xgb_model.pkl"))
        model_path.parent.mkdir(parents=True, exist_ok=True)
        if model_path.exists() and not save_overwrite:
            raise FileExistsError(f"XGBoost model already exists at {model_path}")

        with open(model_path, "wb") as f:
            pickle.dump(model, f)

        try:
            logger.info(f"XGBoost model trained and saved to: {model_path}")
        except Exception:
            pass

        return model


In [None]:
from src.floodClassifier import logger

try:
    config = ConfigurationManager()
    prepare_cfg = config.get_prepare_base_model_config()
    arima_params = config.get_arima_params()
    preparer = PrepareBaseModel(prepare_cfg, arima_params)

    csv_path = r"artifacts\data_ingestion\FloodPrediction.csv"

    date_year_col = "Year"
    date_month_col = "Month"
    target_col = "Flood?"
    exog_cols = ["Altitude", "Max_Temp", "Min_Temp", "Relative_Humidity"]
    usecols = [date_year_col, date_month_col, target_col] + exog_cols
    df = pd.read_csv(csv_path, usecols=usecols)

    try:
        df["Month"] = df[date_month_col].astype(int)
        df["Year"] = df[date_year_col].astype(int)
        df["_date"] = pd.to_datetime(dict(year=df["Year"], month=df["Month"], day=1))
    except Exception:
        df["_date"] = pd.to_datetime(df[date_year_col].astype(str) + "-" + df[date_month_col].astype(str) + "-01", errors="coerce")

    df = df.dropna(subset=["_date"]).copy()
    df = df.set_index("_date")
    df = df.sort_index()

    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in file: {csv_path}")

    df = df.dropna(subset=[target_col])

    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
    if df[target_col].isna().any():
        raise ValueError("Target column contains non-numeric or uncoercible values after parsing")

    if exog_cols:
        for col in exog_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        df[exog_cols] = df[exog_cols].fillna(method="ffill").fillna(method="bfill")

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    required_obs = max(arima_params.get("p", 1), arima_params.get("d", 0), arima_params.get("q", 1)) + 1
    if len(df) < required_obs:
        raise ValueError(f"Not enough observations ({len(df)}) to fit ARIMA with order p,d,q requiring at least {required_obs}")

    fitted_model = preparer.run_from_df(df, target_col=target_col, exog_cols=exog_cols)
    logger.info(f"ARIMA base model fitted and saved to: {prepare_cfg['base_model_path']}")

except Exception as e:
    raise e

  df[exog_cols] = df[exog_cols].fillna(method="ffill").fillna(method="bfill")
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


[2025-11-21 02:55:22,635: INFO: 1717932242: ARIMA base model fitted and saved to: artifacts/prepareBaseModel/base_model.h5]


In [None]:
import logging
import pickle
import pandas as pd
from pathlib import Path
from xgboost import XGBClassifier

csv_path = r"artifacts\data_ingestion\FloodPrediction.csv"
target_col = "Flood?"

try:
    # --- Load config ---
    config = ConfigurationManager()
    xgb_cfg = config.get_xgboost_config()
    params = xgb_cfg["params"]
    train_cfg = xgb_cfg["train"]

    model_path = Path(xgb_cfg["model_path"])
    model_path.parent.mkdir(parents=True, exist_ok=True)

    # --- Load dataset ---
    df = pd.read_csv(csv_path)
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in dataset")

    y = df[target_col].replace([np.nan, np.inf, -np.inf], 0).astype(int)
    X = df[["Max_Temp", "Min_Temp", "Altitude", "Relative_Humidity"]]

    # --- Train/test split ---
    from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=train_cfg["test_size"], shuffle=False
    )

    # --- Initialize and train model ---
    model = XGBClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)]
    )

    # --- Save model ---
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    logger.info(f"XGBoost model trained and saved to {model_path}")

except Exception as e:
    logger.exception("Pipeline run failed")
    raise

[0]	validation_0-logloss:0.42463
[1]	validation_0-logloss:0.37510
[2]	validation_0-logloss:0.33759
[3]	validation_0-logloss:0.30877
[4]	validation_0-logloss:0.28474
[5]	validation_0-logloss:0.26547
[6]	validation_0-logloss:0.24893
[7]	validation_0-logloss:0.23492
[8]	validation_0-logloss:0.22305
[9]	validation_0-logloss:0.21327
[10]	validation_0-logloss:0.20474
[11]	validation_0-logloss:0.19737
[12]	validation_0-logloss:0.19152
[13]	validation_0-logloss:0.18598
[14]	validation_0-logloss:0.18126
[15]	validation_0-logloss:0.17724
[16]	validation_0-logloss:0.17325
[17]	validation_0-logloss:0.17038
[18]	validation_0-logloss:0.16770
[19]	validation_0-logloss:0.16559
[20]	validation_0-logloss:0.16356
[21]	validation_0-logloss:0.16199
[22]	validation_0-logloss:0.16041
[23]	validation_0-logloss:0.15910
[24]	validation_0-logloss:0.15804
[25]	validation_0-logloss:0.15690
[26]	validation_0-logloss:0.15614
[27]	validation_0-logloss:0.15543
[28]	validation_0-logloss:0.15493
[29]	validation_0-loglos

In [9]:
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from src.floodClassifier.constants import *

config = ConfigurationManager()
prepare_cfg = config.get_prepare_base_model_config()
arima_params = config.get_arima_params()

base_model_path = Path(prepare_cfg["base_model_path"])
csv_path = r"C:\Users\NARINDER\Desktop\new project\artifacts\data_ingestion\FloodPrediction.csv"

date_year_col = "Year"
date_month_col = "Month"
target_col = "Flood?"
exog_cols = ["Rainfall", "Max_Temp"]
test_fraction = 0.2
forecast_horizon = arima_params.get("horizon", 30)

df = pd.read_csv(csv_path, usecols=[date_year_col, date_month_col, target_col] + exog_cols)
df["Month"] = df[date_month_col].astype(int)
df["Year"] = df[date_year_col].astype(int)
df["_date"] = pd.to_datetime(dict(year=df["Year"], month=df["Month"], day=1))
df = df.dropna(subset=["_date"])
df = df.set_index("_date").sort_index()
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[target_col])
if exog_cols:
    for c in exog_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df[exog_cols] = df[exog_cols].fillna(method="ffill").fillna(method="bfill")

n = len(df)
n_test = max(1, int(n * test_fraction))
train_df = df.iloc[:-n_test]
test_df = df.iloc[-n_test:]

y_train = train_df[target_col]
y_test = test_df[target_col]
exog_train = train_df[exog_cols] if exog_cols else None
exog_test = test_df[exog_cols] if exog_cols else None

if not base_model_path.exists():
    raise FileNotFoundError(f"Base model not found at {base_model_path}")

with open(base_model_path, "rb") as f:
    fitted = pickle.load(f)

steps = len(test_df)
try:
    fc = fitted.get_forecast(steps=steps, exog=exog_test)
    y_pred = fc.predicted_mean
    conf_int = fc.conf_int(alpha=1 - arima_params.get("conf_int", 0.95))
except Exception:
    start = test_df.index[0]
    end = test_df.index[-1]
    y_pred = fitted.predict(start=start, end=end, exog=exog_test)
    conf_int = None

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

try:
    y_pred = pd.Series(y_pred)
except Exception:
    y_pred = pd.Series(list(y_pred))

print("DEBUG pre-align lengths -> y_test:", len(y_test), "y_pred:", len(y_pred))
try:
    print("DEBUG pre-align index ranges -> y_test:", y_test.index.min(), "to", y_test.index.max())
except Exception:
    print("DEBUG y_test has no index")
try:
    print("DEBUG y_pred index sample:", getattr(y_pred, "index", None)[:3], "len:", len(y_pred))
except Exception:
    pass

try:
    y_pred_idx = pd.Series(y_pred.values, index=test_df.index)
except Exception:
    y_pred_idx = pd.Series(y_pred.values[:len(test_df)], index=test_df.index[:len(y_pred.values[:len(test_df)])])

y_test_s = y_test.astype(float)

combined = pd.concat([y_test_s, y_pred_idx], axis=1)
combined.columns = ["y_true", "y_pred"]
combined = combined.dropna(how="any")

if combined.empty:
    print("DEBUG: index alignment produced empty combined. Trying positional fallback.")
    min_len = min(len(y_test_s), len(y_pred))
    if min_len > 0:
        y_true_pos = y_test_s.values[-min_len:].astype(float)
        y_pred_pos = y_pred.values[-min_len:].astype(float)
        combined = pd.DataFrame({"y_true": y_true_pos, "y_pred": y_pred_pos})
    else:
        combined = pd.DataFrame()

if combined.empty:
    print("DEBUG: positional fallback produced empty. Trying element-wise non-NaN overlap.")
    a = y_test_s.values
    b = y_pred.values
    min_len = min(len(a), len(b))
    if min_len > 0:
        a = a[-min_len:]
        b = b[-min_len:]
        mask = np.isfinite(a) & np.isfinite(b)
        if mask.any():
            combined = pd.DataFrame({"y_true": a[mask], "y_pred": b[mask]})
        else:
            combined = pd.DataFrame()

if combined.empty:
    print("DEBUG: final diagnostics -> len(y_test):", len(y_test), "len(y_pred):", len(y_pred))
    print("DEBUG sample y_test tail:", y_test.tail(5))
    print("DEBUG sample y_pred tail:", pd.Series(y_pred).tail(5))
    raise ValueError("No overlapping non-NaN samples available for evaluation after all fallbacks")

y_true = combined["y_true"].astype(float).values
y_pred_vals = combined["y_pred"].astype(float).values

mae = mean_absolute_error(y_true, y_pred_vals)
rmse = mean_squared_error(y_true, y_pred_vals)

print(f"Evaluated samples: {len(y_true)}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

print(fitted.summary())

# future_steps = min(forecast_horizon, 12)
# if exog_cols:
#     last_exog = df[exog_cols].iloc[-1:]
#     future_exog = pd.concat([last_exog]*future_steps, ignore_index=True)
#     future_index = pd.date_range(start=df.index[-1] + pd.offsets.MonthBegin(1), periods=future_steps, freq="MS")
#     future_exog.index = future_index
# else:
#     future_exog = None
#     future_index = pd.date_range(start=df.index[-1] + pd.offsets.MonthBegin(1), periods=future_steps, freq="MS")

# try:
#     future_fc = fitted.get_forecast(steps=future_steps, exog=future_exog)
#     future_pred = pd.Series(future_fc.predicted_mean.values, index=future_index)
#     future_ci = future_fc.conf_int(alpha=1 - arima_params.get("conf_int", 0.95))
# except Exception:
#     future_pred = pd.Series(fitted.predict(start=future_index[0], end=future_index[-1], exog=future_exog),
#                             index=future_index)
#     future_ci = None

# print("\nFuture predictions (quick):")
# print(future_pred)

  df[exog_cols] = df[exog_cols].fillna(method="ffill").fillna(method="bfill")
  return get_prediction_index(
  return get_prediction_index(


DEBUG pre-align lengths -> y_test: 898 y_pred: 915
DEBUG pre-align index ranges -> y_test: 2004-07-01 00:00:00 to 2013-12-01 00:00:00
DEBUG y_pred index sample: DatetimeIndex(['2004-07-01', '2004-07-01', '2004-07-01'], dtype='datetime64[ns]', name='_date', freq=None) len: 915
Evaluated samples: 898
MAE: 0.2438
RMSE: 0.1965
                               SARIMAX Results                                
Dep. Variable:                 Flood?   No. Observations:                 4493
Model:               SARIMAX(1, 1, 1)   Log Likelihood                2168.264
Date:                Fri, 21 Nov 2025   AIC                          -4322.528
Time:                        07:51:20   BIC                          -4277.658
Sample:                             0   HQIC                         -4306.716
                               - 4493                                         
Covariance Type:                  opg                                         
                        coef    std err    

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# --- Get predicted probabilities on validation set ---
y_prob = model.predict_proba(X_val)[:, 1]

# --- Clip to [0,1] and threshold at 0.5 ---
y_prob = np.clip(y_prob, 0.0, 1.0)
y_pred_label = (y_prob >= 0.5).astype(int)
y_true_label = y_val.astype(int)

# --- Metrics ---
print("accuracy", accuracy_score(y_true_label, y_pred_label))
print("precision", precision_score(y_true_label, y_pred_label, zero_division=0))
print("recall", recall_score(y_true_label, y_pred_label, zero_division=0))
print("f1", f1_score(y_true_label, y_pred_label, zero_division=0))
print("confusion:\n", confusion_matrix(y_true_label, y_pred_label))

accuracy 0.9437819420783645
precision 0.8
recall 0.9586374695863747
f1 0.872163807415606
confusion:
 [[3090  197]
 [  34  788]]


In [13]:
import pickle
import numpy as np
import pandas as pd

# Path to your saved model
MODEL_PATH = r"artifacts\xgboost\xgb_model.pkl"

# Load the trained model
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# Define the feature order (must match training)
FEATURES = ["Max_Temp", "Min_Temp", "Rainfall", "Relative_Humidity", "Wind_Speed"]

def get_manual_input():
    """
    Collect manual input for each feature.
    Returns a DataFrame with one row.
    """
    values = []
    for feat in FEATURES:
        val = float(input(f"Enter {feat}: "))
        values.append(val)
    # Create a DataFrame with the same feature names
    return pd.DataFrame([values], columns=FEATURES)

def predict_flood(input_df, threshold=0.5):
    """
    Generate prediction for new input.
    Returns probability and binary label.
    """
    prob = model.predict_proba(input_df)[:, 1][0]  # probability of Flood (class 1)
    label = int(prob >= threshold)
    return prob, label

if __name__ == "__main__":
    # Step 1: Get manual input
    new_input = get_manual_input()

    # Step 2: Predict
    prob, label = predict_flood(new_input)

    # Step 3: Show results
    print("\n--- Prediction Result ---")
    print(f"Input features: {new_input.to_dict(orient='records')[0]}")
    print(f"Flood probability: {prob:.3f}")
    print(f"Predicted label: {'Flood' if label == 1 else 'No Flood'}")


--- Prediction Result ---
Input features: {'Max_Temp': 39.0, 'Min_Temp': 29.0, 'Rainfall': 30.0, 'Relative_Humidity': 56.0, 'Wind_Speed': 44.0}
Flood probability: 0.000
Predicted label: No Flood
