In [1]:
import os

In [2]:
%pwd

'c:\\Users\\NARINDER\\Desktop\\new project\\research'

In [3]:
# Later cells import `src.floodClassifier...` and read `artifacts/...` relative to
# the working directory, so move from research/ up to the project root.
# Guarded on the directory name so that re-running this cell does not keep
# climbing one level higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
%pwd

'c:\\Users\\NARINDER\\Desktop\\new project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class XGBoostModelConfig:
    """Frozen (immutable) dataclass holding two ``Path`` fields.

    NOTE(review): none of the notebook cells visible here instantiate this
    class; ``ConfigurationManager`` returns plain dicts instead. Confirm
    whether it is still needed.
    """
    root_dir: Path  # presumably the directory that holds the model artifacts — confirm
    base_model_path: Path  # presumably the location of the base model file — confirm

In [6]:
import yaml
from pathlib import Path
from typing import Any, Dict
from src.floodClassifier.constants import *

class ConfigurationManager:
    """Read the project's YAML config and params files and expose them as plain dicts.

    Both files are parsed once, in ``__init__``. An empty file, or a section
    whose value is ``null``, is treated as an empty mapping so that the
    defaults coded in the getters apply instead of raising ``AttributeError``.
    """

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """Parse the two YAML files.

        Args:
            config_path: Location of the config YAML file.
            params_path: Location of the params YAML file.

        Raises:
            FileNotFoundError: If either file does not exist.
        """
        self.config_path = Path(config_path)
        self.params_path = Path(params_path)
        if not self.config_path.exists():
            raise FileNotFoundError(f"Config file not found: {self.config_path}")
        if not self.params_path.exists():
            raise FileNotFoundError(f"Params file not found: {self.params_path}")
        # yaml.safe_load returns None for an empty document; normalise to {} so
        # the getters can call .get() unconditionally. Read as UTF-8 explicitly
        # rather than relying on the platform's locale encoding.
        self._config = yaml.safe_load(self.config_path.read_text(encoding="utf-8")) or {}
        self._params = yaml.safe_load(self.params_path.read_text(encoding="utf-8")) or {}

    @staticmethod
    def _section(mapping, key):
        """Return ``mapping[key]``, or ``{}`` when the key is absent or its value is null/empty."""
        return mapping.get(key) or {}

    def get_prepare_base_model_config(self) -> Dict[str, Any]:
        """Return ``root_dir`` and ``base_model_path`` from the ``prepare_base_model`` config section.

        Missing keys fall back to paths under ``artifacts/prepareBaseModel``.
        """
        pb = self._section(self._config, "prepare_base_model")
        return {
            "root_dir": pb.get("root_dir", "artifacts/prepareBaseModel"),
            "base_model_path": pb.get("base_model_path", "artifacts/prepareBaseModel/base_model.pkl")
        }

    def get_xgboost_config(self) -> Dict[str, Any]:
        """Assemble the XGBoost settings from the config and params files.

        Returns:
            A dict with these keys:
              * ``root_dir``, ``model_file``, ``model_path`` (strings; ``model_path``
                is ``root_dir`` joined with ``model_file``),
              * ``params``: estimator hyper-parameters, cast to int/float,
              * ``train``: split / boosting settings from the ``TRAINING`` section,
              * ``tuning``: search settings from ``TUNING.XGBOOST``.

        Raises:
            ValueError / TypeError: If a numeric setting cannot be cast to int or float.
        """
        xcfg = self._section(self._config, "xgboost")
        xp = self._section(self._params, "XGBOOST")
        tuning_cfg = self._section(self._section(self._params, "TUNING"), "XGBOOST")
        train_cfg = self._section(self._params, "TRAINING")

        root_dir = Path(xcfg.get("root_dir", "artifacts/xgboost"))
        model_file = xcfg.get("model_file", "xgb_model.pkl")
        model_path = root_dir / model_file

        def hyper(name, default):
            # The upper-case key wins; the lower-case spelling is accepted as a fallback.
            return xp.get(name.upper(), xp.get(name, default))

        # NOTE(review): `seed` is forwarded under that name; the get_params() output
        # later in this notebook shows it next to `random_state: None`. Confirm which
        # of the two the installed xgboost version honours.
        params = {
            "objective": hyper("objective", "binary:logistic"),
            "n_estimators": int(hyper("n_estimators", 100)),
            "learning_rate": float(hyper("learning_rate", 0.1)),
            "max_depth": int(hyper("max_depth", 6)),
            "subsample": float(hyper("subsample", 1.0)),
            "colsample_bytree": float(hyper("colsample_bytree", 1.0)),
            "reg_alpha": float(hyper("reg_alpha", 0.0)),
            "reg_lambda": float(hyper("reg_lambda", 1.0)),
            "seed": int(hyper("seed", 42)),
            "n_jobs": int(hyper("n_jobs", -1)),
            "verbosity": int(hyper("verbosity", 1))
        }

        train = {
            "test_size": float(train_cfg.get("TEST_SIZE", 0.2)),
            # Defaults to the estimator count when no explicit round count is configured.
            "num_boost_round": int(train_cfg.get("NUM_BOOST_ROUND", params["n_estimators"])),
            "early_stopping_rounds": int(train_cfg.get("EARLY_STOPPING_ROUNDS", 50)),
            "eval_metric": train_cfg.get("EVAL_METRIC", "logloss"),
            "fit_kwargs": train_cfg.get("FIT_KWARGS", {})
        }

        tuning = {
            "enabled": bool(tuning_cfg.get("ENABLED", False)),
            "search": tuning_cfg.get("SEARCH", "grid"),
            "param_grid": tuning_cfg.get("PARAM_GRID", tuning_cfg.get("param_grid", {})),
            "cv": int(tuning_cfg.get("CV", 3))
        }

        return {
            "root_dir": str(root_dir),
            "model_file": model_file,
            "model_path": str(model_path),
            "params": params,
            "train": train,
            "tuning": tuning
        }

In [7]:
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Dict
from src.floodClassifier import logger
from xgboost import XGBClassifier
from src.floodClassifier.constants import *

class PrepareBaseModel:
    """Train an XGBoost classifier on the ingested flood CSV and pickle it to disk."""

    # Dataset location relative to the working directory. Built with pathlib so
    # the separator is correct on every OS, not only on Windows.
    DATA_PATH = Path("artifacts") / "data_ingestion" / "FloodPrediction.csv"
    # Name of the label column in the CSV.
    TARGET_COLUMN = "Flood?"
    # Columns fed to the model, in training order.
    FEATURE_COLUMNS = ["Max_Temp", "Min_Temp", "Rainfall", "Relative_Humidity", "Wind_Speed"]

    def __init__(self) -> None:
        pass

    def train_and_save_model(self):
        """Load the dataset, fit ``XGBClassifier`` and pickle the fitted model.

        Hyper-parameters, the validation fraction and the output path come from
        ``ConfigurationManager().get_xgboost_config()``. The output directory is
        created if it does not exist.

        Raises:
            KeyError: If the target column or any feature column is missing from the CSV.
            Exception: Any other failure is logged with its traceback and re-raised.
        """
        csv_path = self.DATA_PATH
        target_col = self.TARGET_COLUMN
        feature_cols = list(self.FEATURE_COLUMNS)

        try:
            # --- Load config ---
            config = ConfigurationManager()
            xgb_cfg = config.get_xgboost_config()
            params = xgb_cfg["params"]
            train_cfg = xgb_cfg["train"]

            model_path = Path(xgb_cfg["model_path"])
            model_path.parent.mkdir(parents=True, exist_ok=True)

            # --- Load dataset ---
            df = pd.read_csv(csv_path)
            if target_col not in df.columns:
                raise KeyError(f"Target column '{target_col}' not found in dataset")
            missing_features = [col for col in feature_cols if col not in df.columns]
            if missing_features:
                raise KeyError(f"Feature column(s) {missing_features} not found in dataset")

            # Missing or infinite labels are mapped to 0 before the integer cast.
            # NOTE(review): this silently labels such rows as class 0 — confirm that is intended.
            y = df[target_col].replace([np.nan, np.inf, -np.inf], 0).astype(int)
            X = df[feature_cols]

            # --- Train/test split ---
            # shuffle=False keeps file order: the trailing `test_size` fraction of
            # rows becomes the validation set.
            from sklearn.model_selection import train_test_split
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=train_cfg["test_size"], shuffle=False
            )

            # --- Initialize and train model ---
            # NOTE(review): train_cfg also carries early_stopping_rounds, eval_metric
            # and fit_kwargs, none of which are applied here; the validation set is
            # only used for per-round metric logging.
            model = XGBClassifier(**params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)]
            )

            # --- Save model ---
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            logger.info(f"XGBoost model trained and saved to {model_path}")

        except Exception:
            # Record the traceback in the project log, then let the caller see the failure.
            logger.exception("Pipeline run failed")
            raise

In [8]:
# Run the training step: reads the CSV, fits the classifier and pickles it.
# The per-round `validation_0-logloss` lines shown below are printed while this cell runs.
prepare_base_model = PrepareBaseModel()
prepare_base_model.train_and_save_model()

[0]	validation_0-logloss:0.42463
[1]	validation_0-logloss:0.37510
[2]	validation_0-logloss:0.33759
[3]	validation_0-logloss:0.30877
[4]	validation_0-logloss:0.28474
[5]	validation_0-logloss:0.26547
[6]	validation_0-logloss:0.24893
[7]	validation_0-logloss:0.23492
[8]	validation_0-logloss:0.22305
[9]	validation_0-logloss:0.21327
[10]	validation_0-logloss:0.20474
[11]	validation_0-logloss:0.19737
[12]	validation_0-logloss:0.19152
[13]	validation_0-logloss:0.18598
[14]	validation_0-logloss:0.18126
[15]	validation_0-logloss:0.17724
[16]	validation_0-logloss:0.17325
[17]	validation_0-logloss:0.17038
[18]	validation_0-logloss:0.16770
[19]	validation_0-logloss:0.16559
[20]	validation_0-logloss:0.16356
[21]	validation_0-logloss:0.16199
[22]	validation_0-logloss:0.16041
[23]	validation_0-logloss:0.15910
[24]	validation_0-logloss:0.15804
[25]	validation_0-logloss:0.15690
[26]	validation_0-logloss:0.15614
[27]	validation_0-logloss:0.15543
[28]	validation_0-logloss:0.15493
[29]	validation_0-loglos

In [10]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Take the model location and the validation fraction from the same configuration
# the training step used. Hard-coding them here would let the two drift apart, and
# a different split ratio would score the model on rows it was trained on.
xgb_cfg = ConfigurationManager().get_xgboost_config()

# Path to your saved model
MODEL_PATH = Path(xgb_cfg["model_path"])

# Decision threshold applied to the predicted flood probability.
THRESHOLD = 0.5

# Load the trained model.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# Built with pathlib so the path resolves on any OS, not only on Windows.
csv_path = Path("artifacts") / "data_ingestion" / "FloodPrediction.csv"
df = pd.read_csv(csv_path)
target_col = "Flood?"

# Same label clean-up, feature columns and unshuffled split as in training, so
# X_val / y_val are exactly the rows the model was not fitted on.
y = df[target_col].replace([np.nan, np.inf, -np.inf], 0).astype(int)
X = df[["Max_Temp", "Min_Temp", "Rainfall", "Relative_Humidity", "Wind_Speed"]]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=xgb_cfg["train"]["test_size"], shuffle=False
)

# --- Get predicted probabilities on validation set ---
y_prob = model.predict_proba(X_val)[:, 1]

# --- Threshold the probabilities into hard labels ---
y_pred_label = (y_prob >= THRESHOLD).astype(int)
y_true_label = y_val.astype(int)

# --- Metrics ---
print("accuracy", accuracy_score(y_true_label, y_pred_label))
print("precision", precision_score(y_true_label, y_pred_label, zero_division=0))
print("recall", recall_score(y_true_label, y_pred_label, zero_division=0))
print("f1", f1_score(y_true_label, y_pred_label, zero_division=0))
print("confusion:\n", confusion_matrix(y_true_label, y_pred_label))

accuracy 0.9437819420783645
precision 0.8
recall 0.9586374695863747
f1 0.872163807415606
confusion:
 [[3090  197]
 [  34  788]]


In [15]:
# Reload the pickled model and display its hyper-parameters.
# Relies on `MODEL_PATH` and `pickle` having been defined by another cell.
# NOTE(review): this cell's execution count (15) is out of sequence with its
# neighbours (10 and 12); re-run the notebook top to bottom to confirm it still works.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

model.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 1.0,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.1,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 6,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 1.0,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': 1,
 'seed': 42}

In [12]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

# Path to your saved model. Built with pathlib so it resolves on any OS; a
# backslash-separated string is only a valid path on Windows.
MODEL_PATH = Path("artifacts") / "xgboost" / "xgb_model.pkl"

# Load the trained model.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# Define the feature order (must match training)
FEATURES = ["Max_Temp", "Min_Temp", "Rainfall", "Relative_Humidity", "Wind_Speed"]

def get_manual_input(features=None):
    """
    Prompt on stdin for one numeric value per feature.

    A non-numeric entry is reported and the same feature is asked again, so a
    single typo does not discard the values already entered.

    Args:
        features: Ordered feature names to prompt for. When omitted, the
            module-level ``FEATURES`` list is used.

    Returns:
        A DataFrame with one row whose columns are the feature names, in order.
    """
    names = FEATURES if features is None else list(features)
    values = []
    for feat in names:
        while True:
            raw = input(f"Enter {feat}: ")
            try:
                values.append(float(raw))
                break
            except ValueError:
                print(f"'{raw}' is not a number; please enter a numeric value for {feat}.")
    # Create a DataFrame with the same feature names
    return pd.DataFrame([values], columns=names)

def predict_flood(input_df, threshold=0.5, classifier=None):
    """
    Generate a prediction for the first row of ``input_df``.

    Args:
        input_df: Feature rows to score; only the first row's result is returned.
        threshold: Probability at or above which the label is 1.
        classifier: Object exposing ``predict_proba``. When omitted, the
            module-level ``model`` is used.

    Returns:
        ``(prob, label)``: the class-1 probability as a plain ``float`` and the
        binary label as an ``int``.

    Raises:
        ValueError: If the classifier returns no predictions (empty input).
    """
    clf = model if classifier is None else classifier
    flood_probs = clf.predict_proba(input_df)[:, 1]  # probability of Flood (class 1)
    if len(flood_probs) == 0:
        raise ValueError("input_df must contain at least one row")
    # Convert the NumPy scalar to a built-in float so callers get a plain Python value.
    prob = float(flood_probs[0])
    label = int(prob >= threshold)
    return prob, label

# Interactive driver: prompt for one set of feature values, score it, print the result.
# NOTE: inside a notebook kernel `__name__` is "__main__", so this block runs on
# every execution of the cell and waits for keyboard input.
if __name__ == "__main__":
    # Step 1: Get manual input
    new_input = get_manual_input()

    # Step 2: Predict
    prob, label = predict_flood(new_input)

    # Step 3: Show results
    print("\n--- Prediction Result ---")
    # to_dict(orient='records')[0] turns the single row into a {feature: value} dict.
    print(f"Input features: {new_input.to_dict(orient='records')[0]}")
    print(f"Flood probability: {prob:.3f}")
    print(f"Predicted label: {'Flood' if label == 1 else 'No Flood'}")


--- Prediction Result ---
Input features: {'Max_Temp': 33.0, 'Min_Temp': 26.0, 'Rainfall': 575.0, 'Relative_Humidity': 85.0, 'Wind_Speed': 1.57}
Flood probability: 0.888
Predicted label: Flood
