In [None]:

# 5️⃣ Load merged dataset (if already created)
MERGED_PATH = DATA_DIR / "merged_dataset.csv"
WEATHER_PATH = DATA_DIR / "weather_uttarakhand.csv" # Assuming weather data is in the same directory
FIRMS_PATH = DATA_DIR / "firms_uttarakhand.csv" # Assuming FIRMS data is in the same directory


if MERGED_PATH.exists():
    df = pd.read_csv(MERGED_PATH, parse_dates=["date"], low_memory=False)
    # Recalculate dryness_idx and temp_range when loading from CSV
    df["dryness_idx"] = df["temperature_2m_max"] - 0.05 * df["relative_humidity_2m_mean"]
    df["temp_range"] = df["temperature_2m_max"] - df["temperature_2m_min"]
    print("✅ Loaded merged dataset:", df.shape)
else:
    # fallback: create from weather + FIRMS
    print("⚠️ No merged file found! Creating new one...")
    firms = pd.read_csv(FIRMS_PATH, parse_dates=["acq_date"])
    weather = pd.read_csv(WEATHER_PATH, parse_dates=["date"])
    firms["acq_date"] = pd.to_datetime(firms["acq_date"]).dt.normalize()
    firms["fire_today"] = 1

    # Round coordinates before merging
    firms["latitude"] = firms["latitude"].round(2)
    firms["longitude"] = firms["longitude"].round(2)
    weather["lat"] = weather["lat"].round(2)
    weather["lon"] = weather["lon"].round(2)

    df = weather.merge(firms, left_on=["lat","lon","date"], right_on=["latitude","longitude","acq_date"], how="left")
    df["fire_today"] = df["fire_today"].fillna(0).astype(int)
    df = df.sort_values(["tile_id","date"])
    df["risk_next7d"] = df.groupby("tile_id")["fire_today"].shift(-7).rolling(7,min_periods=1).max().fillna(0).astype(int)
    df["dryness_idx"] = df["temperature_2m_max"] - 0.05 * df["relative_humidity_2m_mean"]
    df["temp_range"] = df["temperature_2m_max"] - df["temperature_2m_min"]
    df.to_csv(MERGED_PATH, index=False)
    print("✅ Merged dataset saved:", df.shape)

⚠️ No merged file found! Creating new one...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/forest_fire_ai/data/firms_uttarakhand.csv'

In [None]:
# Recreate dryness and temp range columns if missing
if 'dryness_idx' not in df.columns:
    df["dryness_idx"] = df["temperature_2m_max"] - 0.05 * df["relative_humidity_2m_mean"]

if 'temp_range' not in df.columns:
    df["temp_range"] = df["temperature_2m_max"] - df["temperature_2m_min"]

print("✅ Added derived features: dryness_idx, temp_range")
print(df[["dryness_idx","temp_range"]].head())


✅ Added derived features: dryness_idx, temp_range
   dryness_idx  temp_range
0      16.9605       13.24
1      16.9605       13.24
2      16.9605       13.24
3      16.5575       12.73
4      16.5575       12.73


In [None]:
# --- Train ML Models ---
features = [
    "temperature_2m_mean","temperature_2m_max","temperature_2m_min",
    "relative_humidity_2m_mean","windspeed_10m_mean","precipitation_sum",
    "dryness_idx","temp_range"
]
df = df.dropna(subset=features)
X, y = df[features], df["risk_next7d"].fillna(0).astype(int)

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
Xr, yr = SMOTE(random_state=42).fit_resample(Xtr, ytr)

models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"),
    "GradientBoost": GradientBoostingClassifier(n_estimators=400, learning_rate=0.05, max_depth=6)
}

best_auc, best_model = 0, None
for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(Xr, yr)
    prob = model.predict_proba(Xte)[:,1]
    pred = (prob > 0.5).astype(int)
    auc = roc_auc_score(yte, prob)
    acc = accuracy_score(yte, pred)
    f1  = f1_score(yte, pred)
    print(f"{name}: AUC={auc:.3f} | ACC={acc:.3f} | F1={f1:.3f}")
    if auc > best_auc:
        best_auc, best_model = auc, model

print("\n✅ Best Model:", type(best_model).__name__)
print("🔥 Training Complete with High Accuracy!")



🔹 Training RandomForest...
RandomForest: AUC=0.866 | ACC=0.941 | F1=0.968

🔹 Training GradientBoost...
GradientBoost: AUC=0.895 | ACC=0.901 | F1=0.945

✅ Best Model: GradientBoostingClassifier
🔥 Training Complete with High Accuracy!


In [2]:
import joblib
joblib.dump(best_model, "/content/drive/MyDrive/Colab Notebooks/data/model.pkl")
print("✅ Model saved!")


NameError: name 'best_model' is not defined

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# --- Train ML Models ---
features = [
    "temperature_2m_mean","temperature_2m_max","temperature_2m_min",
    "relative_humidity_2m_mean","windspeed_10m_mean","precipitation_sum",
    "dryness_idx","temp_range"
]
# Ensure df is defined, if not, load it.
if 'df' not in locals() or df is None:
    # Assuming MERGED_PATH and related variables are defined in the notebook
    # If not, you might need to include the data loading logic here as well
    try:
        df = pd.read_csv(MERGED_PATH, parse_dates=["date"], low_memory=False)
        df["dryness_idx"] = df["temperature_2m_max"] - 0.05 * df["relative_humidity_2m_mean"]
        df["temp_range"] = df["temperature_2m_max"] - df["temperature_2m_min"]
        print("✅ Loaded merged dataset:", df.shape)
    except FileNotFoundError:
         print("Error: merged_dataset.csv not found. Please run the data loading cell first.")
         # Exit or handle the error appropriately if the file is essential
         exit() # Exiting the cell execution if the file is not found


df = df.dropna(subset=features)
X, y = df[features], df["risk_next7d"].fillna(0).astype(int)


Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
Xr, yr = SMOTE(random_state=42).fit_resample(Xtr, ytr)

models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"),
    "GradientBoost": GradientBoostingClassifier(n_estimators=400, learning_rate=0.05, max_depth=6)
}

best_auc, best_model = 0, None
for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(Xr, yr)
    prob = model.predict_proba(Xte)[:,1]
    pred = (prob > 0.5).astype(int)
    auc = roc_auc_score(yte, prob)
    acc = accuracy_score(yte, pred)
    f1  = f1_score(yte, pred)
    print(f"{name}: AUC={auc:.3f} | ACC={acc:.3f} | F1={f1:.3f}")
    if auc > best_auc:
        best_auc, best_model = auc, model

print("\n✅ Best Model:", type(best_model).__name__)
print("🔥 Training Complete with High Accuracy!")

# --- Plot Feature Importance ---
if best_model is not None:
    feat_imp = pd.Series(best_model.feature_importances_, index=features).sort_values(ascending=True)
    feat_imp.plot(kind='barh', figsize=(8,5))
    plt.title('Feature Importance')
    plt.show()
else:
    print("Error: No best model was trained.")

NameError: name 'MERGED_PATH' is not defined