<a href="https://colab.research.google.com/github/2403A51L34/hack01/blob/main/Copy_of_Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# smart_health_pipeline.py
# Easy, robust pipeline for early warning using your uploaded CSV.
# Set `cases_col_override` below to the exact column name if auto-detection fails.

import os
from pathlib import Path
import zipfile
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, mean_absolute_error, mean_squared_error
import joblib

# ---------- Configuration ----------
# Directory that is searched for the uploaded archive (or a loose CSV).
DATA_DIR = Path("/content/")
# File name of the uploaded zip archive, resolved relative to DATA_DIR.
ZIP_NAME = "archive (2).zip"
# Folder the CSV from the archive is unpacked into; it is created right away
# (DATA_DIR itself must already exist, parents are not created).
EXTRACT_DIR = DATA_DIR.joinpath("extracted_dataset")
EXTRACT_DIR.mkdir(parents=False, exist_ok=True)
# To bypass auto-detection, set this to the exact case-count column name,
# e.g. cases_col_override = "Number of cases per 100,000 people"
cases_col_override = None
# ---------- End configuration ----------

# Locate the dataset: prefer the uploaded zip archive; if it is absent, fall
# back to a loose CSV lying directly in DATA_DIR.
zip_path = DATA_DIR / ZIP_NAME
if not zip_path.exists():
    # Sorted so the chosen file does not depend on filesystem listing order.
    csvs = sorted(DATA_DIR.glob("*.csv"))
    if not csvs:
        raise FileNotFoundError(f"Could not find {ZIP_NAME} or any CSV in {DATA_DIR}.")
    csv_path = csvs[0]
else:
    # extract the first real CSV found inside the zip
    with zipfile.ZipFile(zip_path, 'r') as z:
        csvs_in_zip = [
            f for f in z.namelist()
            if f.lower().endswith('.csv')
            # Archives created on macOS carry "__MACOSX/._name.csv" resource-fork
            # entries that end in .csv but are not CSV data.
            and not f.startswith('__MACOSX/')
            and not f.rsplit('/', 1)[-1].startswith('._')
        ]
        if not csvs_in_zip:
            raise FileNotFoundError("No CSV files found inside the zip.")
        # ZipFile.extract returns the path it actually wrote (member names are
        # sanitised on extraction), so use that instead of re-joining the name.
        csv_path = Path(z.extract(csvs_in_zip[0], path=EXTRACT_DIR))

print("Using CSV:", csv_path)
df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())

# Detect the case-count column (unless the user supplied an override) and
# expose it as the numeric column df['cases_numeric'].
cases_col = None
if cases_col_override:
    if cases_col_override in df.columns:
        cases_col = cases_col_override
    else:
        raise ValueError(f"cases_col_override '{cases_col_override}' not found in columns.")
else:
    # common names to check, in priority order
    candidates = ["cases", "case_count", "count", "cases_reported", "patients", "no_of_cases", "num_cases",
                  "cases_per_100000", "cases per 100000", "number of cases"]
    # Compare on a stripped, lower-cased form so "Cases" or " cases " match too.
    # setdefault keeps the first column when two normalise to the same key.
    normalized = {}
    for col in df.columns:
        normalized.setdefault(str(col).strip().lower(), col)
    cases_col = next((normalized[c] for c in candidates if c in normalized), None)
    if cases_col is None:
        # fallback heuristic: any column name containing 'case', 'count' or 'patient'
        for col in df.columns:
            name = str(col).lower()
            if not any(k in name for k in ("case", "count", "patient")):
                continue
            # A substring hit alone is not enough ("count" is inside "country"):
            # only accept columns that hold at least one numeric value.
            if pd.to_numeric(df[col], errors='coerce').notna().any():
                cases_col = col
                break

if cases_col is None:
    print("WARNING: No clear cases/count column detected. You should set `cases_col_override` to the correct column name.")
    # Still continue: create a numeric dummy column (all zeros) so the rest of
    # the pipeline can run.
    df['cases_numeric'] = 0
else:
    # Unparseable or missing entries are counted as 0 cases.
    df['cases_numeric'] = pd.to_numeric(df[cases_col], errors='coerce').fillna(0)
    print("Using cases column:", cases_col)

# define outbreak threshold (75th percentile of the case counts)
threshold = float(np.percentile(df['cases_numeric'], 75))
print("Outbreak threshold (75th percentile):", threshold)

# features: simple lags and a rolling mean over the preceding rows
# NOTE(review): this treats row order as time order — confirm the CSV is sorted.
feature_cols = []
for lag in [1, 3, 7]:
    lag_name = f'lag_{lag}'
    # .bfill() replaces the deprecated fillna(method='bfill'); the first `lag`
    # rows have no history and are back-filled from the first available value.
    df[lag_name] = df['cases_numeric'].shift(lag).bfill()
    feature_cols.append(lag_name)
# shift(1) before rolling so the window covers only earlier rows; without it
# the mean would contain the current row's count, i.e. the value being predicted.
df['rolling_mean_7'] = (
    df['cases_numeric'].shift(1).rolling(window=7, min_periods=1).mean().bfill()
)
feature_cols.append('rolling_mean_7')

# include any numeric environmental columns we can guess (rain, temp, water, ph)
env_keywords = ['rain', 'temp', 'water', 'ph', 'quality', 'turbidity', 'chlorine']
env_candidates = [
    c for c in df.columns
    # never feed the case-count column itself back in as a feature
    if c != cases_col and any(k in str(c).lower() for k in env_keywords)
]
for c in env_candidates:
    numeric = pd.to_numeric(df[c], errors='coerce')
    if not numeric.notna().any():
        # purely textual column: it would only become a constant-zero feature
        continue
    df[f'{c}_num'] = numeric.fillna(0)
    feature_cols.append(f'{c}_num')

# feature_cols lists exactly the columns generated above, so unrelated raw
# columns that merely share a prefix/suffix are not picked up.
print("Feature columns:", feature_cols)

# Feature matrix; any gaps still left in the feature columns become 0.
X = df.loc[:, feature_cols].fillna(0)
# A binary outbreak label is only built when the case counts take more than
# one distinct value (all-zero or single-value data gives no label).
outbreak_present = df['cases_numeric'].nunique() > 1
y = None
if outbreak_present:
    # 1 where the count is strictly above the threshold, else 0
    df['outbreak'] = df['cases_numeric'].gt(threshold).astype(int)
    y = df['outbreak']

# Train & evaluate.
# With a usable binary outbreak label a classifier is trained; otherwise the
# script falls back to regressing the raw case counts and flags predictions
# above the threshold as alerts.
if y is None or y.nunique() < 2:
    print("No binary outbreak target available. Falling back to regression to predict case counts.")
    X_train, X_test, y_train, y_test = train_test_split(X, df['cases_numeric'], test_size=0.2, random_state=42)
    reg = RandomForestRegressor(n_estimators=100, random_state=42)
    reg.fit(X_train, y_train)
    preds = reg.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # mean_squared_error returns the MSE; take the root so the printed value
    # really is the RMSE (the `squared=False` shortcut is gone in new sklearn).
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    print("Regression MAE:", mae, "RMSE:", rmse)
    # flag predictions above threshold as alerts
    alerts = X_test.copy()
    alerts['pred_cases'] = preds
    alerts['pred_alert'] = (alerts['pred_cases'] > threshold).astype(int)
    alerts['actual_cases'] = y_test.values
    # Save regressor together with the feature list and threshold it was built with
    joblib.dump({'regressor': reg, 'features': feature_cols, 'threshold': threshold}, DATA_DIR / "outbreak_regressor.joblib")
    print("Saved regressor to:", DATA_DIR / "outbreak_regressor.joblib")
    # Show top predicted alerts
    print("Top predicted alerts (by predicted cases):")
    print(alerts.sort_values('pred_cases', ascending=False).head(10))
    # The hint only applies here, where classification mode was not reached.
    print("Pipeline finished. Adjust 'cases_col_override' to point to the correct case-count column to enable classification-mode outbreak warnings.")
else:
    print("Binary outbreak target available. Running classification.")
    # y already holds the 0/1 outbreak label (cases above the threshold).
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
    except ValueError:
        # Stratification is impossible on very small or very imbalanced data
        # (e.g. a class with a single row); fall back to a plain random split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # Safe probability extraction: look up the column of the positive class
    # instead of assuming it sits at index 1.
    seen_classes = list(clf.classes_)
    if len(seen_classes) > 1 and 1 in seen_classes:
        proba = clf.predict_proba(X_test)[:, seen_classes.index(1)]
    else:
        proba = preds.astype(float)  # fallback: training split held one class only
    print("Classification report:")
    print(classification_report(y_test, preds, zero_division=0))
    try:
        print("ROC AUC:", roc_auc_score(y_test, proba))
    except ValueError:
        # raised when the test split contains a single class
        print("ROC AUC not available for this split.")
    print("Confusion matrix:\n", confusion_matrix(y_test, preds))
    joblib.dump({'model': clf, 'features': feature_cols, 'threshold': threshold}, DATA_DIR / "outbreak_classifier.joblib")
    print("Saved classifier to:", DATA_DIR / "outbreak_classifier.joblib")
    alerts = X_test.copy()
    alerts['pred_proba'] = proba
    alerts['pred_outbreak'] = (alerts['pred_proba'] > 0.5).astype(int)
    alerts['actual_outbreak'] = y_test.values
    print("Top predicted outbreak probabilities:")
    print(alerts.sort_values('pred_proba', ascending=False).head(10))
    print("Pipeline finished.")

FileNotFoundError: Could not find archive (2).zip or any CSV in /content.