In [14]:
import joblib
import pickle

clf = joblib.load('disease_classifier.pkl')
lbl_disease = joblib.load('label_encoder_disease.pkl')
model = joblib.load('case_regressor.pkl')
lbl_district = joblib.load('label_encoder_district.pkl')

# Using pickle
with open('disease_classifier.pkl', 'rb') as f:
    clf = pickle.load(f)

with open('label_encoder_disease.pkl', 'rb') as f:
    lbl_disease = pickle.load(f)

In [15]:
obj = joblib.load('label_encoder_disease.pkl')
print(type(obj))

<class 'sklearn.preprocessing._label.LabelEncoder'>


In [16]:
lbl_disease = joblib.load('label_encoder_disease.pkl')
print(type(lbl_disease)) 

<class 'sklearn.preprocessing._label.LabelEncoder'>


In [18]:
import pandas as pd
import numpy as np
import joblib

# -------------------------
# 1. Load models and encoders
# -------------------------
model = joblib.load('case_regressor.pkl')                # Regression model
clf = joblib.load('disease_classifier.pkl')              # Classification model
lbl_district = joblib.load('label_encoder_district.pkl') # District encoder
lbl_disease = joblib.load('label_encoder_disease.pkl')   # Disease encoder

# -------------------------
# 2. Load new data
# -------------------------
df_new = pd.read_csv(r"C:\Users\PINKY\OneDrive\Desktop\SIH\data\new_data.csv")  # Update this path

# Ensure date column is in datetime format
df_new['date'] = pd.to_datetime(df_new['date'])
df_new = df_new.sort_values('date').reset_index(drop=True)

# -------------------------
# 3. Preprocess new data
# -------------------------

# Encode district as usual (assuming all districts are known)
df_new['district_enc'] = lbl_district.transform(df_new['district'])

# Handle disease encoding with fallback for unknown diseases
disease_map = {label: idx for idx, label in enumerate(lbl_disease.classes_)}
default_label = -1  # fallback for unknown diseases

df_new['disease_enc'] = df_new['disease'].map(disease_map).fillna(default_label).astype(int)

# Log unknown diseases
unknown_diseases = set(df_new.loc[df_new['disease_enc'] == -1, 'disease'])
if unknown_diseases:
    print("⚠ Warning: Unknown diseases found and encoded as -1:", unknown_diseases)

# Create time-based features
df_new['dayofyear'] = df_new['date'].dt.dayofyear
df_new['month'] = df_new['date'].dt.month
df_new['week'] = df_new['date'].dt.isocalendar().week.astype(int)
df_new['weekday'] = df_new['date'].dt.weekday
df_new['is_weekend'] = df_new['weekday'].isin([5, 6]).astype(int)

# Create lag and rolling features
for lag in [1, 2, 3, 7, 14]:
    df_new[f'lag_{lag}'] = df_new.groupby(['district', 'disease'])['cases'].shift(lag)

for window in [3, 7, 14]:
    df_new[f'roll_mean_{window}'] = df_new.groupby(['district', 'disease'])['cases'].shift(1).rolling(window).mean()
    df_new[f'roll_std_{window}'] = df_new.groupby(['district', 'disease'])['cases'].shift(1).rolling(window).std()

df_new['cases_diff_1'] = df_new.groupby(['district', 'disease'])['cases'].diff(1)

# Drop rows where lag or rolling features are NaN
df_new = df_new.dropna().reset_index(drop=True)

# -------------------------
# 4. Prepare feature columns
# -------------------------
env_cols = [
    'turbidity_NTU','water_surface_temp_C','chlorophyll_a_mg_m3',
    'NDWI','NDVI','EVI','SPM_mg_L',
    'surface_reflectance_B3','surface_reflectance_B4','surface_reflectance_B5'
]

feature_cols = (
    ['district_enc','disease_enc','dayofyear','month','week','weekday','is_weekend'] +
    [f'lag_{l}' for l in [1,2,3,7,14]] +
    [f'roll_mean_{w}' for w in [3,7,14]] +
    [f'roll_std_{w}' for w in [3,7,14]] +
    ['cases_diff_1'] + env_cols
)

X_new = df_new[feature_cols]

# -------------------------
# 5. Make predictions
# -------------------------

# Regression – predict case count
df_new['predicted_cases'] = model.predict(X_new)

# Classification – predict disease name
predicted_classes = clf.predict(X_new)

# Always decode the predictions — even if input disease was unknown
df_new['predicted_disease'] = lbl_disease.inverse_transform(predicted_classes)

# -------------------------
# 6. Display or save results
# -------------------------
print(df_new[['date','district','disease','cases','predicted_cases','predicted_disease']].head())

# Optionally, save to file
df_new.to_excel('predictions_results.xlsx', index=False)

        date district        disease  cases  predicted_cases predicted_disease
0 2023-02-05   Kamrup  Leptospirosis      4         4.019889     Leptospirosis
1 2023-02-08   Kamrup  Leptospirosis      3         2.906447     Leptospirosis
2 2023-02-09   Kamrup        Cholera      3         2.766412           Cholera
3 2023-02-09   Kamrup        Cholera      9         8.848284           Cholera
4 2023-02-09   Kamrup  Leptospirosis      1         1.095954     Leptospirosis
