In [25]:
import joblib
import pickle

# Load the fitted artifacts used by this notebook.
# Each file is read exactly once with joblib.load. A second pickle.load of
# 'label_encoder_disease.pkl' would only rebind lbl_disease, and plain pickle
# is not guaranteed to read files written by joblib.dump.
# NOTE: loading these files unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
lbl_disease = joblib.load('label_encoder_disease.pkl')
model = joblib.load('case_regressor.pkl')
lbl_district = joblib.load('label_encoder_district.pkl')

In [27]:
import pandas as pd
import numpy as np
import joblib

# -------------------------
# 1. Load models and encoders
# -------------------------
# Both artifacts are read from the working directory, in this order:
# the regression model first, then the district encoder.
model, lbl_district = (
    joblib.load(artifact_file)
    for artifact_file in ('case_regressor.pkl', 'label_encoder_district.pkl')
)

# -------------------------
# 2. Load new data
# -------------------------
# Read the CSV, convert 'date' to datetime, then order the rows
# chronologically and renumber the index from 0.
# The path below is an absolute, machine-specific location.
df_new = (
    pd.read_csv(r"C:\Users\PINKY\OneDrive\Desktop\SIH\data\new_data.csv")  # Update this path
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# -------------------------
# 3. Preprocess new data
# -------------------------

# Encode district with the fitted encoder (assumes every district in df_new
# was seen when the encoder was fitted).
df_new['district_enc'] = lbl_district.transform(df_new['district'])


# Create time-based features
df_new['dayofyear'] = df_new['date'].dt.dayofyear
df_new['month'] = df_new['date'].dt.month
df_new['week'] = df_new['date'].dt.isocalendar().week.astype(int)  # ISO week number
df_new['weekday'] = df_new['date'].dt.weekday                      # Monday=0 ... Sunday=6
df_new['is_weekend'] = df_new['weekday'].isin([5, 6]).astype(int)

# Create lag and rolling features, each computed within a single district.
# NOTE(review): lags count rows, not calendar days; if a district has several
# rows per date, confirm whether another grouping key is needed.
cases_by_district = df_new.groupby('district')['cases']

for lag in [1, 2, 3, 7, 14]:
    df_new[f'lag_{lag}'] = cases_by_district.shift(lag)

# The shift and the rolling window must both run inside the group.
# groupby(...).shift(1) returns an ordinary Series, so chaining .rolling()
# onto it would let windows span rows belonging to different districts.
# shift(1) keeps the current row's own 'cases' out of its window.
# NOTE(review): if the training pipeline built these columns with the
# ungrouped rolling, regenerate the training features and retrain so the
# model sees consistently defined inputs.
for window in [3, 7, 14]:
    df_new[f'roll_mean_{window}'] = cases_by_district.transform(
        lambda s, w=window: s.shift(1).rolling(w).mean()
    )
    df_new[f'roll_std_{window}'] = cases_by_district.transform(
        lambda s, w=window: s.shift(1).rolling(w).std()
    )

# NOTE(review): cases_diff_1 is (current cases - previous cases) and lag_1 is
# the previous cases, so lag_1 + cases_diff_1 equals the current 'cases' value
# that the model is asked to predict. It is kept because the saved model
# expects this column; removing the leakage requires retraining.
df_new['cases_diff_1'] = cases_by_district.diff(1)

# Drop rows where lag or rolling features are NaN
# (dropna() with no subset also drops rows with NaN in any other column).
df_new = df_new.dropna().reset_index(drop=True)

# -------------------------
# 4. Prepare feature columns
# -------------------------
# Environmental measurement columns, taken from df_new as-is.
env_cols = [
    'turbidity_NTU',
    'water_surface_temp_C',
    'chlorophyll_a_mg_m3',
    'NDWI',
    'NDVI',
    'EVI',
    'SPM_mg_L',
    'surface_reflectance_B3',
    'surface_reflectance_B4',
    'surface_reflectance_B5',
]

# Full ordered list of model inputs: encoded district and calendar features,
# then lags, rolling means, rolling standard deviations, the first
# difference, and finally the environmental columns.
feature_cols = [
    'district_enc', 'dayofyear', 'month', 'week', 'weekday', 'is_weekend',
    *(f'lag_{step}' for step in (1, 2, 3, 7, 14)),
    *(f'roll_mean_{span}' for span in (3, 7, 14)),
    *(f'roll_std_{span}' for span in (3, 7, 14)),
    'cases_diff_1',
    *env_cols,
]

# Feature matrix: the model input columns, in feature_cols order.
X_new = df_new.loc[:, feature_cols]

# -------------------------
# 5. Make predictions
# -------------------------

# Regression – predicted case count, stored alongside the inputs
predicted = model.predict(X_new)
df_new['predicted_cases'] = predicted

# -------------------------
# 6. Display or save results
# -------------------------
# Preview the first rows: actual vs. predicted cases
print(df_new.loc[:, ['date','district','cases','predicted_cases']].head())

# Write the full frame, predictions included, to an Excel workbook
df_new.to_excel(
    'predictions_results.xlsx',
    index=False,
)

        date district  cases  predicted_cases
0 2023-01-07   Kamrup      6         6.173376
1 2023-01-08   Kamrup      1         1.183913
2 2023-01-08   Kamrup      3         3.272526
3 2023-01-08   Kamrup     10         9.960131
4 2023-01-09   Kamrup      1         1.409281
