In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

file_path = 'delhi_disease_data_10000.csv'
df = pd.read_csv(file_path)

def assign_disease(row):
    # Primary conditions for specific diseases
    if row['Rainfall_mm'] > 100 and row['FeverCases'] > 25:
        return 'Dengue'
    elif row['Humidity_pct'] > 85 and row['ToiletUsage_pct'] < 75:
        return 'Typhoid'
    elif row['WaterIndex'] > 0.6 and row['NDVI'] < 0.4:
        return 'Malaria'
    elif row['FeverCases'] < 10 and row['Absenteeism_pct'] < 5:
        return 'Healthy'
    
    # Secondary conditions - more relaxed criteria to avoid "Other"
    elif row['Rainfall_mm'] > 50 and row['FeverCases'] > 15:
        return 'Dengue'
    elif row['Humidity_pct'] > 70 and row['ToiletUsage_pct'] < 80:
        return 'Typhoid'
    elif row['WaterIndex'] > 0.4 and row['NDVI'] < 0.5:
        return 'Malaria'
    elif row['FeverCases'] < 15 and row['Absenteeism_pct'] < 10:
        return 'Healthy'
    
    # Tertiary conditions - even more relaxed to capture remaining cases
    elif row['FeverCases'] > 20 or row['Rainfall_mm'] > 75:
        return 'Dengue'
    elif row['Humidity_pct'] > 80 or row['ToiletUsage_pct'] < 70:
        return 'Typhoid'
    elif row['WaterIndex'] > 0.5 or row['NDVI'] < 0.6:
        return 'Malaria'
    else:
        return 'Healthy'  # Default to Healthy instead of Other

df['Disease'] = df.apply(assign_disease, axis=1)

df['Disease'].value_counts()


Disease
Dengue     5131
Malaria    1780
Healthy    1748
Typhoid    1341
Name: count, dtype: int64

In [10]:


# Encode categorical features
df_encoded = df.copy()
label_encoders = {}
for col in ['Week', 'Location']:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Encode target
le_disease = LabelEncoder()
df_encoded['Disease'] = le_disease.fit_transform(df_encoded['Disease'])

# Feature matrix and target vector
X = df_encoded.drop(['Disease', 'DiseaseOutbreak'], axis=1)
y = df_encoded['Disease']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [11]:
# XGBoost models
xgb_model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])

# Train and predict with XGBoost
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
report_xgb = classification_report(y_test, y_pred_xgb, target_names=le_disease.classes_, output_dict=True)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

report_xgb

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Dengue': {'precision': 0.9990403071017274,
  'recall': 1.0,
  'f1-score': 0.99951992318771,
  'support': 1041.0},
 'Healthy': {'precision': 1.0,
  'recall': 0.9937304075235109,
  'f1-score': 0.9968553459119497,
  'support': 319.0},
 'Malaria': {'precision': 0.9916434540389972,
  'recall': 0.9971988795518207,
  'f1-score': 0.994413407821229,
  'support': 357.0},
 'Typhoid': {'precision': 0.9964539007092199,
  'recall': 0.9929328621908127,
  'f1-score': 0.9946902654867257,
  'support': 283.0},
 'accuracy': 0.9975,
 'macro avg': {'precision': 0.9967844154624862,
  'recall': 0.9959655373165361,
  'f1-score': 0.9963697356019036,
  'support': 2000.0},
 'weighted avg': {'precision': 0.9975070633427647,
  'recall': 0.9975,
  'f1-score': 0.9975000135546201,
  'support': 2000.0}}

In [12]:
import joblib
model_path = "xgb_disease_prediction_model.pkl"
joblib.dump(xgb_model,model_path)

['xgb_disease_prediction_model.pkl']