In [1]:
# Disease Prediction Project
## Notebook 1: Data Preparation


In [2]:
# -------------------------------
# 1. Import libraries
# -------------------------------
import pandas as pd
import numpy as np
import json
import os
from sklearn.preprocessing import LabelEncoder

In [3]:
# Paths
# Every location is derived from a single project root. The root defaults to the
# original local checkout, but can be overridden with the DISEASE_PREDICTION_ROOT
# environment variable so the notebook runs on other machines without editing
# this cell.
PROJECT_ROOT = os.environ.get("DISEASE_PREDICTION_ROOT", r"E:\Disease_Prediction")

RAW_TRAIN_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "Training.csv")
RAW_TEST_PATH  = os.path.join(PROJECT_ROOT, "data", "raw", "Testing.csv")
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed")

# Make sure the output folder exists before any cell tries to write into it.
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [4]:
# -------------------------------
# 2. Load raw data
# -------------------------------
# Read the two raw CSV splits (training first, then testing) without any cleaning.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (RAW_TRAIN_PATH, RAW_TEST_PATH)
)

# Report the (rows, columns) of each split, then preview the first training rows.
print("✅ Raw Data Loaded")
print("Training Shape:", train_df.shape)
print("Testing Shape:", test_df.shape)
display(train_df.head())

✅ Raw Data Loaded
Training Shape: (4920, 134)
Testing Shape: (42, 133)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


In [5]:
# 3. Remove unwanted columns
# -------------------------------
# Keep only the columns whose header does NOT start with "Unnamed"
# (e.g. the empty "Unnamed: 133" column visible in the training preview).
train_df = train_df.loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
test_df = test_df.loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]

print("✅ Removed Unnamed columns")


✅ Removed Unnamed columns


In [6]:
# 4. Check Missing Values
# -------------------------------
# Count every missing cell across the whole frame (all rows and columns).
print("Missing values in Train:", train_df.isna().to_numpy().sum())
print("Missing values in Test:", test_df.isna().to_numpy().sum())


Missing values in Train: 0
Missing values in Test: 0


In [7]:
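# 5. Encode target (prognosis) -- currently DISABLED
# -------------------------------
# The label-encoding code below is commented out, so this cell does nothing:
# 'prognosis' keeps its original disease-name strings in the processed CSVs.
# NOTE(review): confirm encoding is handled downstream; otherwise re-enable or remove this cell.
#le = LabelEncoder()
#train_df['prognosis'] = le.fit_transform(train_df['prognosis'])
#test_df['prognosis']  = le.transform(test_df['prognosis'])

#print("✅ Encoded prognosis column")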

In [8]:
# 6. Save processed train & test
# -------------------------------
# Resolve the output locations inside the processed-data folder first ...
train_clean_csv = os.path.join(PROCESSED_PATH, "train_clean.csv")
test_clean_csv = os.path.join(PROCESSED_PATH, "test_clean.csv")

# ... then write both frames without the pandas row index.
train_df.to_csv(train_clean_csv, index=False)
test_df.to_csv(test_clean_csv, index=False)

print("✅ Processed train & test saved")


✅ Processed train & test saved


In [9]:
# 7. Save feature columns
# -------------------------------
# Every column except the 'prognosis' target is a feature; column order is preserved.
feature_cols = train_df.columns[train_df.columns != 'prognosis'].tolist()

# Persist the ordered feature names as a JSON array.
feature_columns_json = os.path.join(PROCESSED_PATH, "feature_columns.json")
with open(feature_columns_json, "w") as out_file:
    out_file.write(json.dumps(feature_cols))

print("✅ Feature columns saved")


✅ Feature columns saved


In [11]:
# 8. Save label mapping (int → disease name)
# -------------------------------
# The mapping is written to the PROCESSED_PATH configured in the paths cell, so it
# sits next to train_clean.csv, test_clean.csv and feature_columns.json.
# Do not rebind PROCESSED_PATH here: pointing it at another folder would scatter
# the artifacts and silently redirect any later re-run of the save cells.
# json and os come from the import cell at the top of the notebook.
os.makedirs(PROCESSED_PATH, exist_ok=True)

# Unique disease names from train_df, in order of first appearance
unique_diseases = train_df['prognosis'].unique()

# Create mapping: number → disease name.
# NOTE(review): ids follow first-appearance order, not the sorted order that
# sklearn's LabelEncoder assigns -- confirm downstream code decodes predictions
# with this file rather than with a separately fitted encoder.
mapping = dict(enumerate(unique_diseases))

# Save mapping as JSON (json.dump writes the integer keys as strings)
with open(os.path.join(PROCESSED_PATH, "label_mapping.json"), "w") as f:
    json.dump(mapping, f, indent=4)

print("✅ Label mapping saved")


✅ Label mapping saved


In [13]:
# 9. Quick check
# Distinct disease names in the training target, in order of first appearance
diseases = pd.unique(train_df['prognosis'])

# Show how many distinct labels there are, followed by the labels themselves.
print("Total diseases:", len(diseases))
print("Disease list:", [*diseases])


Total diseases: 41
Disease list: ['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo']
