<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/FINAL-MODEL-NLP/unified_eye_disease_detection_corrected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unified Eye Disease Detection Model
This notebook implements a machine learning pipeline to predict one of six eye diseases based on symptom datasets.

In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Load, Merge, Clean, and Save the Dataset.

In [2]:
import pandas as pd

# Define dataset paths and labels.
# All pre-processed CSVs sit in one Drive folder, so the folder is written once
# and each file name is joined onto it. The resulting `datasets` dict has the
# same keys (full paths), values (disease names) and order as before.
DATASET_DIR = "/content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS"
dataset_files = {
    "preprocessed_glaucoma_dataset.csv": "Glaucoma",
    "Preprocessed_Cataract_Dataset.csv": "Cataract",
    "Preprocessed_Diabetic_Retinopathy_Dataset.csv": "Diabetic Retinopathy",
    "Preprocessed_CNV_Detection_Dataset.csv": "CNV",
    "Preprocessed_DME_Dataset.csv": "DME",
    "Preprocessed_Drusen_Dataset.csv": "Drusen",
}
datasets = {f"{DATASET_DIR}/{file_name}": disease for file_name, disease in dataset_files.items()}


# Step 1: Load datasets into a list
dataframes = []
failed_datasets = []  # disease names whose CSV could not be read
for file, disease in datasets.items():
    try:
        df = pd.read_csv(file)
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files.
        print(f"Error loading {file}: {e}")
        failed_datasets.append(disease)
        continue
    print(f"Loaded dataset: {file} | Shape: {df.shape}")
    df["Disease_Label"] = disease  # Add disease label column
    dataframes.append(df)

# A partially loaded dataset would silently lack whole disease classes,
# so make any failures impossible to miss.
if failed_datasets:
    print(f"\nWARNING: {len(failed_datasets)} dataset(s) failed to load; "
          f"these disease classes will be missing: {failed_datasets}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with an actionable message instead.
if not dataframes:
    raise ValueError(
        "No datasets could be loaded; check that Google Drive is mounted and "
        f"that the CSV files exist under: {DATASET_DIR}"
    )

# Step 2: Merge all datasets while keeping all columns
# Columns that exist in only some datasets are kept and filled with NaN elsewhere.
merged_df = pd.concat(dataframes, ignore_index=True, sort=False)
print(f"\nMerged dataset shape (before cleaning): {merged_df.shape}")

# -----------------------------------------
# Step 3: Handling missing values
# -----------------------------------------

# Check for missing values before handling
print("\nMissing values before handling:\n", merged_df.isnull().sum())

# 🔹 Essential Disease-Specific Features (Ensure They Are Kept)
# Later in this cell these columns are median-filled and are exempt from the
# ">50% missing" column drop.
essential_features = [
    "Intraocular Pressure (IOP)", "Cup-to-Disc Ratio (CDR)", "Pachymetry",
    "Retinal Thickness", 'History of Diabetes',
    "Microaneurysms Count", "Hemorrhages Count", "BMI", "Blood Pressure",
    "Cholesterol Levels"
]

# 🔹 Disease-Specific Features - Fill missing values per disease
# Maps a disease label to the columns that are imputed with that disease's own mean.
disease_specific_features = {
    "Glaucoma": ["Cup-to-Disc Ratio (CDR)", "Pachymetry"],
    "Cataract": ["History of Diabetes"],
    "Diabetic Retinopathy": ["Retinal Thickness", "Cotton Wool Spots Count", "LDL Cholesterol", "Microaneurysms Count", "Hemorrhages Count"],
    "Drusen": ["BMI", "Blood Pressure", "Cholesterol Levels"]
}

for disease, features in disease_specific_features.items():
    # Row mask for this disease; Disease_Label is not modified in this loop,
    # so the mask can be computed once per disease.
    disease_rows = merged_df["Disease_Label"] == disease
    for col in features:
        if col not in merged_df.columns:
            continue

        disease_mean = merged_df.loc[disease_rows, col].mean()

        if pd.isna(disease_mean):
            # No usable values for this disease: fall back to the global median
            # for the whole column. Assign the result back instead of calling
            # fillna(inplace=True) on the column selection - the in-place form
            # operates on a temporary object (FutureWarning on pandas 2.x,
            # no effect at all under Copy-on-Write).
            merged_df[col] = merged_df[col].fillna(merged_df[col].median())
        else:
            # Fill only this disease's rows with this disease's mean.
            merged_df.loc[disease_rows, col] = (
                merged_df.loc[disease_rows, col].fillna(disease_mean)
            )

# 🔹 Common Features - Fill missing values per disease group
common_features = {
    "Glaucoma_DME": ["Intraocular Pressure (IOP)"],
    "Cataract_DiabeticRetinopathy_Drusen_CNV": ["Smoking Status"]
}

# Numeric shared feature: fill each gap with the mean of the row's own disease.
# The built-in "mean" aggregation yields the same per-row values as a
# per-group lambda, without calling Python code once per group.
for col in common_features["Glaucoma_DME"]:
    if col in merged_df.columns:
        group_means = merged_df.groupby("Disease_Label")[col].transform("mean")
        merged_df[col] = merged_df[col].fillna(group_means)

# Categorical shared feature: fill with the most frequent value overall.
for col in common_features["Cataract_DiabeticRetinopathy_Drusen_CNV"]:
    if col in merged_df.columns:
        modes = merged_df[col].mode()
        # mode() is empty when the column holds no non-null value; indexing it
        # would raise, so leave such a column untouched.
        if not modes.empty:
            merged_df[col] = merged_df[col].fillna(modes.iloc[0])

# 🔹 Handle Missing Values for Other Disease Groups Using Median
# Anything still missing in an essential feature gets the column-wide median.
for col in essential_features:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].fillna(merged_df[col].median())

# 🔹 One-Hot Encoded Features - a missing entry means the category does not
# apply to that row, so it is recorded as 0.
one_hot_encoded_features = [
    # Cataract-related indicators
    "Lens Opacity_mild", "Lens Opacity_moderate", "Lens Opacity_severe",
    "Glare Sensitivity_mild", "Glare Sensitivity_none", "Glare Sensitivity_severe",
    "UV Exposure_high", "UV Exposure_low", "UV Exposure_medium",
    # Lens status indicators
    "Lens Status_Cortical cataract", "Lens Status_Normal", "Lens Status_Nuclear cataract", "Lens Status_Posterior subcapsular cataract",
    # Imaging result indicators
    "Optical Coherence Tomography (OCT) Results_Early CNV", "Optical Coherence Tomography (OCT) Results_Normal", "Optical Coherence Tomography (OCT) Results_Scarred/End-stage CNV",
    "Fluorescein Angiography Results_Early Neovascularization", "Fluorescein Angiography Results_No Neovascularization"
]

# Only touch the indicator columns that actually made it into the merged frame.
present_one_hot = [name for name in one_hot_encoded_features if name in merged_df.columns]
for name in present_one_hot:
    merged_df[name] = merged_df[name].fillna(0)

# 🔹 Visual Symptoms and Visual Acuity Test Results - located by column-name
# fragment and zero-filled the same way (category not applicable).
symptom_columns = [name for name in merged_df.columns if "Visual Symptoms_" in name]
acuity_columns = [name for name in merged_df.columns if "Visual Acuity Test Results_" in name]

for zero_fill_group in (symptom_columns, acuity_columns):
    merged_df[zero_fill_group] = merged_df[zero_fill_group].fillna(0)

# 🔹 Remove columns that are still more than half empty, sparing the essential features.
missing_percent = merged_df.isnull().sum() / len(merged_df) * 100
sparse_columns = missing_percent.index[missing_percent > 50]
cols_to_drop = [name for name in sparse_columns if name not in essential_features]

# Rebind to the pruned frame rather than mutating in place.
merged_df = merged_df.drop(columns=cols_to_drop)
print(f"Dropped columns with >50% missing values (excluding essential features): {list(cols_to_drop)}")

# 🔹 Total count of cells that are still missing after all the fills above.
remaining_missing = merged_df.isnull().sum().sum()
print("\nMissing values after handling:\n", remaining_missing)

# -----------------------------------------
# Step 4: Remove duplicate rows
# -----------------------------------------
merged_df = merged_df.drop_duplicates()
print(f"Dataset shape after removing duplicates: {merged_df.shape}")

# -----------------------------------------
# Step 5: Save the cleaned dataset
# -----------------------------------------
# NOTE(review): the output goes to "TEST 29" while the inputs are read from
# "PROJECT 29" - confirm this is intended. The target folder must already
# exist; to_csv does not create missing directories.
output_path = "/content/drive/MyDrive/TEST 29/merged_dataset.csv"
merged_df.to_csv(output_path, index=False)
print(f"Final dataset saved to: {output_path}")

# -----------------------------------------
# Additional Checks
# -----------------------------------------
print("\nFinal dataset shape:", merged_df.shape)

# Print value counts for diagnosis and disease labels.
# "Diagnosis" is not an essential feature, so the >50%-missing drop may have
# removed it (or a source file may lack it); guard the lookup so a missing
# column does not abort the rest of the cell with a KeyError.
if "Diagnosis" in merged_df.columns:
    print("\nDiagnosis Value Counts:")
    print(merged_df["Diagnosis"].value_counts())
else:
    print("\nDiagnosis column not present in the merged dataset; skipping its value counts.")

print("\nDisease Label Value Counts:")
print(merged_df["Disease_Label"].value_counts())


# 🔹 Define disease encoding (disease name -> integer class id)
disease_mapping = {
    "Glaucoma": 0,
    "Cataract": 1,
    "Diabetic Retinopathy": 2,
    "CNV": 3,
    "DME": 4,
    "Drusen": 5
}

# 🔹 Apply encoding to Disease_Label column
# Series.map turns every label missing from the mapping into NaN without any
# error, so verify the result before overwriting the column.
encoded_labels = merged_df["Disease_Label"].map(disease_mapping)
unmapped_labels = merged_df.loc[encoded_labels.isna(), "Disease_Label"].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Disease_Label values without an encoding: {unmapped_labels}")
merged_df["Disease_Label"] = encoded_labels

# 🔹 Drop the specified Visual Symptoms features
features_to_drop = [
    "Visual Symptoms_vomiting", "Visual Symptoms_nausea", "Visual Symptoms_eye pain",
    "Visual Symptoms_vision loss", "Visual Symptoms_general vision blurriness",
    "Visual Symptoms_vision change", "Visual Symptoms_vision loss area",
    "Visual Symptoms_loss of central vision", "Visual Symptoms_temporary vision disturbances",
    "Visual Symptoms_colors appear faded"
]

# Drop columns if they exist in the dataframe, and report only the ones that
# were actually removed (the requested list may name columns that are absent).
dropped_features = [col for col in features_to_drop if col in merged_df.columns]
merged_df = merged_df.drop(columns=dropped_features)

print(f"Dropped features: {dropped_features}")


# 🔹 Save the new encoded dataset
# The target folder must already exist; to_csv does not create missing directories.
encoded_output_path = "/content/drive/MyDrive/TEST 29/encoded_merged_dataset.csv"
merged_df.to_csv(encoded_output_path, index=False)

print(f"Encoded dataset saved to: {encoded_output_path}")

# 🔹 Verify the encoding
print(merged_df[["Disease_Label"]].head(10))  # Print first 10 rows to check encoding

# Blank separator before the final summary (was the literal text "/n").
print("\n")
# Print final dataset shape
print("Final dataset shape:", merged_df.shape)


Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/preprocessed_glaucoma_dataset.csv | Shape: (10000, 13)
Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/Preprocessed_Cataract_Dataset.csv | Shape: (10000, 23)
Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/Preprocessed_Diabetic_Retinopathy_Dataset.csv | Shape: (10000, 14)
Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/Preprocessed_CNV_Detection_Dataset.csv | Shape: (10750, 20)
Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/Preprocessed_DME_Dataset.csv | Shape: (10000, 19)
Loaded dataset: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/PRE-PROCESSED DATASETS/Preprocessed_Drusen_Dataset.csv | Shape: (10000, 12)

Merged dataset shape (before cleaning): (60750, 74)

Missing values before handling:
 Age                                          0
Intraocular Pres