<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/NLP/unified_eye_disease_detection_corrected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unified Eye Disease Detection Model
This notebook implements a machine learning pipeline that predicts one of six eye diseases from per-disease symptom datasets.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# paths used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Google Drive locations of the six per-disease CSV files, keyed by short name.
datasets = {
    'diabetic_retinopathy': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/augmented_diabetic_retinopathy_dataset.csv',
    'cnv': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/augmented_cnv_dataset.csv',
    'dme': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/augmented_dme_dataset.csv',
    'drusen': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/augmented_drusen_dataset.csv',
    'glaucoma': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/glaucoma_dataset.csv',
    'cataract': '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/cataract_patient_data.csv'
}

# Class name written into the 'Disease' column for every row of each dataset.
labels = {
    'diabetic_retinopathy': 'Diabetic Retinopathy',
    'cnv': 'CNV',
    'dme': 'DME',
    'drusen': 'Drusen',
    'glaucoma': 'Glaucoma',
    'cataract': 'Cataract'
}

# Read every CSV first, so a missing or unreadable file fails before any
# frame has been modified.
dataframes = {}
for name, path in datasets.items():
    dataframes[name] = pd.read_csv(path)

# Tag each frame with its disease label; this column becomes the target.
for name in dataframes:
    df = dataframes[name]
    df['Disease'] = labels[name]

# Stack all six frames into one table with a fresh 0..n-1 row index.
combined_df = pd.concat(list(dataframes.values()), ignore_index=True)

# Normalise column types and fill missing values, one column at a time.
#
# Every result is assigned back to combined_df[column]. Calling
# fillna(..., inplace=True) on a combined_df[column] selection is chained
# assignment: pandas emits a FutureWarning for it and, from pandas 3.0, the
# call no longer modifies combined_df at all.
#
# NOTE(review): the recorded run reports 1.00 precision/recall for every
# class. If the source CSVs do not share the same columns, the values filled
# in here may reveal which file a row came from -- verify column overlap
# before trusting that score.
for column in combined_df.columns:
    if combined_df[column].dtype == 'object':  # String-like columns
        # Fill BEFORE casting: astype(str) turns NaN into the literal string
        # 'nan', after which a later fillna('Unknown') has nothing to match.
        combined_df[column] = combined_df[column].fillna('Unknown').astype(str)
    else:  # Numeric columns
        # Coerce unparseable entries to NaN, then impute with the column median.
        numeric_values = pd.to_numeric(combined_df[column], errors='coerce')
        combined_df[column] = numeric_values.fillna(numeric_values.median())

# Object columns are already NaN-free at this point (filled with 'Unknown'
# above), so no separate missing-value pass over them is needed.

# Integer-encode every string feature column, keeping each fitted encoder
# under its column name so the mapping can be reused or inverted later.
label_encoders = {}
for column in combined_df.select_dtypes(include=['object']).columns:
    if column == 'Disease':
        # The target column gets its own encoder further down.
        continue
    le = LabelEncoder()
    label_encoders[column] = le
    combined_df[column] = le.fit_transform(combined_df[column])

# Feature matrix: every column except the target.
X = combined_df.drop(columns=['Disease'])

# Target vector: disease names mapped to integer class ids.
# target_encoder.classes_ holds the original names for reporting.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(combined_df['Disease'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features using statistics learned from the training rows only,
# then apply the same transform to the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Baseline Random Forest (default hyperparameters, fixed seed) ---
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(
    classification_report(
        y_test,
        y_pred_rf,
        target_names=target_encoder.classes_,
    )
)

# --- Hyperparameter search for Random Forest ---
# 3 * 4 * 3 * 3 * 3 = 324 candidate settings, each evaluated with 5-fold CV.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,  # use every available core
)
grid_search_rf.fit(X_train, y_train)

print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Cross-Validation Accuracy for Random Forest:", grid_search_rf.best_score_)

# Score the best estimator found by the search on the held-out split.
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
print("Tuned Random Forest Classification Report:")
print(
    classification_report(
        y_test,
        y_pred_best_rf,
        target_names=target_encoder.classes_,
    )
)

# --- XGBoost with grid-searched hyperparameters ---
# NOTE(review): the recorded run of this notebook ended with
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'".
# That is presumably raised in this section by an xgboost release that
# predates the estimator-tags API of the installed scikit-learn -- confirm,
# then upgrade xgboost (or pin scikit-learn) in the runtime.
xgb_model = XGBClassifier(random_state=42)

# 2 * 3 * 3 * 2 = 36 candidate settings, each evaluated with 5-fold CV.
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,  # use every available core
)
grid_search_xgb.fit(X_train, y_train)

print("Best Parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best Cross-Validation Accuracy for XGBoost:", grid_search_xgb.best_score_)

# Score the best estimator found by the search on the held-out split.
best_xgb_model = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb_model.predict(X_test)
print("Tuned XGBoost Classification Report:")
print(
    classification_report(
        y_test,
        y_pred_best_xgb,
        target_names=target_encoder.classes_,
    )
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[column].fillna(combined_df[column].median(), inplace=True)  # Fill missing with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[column].fillna(combined_df[column].median(), inplace=True)  # Fill missing with median
The behavior will change in pandas

Random Forest Classification Report:
                      precision    recall  f1-score   support

                 CNV       1.00      1.00      1.00      1959
            Cataract       1.00      1.00      1.00      2024
                 DME       1.00      1.00      1.00      1946
Diabetic Retinopathy       1.00      1.00      1.00      2004
              Drusen       1.00      1.00      1.00      2068
            Glaucoma       1.00      1.00      1.00      1999

            accuracy                           1.00     12000
           macro avg       1.00      1.00      1.00     12000
        weighted avg       1.00      1.00      1.00     12000

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters for Random Forest: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy for Random Forest: 1.0
Tuned Random Forest Classification Report:
                      precisio



AttributeError: 'super' object has no attribute '__sklearn_tags__'