<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Glaucoma-Model/glaucoma_V05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Enhanced Glaucoma Detection Model


## Steps to Improve Model Accuracy
1. **Data Preprocessing**:
    - Handling class imbalance with SMOTE (requires the `imbalanced-learn` package, imported as `imblearn`).
    - Scaling features using `StandardScaler`.
2. **Model Training**:
    - Using XGBoost with hyperparameter tuning via `GridSearchCV`.
3. **Evaluation**:
    - Reporting metrics including accuracy, precision, recall, and F1-score.


In [1]:
# Attach Google Drive to the Colab runtime so files stored there can be read
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Load the glaucoma dataset from the mounted Drive and split it into the
# feature frame `X` and the target series `y`.
target_column = 'Diagnosis'  # Name of the label column in the CSV
try:
    data = pd.read_csv('/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/glaucoma_dataset.csv')
    print("Columns in dataset:", data.columns)

    # Fail early with a clear message if the label column is missing.
    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset.")

    # NOTE(review): `X` keeps every non-target column, including 'Patient ID'
    # (an identifier) and 'Glaucoma Type' (possibly derived from the
    # diagnosis) -- confirm both are really meant to be model features.
    X = data.drop(target_column, axis=1)
    y = data[target_column]
except Exception as e:
    print("Error loading dataset:", e)
    # Re-raise so execution stops here. Swallowing the error would leave `X`
    # and `y` undefined and make the next cell fail with an unrelated
    # NameError.
    raise


Columns in dataset: Index(['Patient ID', 'Age', 'Gender', 'Visual Acuity Measurements',
       'Intraocular Pressure (IOP)', 'Cup-to-Disc Ratio (CDR)',
       'Family History', 'Medical History', 'Medication Usage',
       'Visual Field Test Results',
       'Optical Coherence Tomography (OCT) Results', 'Pachymetry',
       'Cataract Status', 'Angle Closure Status', 'Visual Symptoms',
       'Diagnosis', 'Glaucoma Type'],
      dtype='object')


In [3]:

from sklearn.preprocessing import LabelEncoder
import numpy as np  # Ensure numpy is imported

# Split first so every transformer fitted below only ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on independent copies: the column assignments below would otherwise
# write into frames derived from `X` and can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Columns holding strings (object dtype) must be turned into integer codes.
categorical_columns = X_train.select_dtypes(include=['object']).columns

# One fitted encoder per categorical column, kept for later reuse.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # Categories that never occurred in the training split are replaced by the
    # '<UNK>' placeholder. `isin` does the membership check in one vectorised
    # pass, and it also treats a missing value as known when the training
    # column contained one, so train and test encode missing values alike.
    is_known = X_test[col].isin(le.classes_)
    X_test[col] = X_test[col].where(is_known, '<UNK>')

    # Register the placeholder as an extra class (code == number of training
    # classes) so that `transform` accepts it.
    le.classes_ = np.append(le.classes_, '<UNK>')
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

# Balance the classes on the training split only, after encoding.
# NOTE(review): SMOTE interpolates between neighbouring samples, so the integer
# category codes of synthetic rows can become fractional; consider SMOTENC if
# the categorical features should stay categorical.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardise features: fit on the resampled training data, then apply the
# same scaling to the untouched test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)


In [4]:
!pip install --upgrade xgboost scikit-learn
# Ensure you have the compatible versions
!pip install scikit-learn==1.5.3 xgboost==1.7.6

# Encode the target labels in y_train_resampled and y_test
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_resampled = label_encoder.fit_transform(y_train_resampled)
y_test = label_encoder.transform(y_test)  # Ensure y_test uses the same encoding

# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'random_state': [42]
}

# Grid search for best parameters
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Best model
best_xgb = grid_search.best_estimator_
label_encoder = LabelEncoder()
y_train_resampled = label_encoder.fit_transform(y_train_resampled)
y_test = label_encoder.transform(y_test)  # Ensure y_test uses the same encoding

# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'random_state': [42]
}

# Grid search for best parameters
xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Best model
best_xgb = grid_search.best_estimator_


[31mERROR: Could not find a version that satisfies the requirement scikit-learn==1.5.3 (from versions: 0.9, 0.10, 0.11, 0.12, 0.12.1, 0.13, 0.13.1, 0.14, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.16.0, 0.16.1, 0.17, 0.17.1, 0.18, 0.18.1, 0.18.2, 0.19.0, 0.19.1, 0.19.2, 0.20.0, 0.20.1, 0.20.2, 0.20.3, 0.20.4, 0.21.1, 0.21.2, 0.21.3, 0.22, 0.22.1, 0.22.2.post1, 0.23.0, 0.23.1, 0.23.2, 0.24.0, 0.24.1, 0.24.2, 1.0, 1.0.1, 1.0.2, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.2.0rc1, 1.2.0, 1.2.1, 1.2.2, 1.3.0rc1, 1.3.0, 1.3.1, 1.3.2, 1.4.0rc1, 1.4.0, 1.4.1.post1, 1.4.2, 1.5.0rc1, 1.5.0, 1.5.1, 1.5.2, 1.6.0rc1, 1.6.0, 1.6.1)[0m[31m
[0m[31mERROR: No matching distribution found for scikit-learn==1.5.3[0m[31m
[0m

AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [5]:

# Score the tuned model on the held-out test split.
y_pred = best_xgb.predict(X_test_scaled)

# Report the winning hyperparameters, then per-class metrics and accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)
test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {test_accuracy_pct:.2f}%")


NameError: name 'best_xgb' is not defined