<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Glaucoma-Model/glaucoma_V02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Glaucoma Detection Using XGBoost

In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:

# 1. Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

## 2. Loading and Exploring the Dataset

In [None]:

# Load the glaucoma dataset from the mounted Google Drive folder.
data = pd.read_csv('/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/glaucoma_dataset.csv')

# Preview the first rows to sanity-check the parsed columns.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()


   Patient ID  Age  Gender Visual Acuity Measurements  \
0       62431   69    Male                 LogMAR 0.1   
1       68125   69  Female                 LogMAR 0.1   
2       63329   67  Female                      20/40   
3       47174   23    Male                 LogMAR 0.0   
4       67361   21    Male                 LogMAR 0.1   

   Intraocular Pressure (IOP)  Cup-to-Disc Ratio (CDR) Family History  \
0                       19.46                     0.42             No   
1                       18.39                     0.72             No   
2                       23.65                     0.72             No   
3                       18.04                     0.61             No   
4                       15.87                     0.30             No   

  Medical History                                   Medication Usage  \
0        Diabetes  Amoxicillin, Lisinopril, Omeprazole, Atorvasta...   
1    Hypertension  Lisinopril, Amoxicillin, Atorvastatin, Ibuprof...   
2 

## 3. Data Preprocessing

In [None]:

# Handle missing values.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the column selection: that chained in-place form emits a FutureWarning and no
# longer updates `data` once pandas treats the selection as a copy.
data['Medication Usage'] = data['Medication Usage'].fillna('None')

# Encode categorical features as integer codes (a fresh encoder per column).
categorical_features = ['Gender', 'Family History', 'Medical History', 'Cataract Status', 'Angle Closure Status']
for feature in categorical_features:
    data[feature] = LabelEncoder().fit_transform(data[feature])

# Process textual features with TF-IDF.
# max_features is an upper bound on the vocabulary size, not a fixed width.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so test-set text influences the vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=500)
data['Visual Symptoms'] = data['Visual Symptoms'].fillna("")

# Transform 'Visual Symptoms' into a dense (n_rows, n_terms) matrix.
text_features = tfidf.fit_transform(data['Visual Symptoms']).toarray()

# Debugging: column count equals the learned vocabulary size (at most 500).
print(f"TF-IDF Features Shape: {text_features.shape}")

# Create a DataFrame from text_features, aligned to the rows of `data`.
text_df = pd.DataFrame(
    text_features,
    columns=[f"symptom_{i}" for i in range(text_features.shape[1])],
    index=data.index
)

# Drop symptom_* columns left over from a previous run of this cell before
# concatenating; otherwise re-running the cell duplicates those columns and the
# downstream feature selection no longer matches the real column set.
stale_symptom_cols = [col for col in data.columns if col.startswith('symptom_')]
data = pd.concat([data.drop(columns=stale_symptom_cols), text_df], axis=1)

# Debugging: Check the resulting dataset shape
print(f"Final Data Shape: {data.shape}")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Medication Usage'].fillna('None', inplace=True)


TF-IDF Features Shape: (10000, 14)
Final Data Shape: (10000, 31)


## 4. Feature Selection and Target Encoding

In [None]:
# Count the TF-IDF columns present in `data` so the feature list follows the
# size of the fitted vocabulary instead of a hard-coded number.
tfidf_feature_count = sum(1 for col in data.columns if col.startswith('symptom_'))

# Select features and target: five numeric/encoded columns plus every
# symptom_<i> TF-IDF column.
symptom_columns = [f"symptom_{i}" for i in range(tfidf_feature_count)]
features = ['Age', 'Gender', 'Intraocular Pressure (IOP)', 'Cup-to-Disc Ratio (CDR)', 'Pachymetry'] + \
           symptom_columns
X = data[features]
y = data['Diagnosis']

# Encode target variable.
# The fitted encoder is kept (rather than discarded) so integer predictions can
# be mapped back to the original diagnosis labels with
# target_encoder.inverse_transform(...) or inspected via target_encoder.classes_.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)


## 5. Splitting the Data

In [None]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## 6. Handling Class Imbalance

In [None]:

# Oversample with SMOTE on the training partition only; the test partition is
# left as drawn by the split.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled


## 7. Normalizing the Features

In [None]:

# Standardize features: learn the scaling statistics from the training data
# only, then apply that same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


## 8. Training the Model

In [None]:

# Train an XGBoost classifier; hyperparameters are gathered in one dict so they
# can be read and tuned in a single place.
xgb_params = {
    "random_state": 42,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)


## 9. Evaluating the Model

In [None]:

# Score the held-out partition and report per-class metrics plus overall accuracy.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")


Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1002
           1       0.50      0.52      0.51       998

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.50      0.50      0.50      2000

Accuracy: 49.75%


## 10. Saving the Model

In [None]:

import joblib

# Persist the fitted preprocessing objects next to the model. The classifier
# was trained on TF-IDF columns produced by `tfidf` and on features standardized
# by `scaler`; without these exact fitted objects, new inputs cannot be put into
# the form the saved model expects.
joblib.dump(tfidf, 'glaucoma_tfidf.pkl')
joblib.dump(scaler, 'glaucoma_scaler.pkl')

# Save the model (kept as the last expression so the cell still displays
# ['glaucoma_model.pkl']).
joblib.dump(model, 'glaucoma_model.pkl')


['glaucoma_model.pkl']