<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/glaucoma_preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = "/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/glaucoma_dataset.csv"  # Update with actual path if needed
df = pd.read_csv(file_path)

# Step 1: Data Cleaning
df.drop_duplicates(inplace=True)
df["Medication Usage"].fillna("None", inplace=True)
df.drop(columns=["Patient ID"], inplace=True)

# Step 2: Encoding Categorical Variables
categorical_columns = ["Gender", "Family History", "Cataract Status", "Angle Closure Status", "Diagnosis"]
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 3: Extracting Features
df["Medical History Count"] = df["Medical History"].apply(lambda x: len(x.split(",")))
df["Medication Usage Count"] = df["Medication Usage"].apply(lambda x: len(x.split(",")))

df["VFT Sensitivity"] = df["Visual Field Test Results"].str.extract(r'Sensitivity:\s*([\d.]+)').astype(float)
df["VFT Specificity"] = df["Visual Field Test Results"].str.extract(r'Specificity:\s*([\d.]+)').astype(float)

df["RNFL Thickness"] = df["Optical Coherence Tomography (OCT) Results"].str.extract(r'RNFL Thickness:\s*([\d.]+)').astype(float)
df["GCC Thickness"] = df["Optical Coherence Tomography (OCT) Results"].str.extract(r'GCC Thickness:\s*([\d.]+)').astype(float)

df.drop(columns=["Medical History", "Medication Usage", "Visual Field Test Results", "Optical Coherence Tomography (OCT) Results"], inplace=True)

# Step 4: Transform "Visual Acuity Measurements"
def convert_visual_acuity(value):
    if "LogMAR" in value:
        return float(value.split()[-1])
    elif "/" in value:
        num, denom = value.split("/")
        return float(num) / float(denom)
    return None

df["Visual Acuity"] = df["Visual Acuity Measurements"].apply(convert_visual_acuity)
df.drop(columns=["Visual Acuity Measurements"], inplace=True)

# Convert "Visual Symptoms" to a Count
df["Visual Symptoms Count"] = df["Visual Symptoms"].apply(lambda x: len(x.split(",")))
df.drop(columns=["Visual Symptoms"], inplace=True)

# Step 5: Handle Imbalanced Data Using SMOTE
X = df.drop(columns=["Diagnosis"])  # Features
y = df["Diagnosis"]  # Target variable

smote = SMOTE(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced["Diagnosis"] = y_resampled

# Print class distribution after SMOTE
print("Balanced dataset count:\n", df_balanced["Diagnosis"].value_counts())

# Save the processed dataset
df_balanced.to_csv("processed_glaucoma_dataset.csv", index=False)
print("Preprocessing complete. Processed dataset saved as 'processed_glaucoma_dataset.csv'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Medication Usage"].fillna("None", inplace=True)


AttributeError: 'float' object has no attribute 'split'