<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/glaucomaPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import re

# Load the dataset
data_path = '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/glaucoma_dataset.csv'
glaucoma_data = pd.read_csv(data_path)

# Columns to drop
columns_to_drop = ['Patient ID', 'Visual Acuity Measurements', 'Medical History', 'Medication Usage',
                   'Cataract Status', 'Angle Closure Status', 'Glaucoma Type']
glaucoma_data.drop(columns=columns_to_drop, inplace=True)

# Handle Missing Values
imputer = SimpleImputer(strategy='most_frequent')
glaucoma_data = pd.DataFrame(imputer.fit_transform(glaucoma_data), columns=glaucoma_data.columns)

# Feature Engineering for OCT and Visual Field Test Results
glaucoma_data['RNFL Thickness'] = glaucoma_data['Optical Coherence Tomography (OCT) Results'].apply(
    lambda x: float(re.search(r'RNFL Thickness: (\d+\.\d+)', x).group(1)) if re.search(r'RNFL Thickness: (\d+\.\d+)', x) else None
)
glaucoma_data['GCC Thickness'] = glaucoma_data['Optical Coherence Tomography (OCT) Results'].apply(
    lambda x: float(re.search(r'GCC Thickness: (\d+\.\d+)', x).group(1)) if re.search(r'GCC Thickness: (\d+\.\d+)', x) else None
)

# Simplified Text Processing for Visual Symptoms
def simple_preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split text into words (simple tokenization)
    words = text.split()
    # Return a space-separated string of words
    return ' '.join(words)

glaucoma_data['Visual Symptoms'] = glaucoma_data['Visual Symptoms'].apply(simple_preprocess_text)

# Encode Categorical Data
encoder = LabelEncoder()
for col in ['Gender', 'Family History']:
    glaucoma_data[col] = encoder.fit_transform(glaucoma_data[col])

# Encode the target variable 'Diagnosis'
glaucoma_data['Diagnosis'] = encoder.fit_transform(glaucoma_data['Diagnosis'])

# Normalize/Scale Numerical Data
scaler = StandardScaler()
numerical_cols = ['Age', 'Intraocular Pressure (IOP)', 'Cup-to-Disc Ratio (CDR)', 'Pachymetry', 'RNFL Thickness', 'GCC Thickness']
glaucoma_data[numerical_cols] = scaler.fit_transform(glaucoma_data[numerical_cols].fillna(glaucoma_data[numerical_cols].mean()))

# Save Preprocessed Data
output_path = '/content/drive/MyDrive/DSGP PROJECT 29/FINAL MODEL/preprocessed_glaucoma_dataset.csv'
glaucoma_data.to_csv(output_path, index=False)

# Output the path of the saved file
output_path

#print the head of the preprocessed dataset
glaucoma_data.head()


  glaucoma_data[numerical_cols] = scaler.fit_transform(glaucoma_data[numerical_cols].fillna(glaucoma_data[numerical_cols].mean()))


Unnamed: 0,Age,Gender,Intraocular Pressure (IOP),Cup-to-Disc Ratio (CDR),Family History,Visual Field Test Results,Optical Coherence Tomography (OCT) Results,Pachymetry,Visual Symptoms,Diagnosis,RNFL Thickness,GCC Thickness
0,0.716058,1,0.448238,-0.889954,0,"Sensitivity: 0.54, Specificity: 0.75","RNFL Thickness: 86.48 µm, GCC Thickness: 64.14...",-0.284554,tunnel vision eye pain nausea,1,-0.132907,0.375042
1,0.716058,0,0.202593,1.188779,0,"Sensitivity: 0.72, Specificity: 0.88","RNFL Thickness: 96.88 µm, GCC Thickness: 56.48...",0.105048,redness in the eye vision loss tunnel vision,1,1.316332,-1.392495
2,0.62139,0,1.410156,1.188779,0,"Sensitivity: 0.56, Specificity: 0.8","RNFL Thickness: 89.81 µm, GCC Thickness: 59.05...",0.827507,halos around lights vision loss redness in the...,1,0.331128,-0.79947
3,-1.461302,1,0.122242,0.426577,0,"Sensitivity: 0.6, Specificity: 0.93","RNFL Thickness: 87.25 µm, GCC Thickness: 63.98...",1.416408,nausea nausea halos around lights,1,-0.025608,0.338122
4,-1.55597,1,-0.375935,-1.721448,0,"Sensitivity: 0.82, Specificity: 0.9","RNFL Thickness: 82.61 µm, GCC Thickness: 66.01...",1.338211,eye pain eye pain tunnel vision,1,-0.672191,0.806542
