<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/glaucomaPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import re
import numpy as np

# Load the dataset
data_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/glaucoma_dataset.csv'
glaucoma_data = pd.read_csv(data_path)

# Columns to drop
columns_to_drop = ['Patient ID','Gender','Visual Acuity Measurements', 'Medical History', 'Medication Usage',
                   'Cataract Status', 'Angle Closure Status', 'Glaucoma Type']
glaucoma_data.drop(columns=columns_to_drop, inplace=True)

# Handle Missing Values
imputer = SimpleImputer(strategy='most_frequent')
glaucoma_data = pd.DataFrame(imputer.fit_transform(glaucoma_data), columns=glaucoma_data.columns)

# Feature Engineering for OCT and Visual Field Test Results
def extract_numerical_features(text, pattern):
    match = re.search(pattern, text)
    return float(match.group(1)) if match else np.nan

glaucoma_data['RNFL Thickness'] = glaucoma_data['Optical Coherence Tomography (OCT) Results'].apply(
    lambda x: extract_numerical_features(x, r'RNFL Thickness: (\d+\.\d+)')
)
glaucoma_data['GCC Thickness'] = glaucoma_data['Optical Coherence Tomography (OCT) Results'].apply(
    lambda x: extract_numerical_features(x, r'GCC Thickness: (\d+\.\d+)')
)

# Simplified Text Processing for Visual Symptoms
def simple_preprocess_text(text):
    keywords = ['vomiting','nausea','eye pain', 'vision loss', 'tunnel vision', 'halos around lights', 'redness in the eye','blurred vision']
    text = text.lower()
    found_keywords = [keyword for keyword in keywords if keyword in text]
    return ' '.join(found_keywords)

glaucoma_data['Visual Symptoms'] = glaucoma_data['Visual Symptoms'].apply(simple_preprocess_text)

# Encode Categorical Data
encoder = LabelEncoder()
for col in ['Family History']:
    glaucoma_data[col] = encoder.fit_transform(glaucoma_data[col])

# Encode the target variable 'Diagnosis'
glaucoma_data['Diagnosis'] = encoder.fit_transform(glaucoma_data['Diagnosis'])

# Normalize/Scale Numerical Data
scaler = StandardScaler()
numerical_cols = ['Intraocular Pressure (IOP)', 'Cup-to-Disc Ratio (CDR)', 'Pachymetry', 'RNFL Thickness', 'GCC Thickness']
glaucoma_data[numerical_cols] = scaler.fit_transform(glaucoma_data[numerical_cols].fillna(glaucoma_data[numerical_cols].mean()))

# Save Preprocessed Data
output_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/preprocessed_glaucoma_dataset.csv'
glaucoma_data.to_csv(output_path, index=False)

# Output the path of the saved file
print(output_path)

#print the head of the preprocessed dataset
print(glaucoma_data.head())


  glaucoma_data[numerical_cols] = scaler.fit_transform(glaucoma_data[numerical_cols].fillna(glaucoma_data[numerical_cols].mean()))


/content/drive/MyDrive/PROJECT 29/FINAL MODEL/preprocessed_glaucoma_dataset.csv
  Age  Intraocular Pressure (IOP)  Cup-to-Disc Ratio (CDR)  Family History  \
0  69                    0.448238                -0.889954               0   
1  69                    0.202593                 1.188779               0   
2  67                    1.410156                 1.188779               0   
3  23                    0.122242                 0.426577               0   
4  21                   -0.375935                -1.721448               0   

              Visual Field Test Results  \
0  Sensitivity: 0.54, Specificity: 0.75   
1  Sensitivity: 0.72, Specificity: 0.88   
2   Sensitivity: 0.56, Specificity: 0.8   
3   Sensitivity: 0.6, Specificity: 0.93   
4   Sensitivity: 0.82, Specificity: 0.9   

          Optical Coherence Tomography (OCT) Results  Pachymetry  \
0  RNFL Thickness: 86.48 µm, GCC Thickness: 64.14...   -0.284554   
1  RNFL Thickness: 96.88 µm, GCC Thickness: 56.48...    