<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/dmePPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab runtime so that files stored in Drive
# become reachable through the local mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder

# Read the raw dataset from Google Drive and discard exact duplicate rows
file_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_DME_Dataset.csv'
df = pd.read_csv(file_path).drop_duplicates()

# Report the raw class counts. This runs before imputation, so rows whose
# 'Diagnosis' is missing are not counted (value_counts skips NaN by default).
print("Before balancing:")
class_counts_before = df['Diagnosis'].value_counts()
print(class_counts_before)

# Impute every missing cell with the most frequent value of its column:
# the first row of DataFrame.mode() holds one mode per column.
# NOTE(review): this also imputes the 'Diagnosis' target itself — confirm
# that filling labels with the majority class is intended.
column_modes = df.mode().iloc[0]
df = df.fillna(column_modes)

# Simplified text processing for 'Visual Symptoms': every known symptom phrase
# becomes its own 0/1 indicator column.
symptoms = ["floaters", "vision loss area", "distorted vision", "color vision changes", "blurred vision"]

# Symptom phrase -> name of the prefixed indicator column it produces
renamed_columns = {symptom: f"Visual Symptoms_{symptom}" for symptom in symptoms}

# Lower-case the free text once (loop-invariant) and match each phrase as a
# literal substring. regex=False keeps a phrase from ever being interpreted as
# a pattern; na=False treats a missing description as "symptom absent".
visual_symptoms_text = df['Visual Symptoms'].str.lower()
for symptom, column_name in renamed_columns.items():
    df[column_name] = visual_symptoms_text.str.contains(symptom, regex=False, na=False).astype(int)

# One-hot encode 'Visual Acuity Test Results' and 'Lens Status'.
# drop_first=True omits the first level of each column; dtype='int64' makes
# get_dummies emit 0/1 integers directly instead of booleans, so no
# element-wise True/False conversion pass is needed afterwards.
df = pd.get_dummies(
    df,
    columns=['Visual Acuity Test Results', 'Lens Status'],
    drop_first=True,
    dtype='int64',
)

# Drop the original free-text column now that the indicators exist
df = df.drop(columns='Visual Symptoms')

# Encode the target variable 'Diagnosis' as Yes -> 1, No -> 0.
# Series.map turns any label outside the mapping into NaN; fail here with the
# offending labels instead of letting NaN reach the resampling step.
diagnosis_codes = df['Diagnosis'].map({'Yes': 1, 'No': 0})
unmapped_labels = df.loc[diagnosis_codes.isna(), 'Diagnosis'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected 'Diagnosis' labels (expected 'Yes'/'No'): {list(unmapped_labels)}")
df['Diagnosis'] = diagnosis_codes.astype('int64')

# Convert any remaining boolean columns (e.g. TRUE/FALSE columns parsed by
# read_csv) to 1/0. This replaces the deprecated DataFrame.applymap call with
# a vectorised dtype cast; with no boolean columns it is a no-op.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column_name: 'int64' for column_name in bool_columns})

# Separate the target column from the feature matrix
y = df['Diagnosis']
X = df.drop(columns='Diagnosis')

# Number of rows each class should have after balancing
desired_class_size = 5000

# Step 1: randomly discard rows of class 0 (the majority, "No DME") until
# exactly desired_class_size of them remain
undersample = RandomUnderSampler(
    sampling_strategy={0: desired_class_size},
    random_state=42,
)
X_under, y_under = undersample.fit_resample(X, y)

# Step 2: grow class 1 (the minority, "DME present") to desired_class_size
# with SMOTE-generated synthetic rows
# NOTE(review): SMOTE is applied to the 0/1 indicator columns as if they were
# continuous — confirm this is acceptable or consider SMOTENC.
smote = SMOTE(
    sampling_strategy={1: desired_class_size},
    random_state=42,
)
X_balanced, y_balanced = smote.fit_resample(X_under, y_under)

# Rebuild a single frame: resampled features first, integer target last
df_balanced = pd.DataFrame(X_balanced, columns=X.columns).assign(
    Diagnosis=y_balanced.astype(int)
)

# Print class distribution after balancing
print("\nAfter balancing:")
print(df_balanced['Diagnosis'].value_counts())

# Save the preprocessed data. DataFrame.to_csv does not create missing
# folders, so make sure the destination directory exists first; otherwise the
# whole preprocessing run would be lost to an error at the very last step.
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Balanced_DME_Dataset.csv'
Path(preprocessed_file_path).parent.mkdir(parents=True, exist_ok=True)
df_balanced.to_csv(preprocessed_file_path, index=False)

print("Preprocessing completed and data saved to:", preprocessed_file_path)

# Show the first rows of the preprocessed dataset; as the cell's last
# expression it is rendered as a rich table rather than wrapped plain text
df_balanced.head()


Before balancing:
Diagnosis
No     8523
Yes    1465
Name: count, dtype: int64

After balancing:
Diagnosis
0    5000
1    5000
Name: count, dtype: int64
Preprocessing completed and data saved to: /content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Balanced_DME_Dataset.csv
   Age  Intraocular Pressure (IOP)  Visual Symptoms_floaters  \
0   36                          34                         1   
1   31                          14                         0   
2   54                          23                         0   
3   83                          20                         0   
4   32                          11                         0   

   Visual Symptoms_vision loss area  Visual Symptoms_distorted vision  \
0                                 0                                 0   
1                                 0                                 1   
2                                 0                                 0   
3                                 1         

  df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))
