<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cataractPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import re

# Load dataset
file_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_Cataract_Dataset.csv'

# Build the cleaned frame in one chain instead of mutating it in place, so
# re-running the cell always starts again from the file on disk:
#   * exact duplicate rows are removed
#   * "Family History" is dropped when present; errors='ignore' makes the
#     drop a no-op when the column does not exist
df = (
    pd.read_csv(file_path)
    .drop_duplicates()
    .drop(columns=['Family History'], errors='ignore')
)

# Handle missing values: every column is filled with its most frequent value.
# NOTE(review): this also fills any missing value in the target column with
# the majority class - consider dropping unlabeled rows instead; confirm.
imputer = SimpleImputer(strategy='most_frequent')

# fit_transform returns a bare NumPy array (object dtype when the frame mixes
# numbers and strings), so rebuild the frame with the original column names
# and cast every column back to the dtype it was loaded with. Without the
# cast, numeric columns would be carried forward as generic objects.
df_filled = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
).astype(df.dtypes.to_dict())

# Function to process text data
def process_text(text):
    """Normalise a free-text value.

    The text is lowercased, every character that is not a-z or whitespace is
    removed, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is stripped.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Apply text processing so that symptom values differing only in case,
# punctuation or spacing end up as the same category before encoding
df_filled['Visual Symptoms'] = df_filled['Visual Symptoms'].apply(process_text)

# Encode categorical data: one 0/1 indicator column per (feature, category)
categorical_features = ['Visual Acuity Test Results', 'Lens Opacity', 'Glare Sensitivity',
                        'UV Exposure', 'Visual Symptoms']
one_hot_encoder = OneHotEncoder()
encoded_features = one_hot_encoder.fit_transform(df_filled[categorical_features]).toarray()
feature_labels = one_hot_encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_features, columns=feature_labels).astype(int)  # Convert to int

# Rename specific one-hot encoded features
rename_dict = {
    'Visual Symptoms_blurry vision': 'Visual Symptoms_blurred vision',
    'Visual Symptoms_none': 'Visual Symptoms_no visible symptoms'
}
encoded_df = encoded_df.rename(columns=rename_dict)

# If the data holds both the old and the new spelling of a category, the
# rename leaves two columns with the same name. Merge such duplicates into a
# single indicator (max of 0/1 columns == logical OR) while keeping the
# original column order. When there is no collision this is skipped.
if not encoded_df.columns.is_unique:
    encoded_df = encoded_df.T.groupby(level=0, sort=False).max().T

# Replace the raw categorical columns with their one-hot encoded counterparts
non_categorical = df_filled.drop(columns=categorical_features)
df_final = pd.concat([non_categorical, encoded_df], axis=1)

# The target must be stored as integers
df_final['Diagnosis'] = df_final['Diagnosis'].astype(int)

# Split into predictors and target
target = df_final['Diagnosis']
features = df_final.drop(columns=['Diagnosis'])

# Report the class distribution prior to any resampling
print("Before balancing:", target.value_counts(), sep="\n")

# Define target sample size per class (approx. 5000 per class)
desired_class_size = 5000

# Derive the resampling plan from the actual class counts instead of assuming
# which class is above and which is below the target size: both samplers
# raise a ValueError when asked to "undersample" a class that is already
# smaller, or to "oversample" one that is already larger, than requested.
class_counts = target.value_counts()
shrink_to = {label: desired_class_size
             for label, count in class_counts.items() if count > desired_class_size}
grow_to = {label: desired_class_size
           for label, count in class_counts.items() if count < desired_class_size}

# Step 1: Randomly undersample every class that exceeds the target size
undersample = RandomUnderSampler(sampling_strategy=shrink_to, random_state=42)
if shrink_to:
    features_under, target_under = undersample.fit_resample(features, target)
else:
    features_under, target_under = features, target

# Step 2: Oversample every class below the target size using SMOTE
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours,
# so synthetic rows may not hold clean 0/1 values in the one-hot columns;
# SMOTENC may be a better fit for these features - confirm.
smote = SMOTE(sampling_strategy=grow_to, random_state=42)
if grow_to:
    features_balanced, target_balanced = smote.fit_resample(features_under, target_under)
else:
    features_balanced, target_balanced = features_under, target_under

# Reconstruct the DataFrame with balanced data. assign() returns a new frame,
# so `features` is never modified even when no resampling was needed.
df_balanced = pd.DataFrame(features_balanced, columns=features.columns).assign(
    Diagnosis=target_balanced.astype(int)  # Ensure it's an integer
)

# Every class must now hold exactly the requested number of rows
assert (df_balanced['Diagnosis'].value_counts() == desired_class_size).all(), \
    "class balancing did not reach the desired size for every class"

# Print class distribution after balancing
print("\nAfter balancing:")
print(df_balanced['Diagnosis'].value_counts())

# Write the balanced dataset to Drive as CSV, without the row index
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Cataract_Dataset.csv'
df_balanced.to_csv(path_or_buf=preprocessed_file_path, index=False)

# Show the first rows of what was written
preview = df_balanced.head()
print(preview)


Before balancing:
Diagnosis
0    5959
1    3808
Name: count, dtype: int64

After balancing:
Diagnosis
0    5000
1    5000
Name: count, dtype: int64
    Age History of Diabetes Smoking Status  Visual Acuity Test Results_20/100  \
0  51.0                 0.0            1.0                                  0   
1  50.0                 0.0            1.0                                  0   
2  47.0                 0.0            0.0                                  0   
3  88.0                 0.0            0.0                                  0   
4  41.0                 1.0            1.0                                  0   

   Visual Acuity Test Results_20/20  Visual Acuity Test Results_20/30  \
0                                 1                                 0   
1                                 0                                 1   
2                                 0                                 1   
3                                 0                                 0   
