<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cataract_preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Load the synthetic cataract dataset from Google Drive.
# NOTE(review): the path sits under /content/drive, so this presumably relies on
# Drive already being mounted in the Colab session — no mount call is visible
# in this cell; confirm before a fresh "Restart & Run All".
df = pd.read_csv(
    "/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/synthetic_cataract_dataset.csv"
)

# Step 1: Data Cleaning
# First remove exact duplicate rows, then discard every row that still
# contains at least one missing value.
df = df.drop_duplicates().dropna()

# Step 2: Data Transformation
# Record which feature columns are genuinely numeric *before* any encoding.
# Label encoding turns text columns into integer codes, so selecting numeric
# columns afterwards would also pick up those codes and standardize them as if
# they were continuous measurements. `include='number'` covers every numeric
# dtype (not only int64/float64), and filtering the target by name avoids the
# KeyError that `Index.drop('Diagnosis')` raises when the target is not part
# of the numeric selection.
numeric_cols = pd.Index(
    [col for col in df.select_dtypes(include='number').columns if col != 'Diagnosis']
)

# Identify categorical columns (object dtype, i.e. text read from the CSV)
categorical_cols = df.select_dtypes(include=['object']).columns

# Encoding categorical variables
# Each column gets its own LabelEncoder; the fitted encoders are kept in
# `label_encoders` (column name -> encoder) so the integer codes can be mapped
# back to the original labels with `inverse_transform`.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Standardizing numerical features (zero mean, unit variance per column).
# The target and the label-encoded columns are deliberately left untouched.
scaler = StandardScaler()
if len(numeric_cols) > 0:
    # StandardScaler rejects an input with zero columns, so only fit when
    # there is at least one numeric feature.
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Step 3: Feature Selection
# The 'Diagnosis' column is the prediction target; every other column is kept
# as an input feature.
y = df['Diagnosis']  # Target variable
X = df.drop('Diagnosis', axis=1)

# Step 4: Handling Imbalanced Data using SMOTE
# The fixed random_state makes the synthetic samples reproducible across runs.
# NOTE(review): SMOTE creates new rows by interpolating between neighbouring
# samples, so label-encoded categorical columns may end up with values that do
# not correspond to any original category — confirm this is acceptable.
# NOTE(review): oversampling is applied to the whole dataset here; if a
# train/test split happens downstream, the test portion would contain
# synthetic rows — confirm against the consuming notebook.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Rebuild a single DataFrame: resampled features under their original column
# names, with the resampled target appended as the last column.
balanced_df = pd.DataFrame(X_resampled, columns=X.columns).assign(
    Diagnosis=y_resampled
)

# Report how many rows each class has after balancing
class_counts = balanced_df['Diagnosis'].value_counts()
print("Balanced dataset class distribution:")
print(class_counts)


Balanced dataset class distribution:
Diagnosis
1    2537
0    2537
Name: count, dtype: int64
