<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/drusenPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load the dataset
file_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_Drusen_Dataset.csv'
df = pd.read_csv(file_path)

#duplicate count
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Drop duplicate rows
#df.drop_duplicates(inplace=True)

# Handle Missing Values
for column in df.columns:
    most_frequent = df[column].mode()[0]
    df[column].fillna(most_frequent, inplace=True)

# Print class distribution before balancing
print("Before balancing:")
print(df['Diagnosis'].value_counts())

# Convert 'Smoking Status' to binary (1 for 'Yes', 0 for 'No')
df['Smoking Status'] = df['Smoking Status'].map({'Yes': 1, 'No': 0})

# One-Hot Encode 'Visual Symptoms' with lowercase labels after '_'
visual_symptoms_dummies = pd.get_dummies(df['Visual Symptoms'], prefix='Visual Symptoms')

# Convert only the part after "Visual Symptoms_" to lowercase
visual_symptoms_dummies.columns = [
    'Visual Symptoms_' + col.split('_')[1].lower() if '_' in col else col
    for col in visual_symptoms_dummies.columns
]

# Convert TRUE/FALSE features to binary (1 for TRUE, 0 for FALSE)
df.replace({True: 1, False: 0}, inplace=True)
visual_symptoms_dummies = visual_symptoms_dummies.astype(int)  # Ensure one-hot encoded values are 1/0

# Combine all features
features = pd.concat([df.drop(['Diagnosis', 'Visual Symptoms'], axis=1), visual_symptoms_dummies], axis=1)
target = df['Diagnosis']

# Define target sample size per class (approx. 5000 for each)
desired_class_size = 5000

# Step 1: Undersample the majority class (0 - No Drusen) down to 5000
undersample = RandomUnderSampler(sampling_strategy={0: desired_class_size}, random_state=42)
features_under, target_under = undersample.fit_resample(features, target)

# Step 2: Oversample the minority class (1 - Drusen) up to 5000 using SMOTE
smote = SMOTE(sampling_strategy={1: desired_class_size}, random_state=42)
features_balanced, target_balanced = smote.fit_resample(features_under, target_under)

# Reconstruct the DataFrame with balanced data
df_balanced = pd.DataFrame(features_balanced, columns=features.columns)
df_balanced['Diagnosis'] = target_balanced

# Print class distribution after balancing
print("\nAfter balancing:")
print(df_balanced['Diagnosis'].value_counts())

# Save Preprocessed Data
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Drusen_Dataset.csv'
df_balanced.to_csv(preprocessed_file_path, index=False)

# Display the head of the preprocessed dataset
print(df_balanced.head())


Number of duplicate rows: 0
Before balancing:
Diagnosis
0    8277
1    1723
Name: count, dtype: int64

After balancing:
Diagnosis
0    5000
1    5000
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

   Age  Smoking Status   BMI  Blood Pressure  Cholesterol Levels  \
0   55               1  20.3             137                 248   
1   50               0  31.3             152                 169   
2   81               0  29.4             141                 163   
3   81               1  22.4             122                 214   
4   50               0  24.3             176                 234   

   Visual Symptoms_blind spots  Visual Symptoms_blurred vision  \
0                            1                               0   
1                            1                               0   
2                            0                               0   
3                            0                               0   
4                            0                               0   

   Visual Symptoms_distorted vision  Visual Symptoms_light sensitivity  \
0                                 0                                  0   
1                                 0           