<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/diabeticretinopathyPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'  # directory under which Drive is mounted
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# --- Load and clean the raw dataset ---
file_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_Diabetic_Retinopathy_Dataset.csv'

# Read the CSV and discard exact duplicate rows in a single expression.
# The surviving rows keep their original index labels.
df = pd.read_csv(file_path).drop_duplicates()

# Columns excluded from the feature set; errors='ignore' skips any that are absent.
columns_to_drop = ['Total Cholesterol', 'HDL Cholesterol', 'Cholesterol_HDL_Ratio']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Fill missing values with each column's most frequent value.
# NOTE(review): the same strategy is applied to every column, including
# continuous ones such as 'Retinal Thickness' -- confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent')
df.iloc[:, :] = imputer.fit_transform(df)

# Text Processing and Encoding for "Visual Symptoms"
# Canonical label for each known symptom phrase. Only
# 'occasional blurriness or floaters' is actually renamed; the other entries
# map to themselves. Any phrase missing from this table becomes NaN after
# .map(), which the encoder below turns into a 'Visual Symptoms_nan' column.
visual_symptoms_map = {
    'fluffy white patches in vision': 'fluffy white patches in vision',
    'occasional blurriness or floaters': 'occasional blurred vision',
    'no visible symptoms': 'no visible symptoms',
    'general vision blurriness': 'general vision blurriness',
    'blotches of dark vision': 'blotches of dark vision',
    'small dark spots in vision': 'small dark spots in vision'
}
df['Visual Symptoms'] = df['Visual Symptoms'].map(visual_symptoms_map)

# One-Hot Encode 'Visual Symptoms'
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[['Visual Symptoms']]).toarray()
encoded_feature_names = encoder.get_feature_names_out(['Visual Symptoms'])

# Column assignment from a DataFrame aligns on the row index. The encoded
# array is positional, so it must carry df's own index; a default 0..n-1
# index would misalign rows (and introduce NaN) whenever df's index has gaps,
# e.g. after duplicate rows were removed.
df[encoded_feature_names] = pd.DataFrame(
    encoded_features,
    columns=encoded_feature_names,
    index=df.index,
    dtype=int,  # store indicators as 0/1 integers
)

# Remove the original text column now that it is encoded.
df = df.drop(columns=['Visual Symptoms'])

# Drop 'Visual Symptoms_nan' if it exists (rows with an unmapped phrase are
# then left with all indicator columns equal to 0).
df = df.drop(columns=['Visual Symptoms_nan'], errors='ignore')

# Cast the label column to plain integers.
df['Diagnosis'] = df['Diagnosis'].astype(int)

# Report the class counts prior to any resampling.
print("Before balancing:", df['Diagnosis'].value_counts(), sep="\n")

# Separate the label vector from the feature matrix.
target = df['Diagnosis']
features = df.drop(columns='Diagnosis')

# Number of rows each class should have after resampling (approx. 5000 per class)
desired_class_size = 5000

# Step 1: randomly undersample class 1 (DR) down to desired_class_size rows.
undersample = RandomUnderSampler(
    sampling_strategy={1: desired_class_size},
    random_state=42,
)
features_under, target_under = undersample.fit_resample(features, target)

# Step 2: oversample class 0 (No DR) up to desired_class_size rows with SMOTE.
# NOTE(review): SMOTE creates synthetic rows by interpolation, so the one-hot
# and count columns may hold fractional values in those rows -- confirm that
# downstream code tolerates this.
smote = SMOTE(
    sampling_strategy={0: desired_class_size},
    random_state=42,
)
features_balanced, target_balanced = smote.fit_resample(features_under, target_under)

# Reassemble the resampled features and the integer label into one frame.
df_balanced = pd.DataFrame(features_balanced, columns=features.columns).assign(
    Diagnosis=target_balanced.astype(int)
)

# Report the class counts after resampling.
print("\nAfter balancing:", df_balanced['Diagnosis'].value_counts(), sep="\n")

# Write the balanced dataset to Drive, omitting the index column.
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Diabetic_Retinopathy_Dataset.csv'
df_balanced.to_csv(preprocessed_file_path, index=False)

# Preview the first rows of the preprocessed dataset.
print(df_balanced.head())


Before balancing:
Diagnosis
1    5750
0    4250
Name: count, dtype: int64

After balancing:
Diagnosis
0    5000
1    5000
Name: count, dtype: int64
   Age  Retinal Thickness  Cotton Wool Spots Count  LDL Cholesterol  \
0   61         220.130947                        0        85.532216   
1   50         296.096819                        0        97.790530   
2   57         184.210130                        0       113.111589   
3   58         272.915897                        1        77.144017   
4   40         253.239568                        0        89.395615   

   Microaneurysms Count  Hemorrhages Count  Smoking Status  \
0                     3                  0               0   
1                     2                  4               0   
2                     0                  1               1   
3                     1                  1               0   
4                     1                  5               0   

   Visual Symptoms_blotches of dark vision  \
0     