<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/diabeticretinopathyPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed through that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import re

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_Diabetic_Retinopathy_Dataset.csv')

# Step 1: Handle Missing Values
# Fill every missing value with the most frequent value of its own column.
imputer = SimpleImputer(strategy='most_frequent')
# On a frame that mixes text and numeric columns, fit_transform returns a
# single object-dtype array; writing that back through `df.iloc[:, :]` can
# leave the numeric columns stored as object dtype. Rebuild the frame from the
# imputed values and cast each column back to the dtype it was loaded with.
original_dtypes = df.dtypes.to_dict()
df = pd.DataFrame(
    imputer.fit_transform(df), columns=df.columns, index=df.index
).astype(original_dtypes)

# Step 2: Feature Engineering
# Derive a new column: 'Total Cholesterol' divided by 'HDL Cholesterol'.
# NOTE(review): nothing here guards against a zero 'HDL Cholesterol' value —
# confirm the data never contains one.
df['Cholesterol_HDL_Ratio'] = df['Total Cholesterol'].div(df['HDL Cholesterol'])

# Step 3: Text Processing
# Simple text processing using basic Python functions and regex
def clean_text(text):
    """Normalize a free-text string.

    The text is lowercased, each run of whitespace is collapsed to a single
    space, every character that is neither a word character nor whitespace is
    removed, and the remaining tokens are re-joined with single spaces (which
    also trims leading and trailing whitespace).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with tokens separated by exactly one space.
    """
    lowered = text.lower()
    # Collapse whitespace first, then strip punctuation from the result.
    without_punct = re.sub(r'[^\w\s]', '', re.sub(r'\s+', ' ', lowered))
    # split()/join() tokenizes and squeezes any spacing left behind.
    return ' '.join(without_punct.split())

# Run clean_text over every entry of the 'Visual Symptoms' column.
df['Visual Symptoms'] = df['Visual Symptoms'].map(clean_text)

# Step 4: Encode Categorical Data
# One-hot encode 'Smoking Status': fit the encoder on that single column, add
# one indicator column per category it found, then remove the original column.
encoder = OneHotEncoder().fit(df[['Smoking Status']])
encoded_feature_names = encoder.get_feature_names_out(['Smoking Status'])
# transform() returns a sparse matrix; densify it before assigning to the frame.
encoded_features = encoder.transform(df[['Smoking Status']]).toarray()
df[encoded_feature_names] = encoded_features
df.drop(columns=['Smoking Status'], inplace=True)

# Step 5: Normalize/Scale Numerical Data
# Columns to standardize to zero mean and unit variance.
numerical_features = [
    'Retinal Thickness',
    'Cotton Wool Spots Count',
    'LDL Cholesterol',
    'HDL Cholesterol',
    'Total Cholesterol',
    'Microaneurysms Count',
    'Hemorrhages Count',
    'Cholesterol_HDL_Ratio',
]
# Fit the scaler on those columns, then overwrite them with the scaled values.
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])


# All processed features and the target already live in `df`, so no separate
# combine step is needed. Class imbalance is not addressed in this script;
# handle it externally or in a later step.

# Step 6: Save Preprocessed Data
# Write the preprocessed frame to a new CSV file, omitting the row index.
final_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_Diabetic_Retinopathy_Dataset.csv'
df.to_csv(final_file_path, index=False)

# A bare expression is only echoed when it is the last statement of a cell,
# so print the path explicitly to show where the file was written.
print(final_file_path)

# Show the first rows of the preprocessed dataset.
df.head()



Unnamed: 0,Age,Retinal Thickness,Cotton Wool Spots Count,LDL Cholesterol,HDL Cholesterol,Total Cholesterol,Microaneurysms Count,Hemorrhages Count,Visual Symptoms,Diagnosis,Cholesterol_HDL_Ratio,Smoking Status_0,Smoking Status_1
0,59,-0.691365,-0.707243,-0.266937,0.306032,-0.195401,1.434473,-1.156751,occasional blurriness or floaters,1,-0.075919,0.0,1.0
1,53,-0.318727,3.604341,-0.579857,-3.221905,-2.403856,0.722576,1.14742,fluffy white patches in vision,1,6.28361,1.0,0.0
2,61,-0.610329,-0.707243,-0.730721,-0.000806,-0.312462,0.722576,-1.732794,no visible symptoms,0,-0.053497,1.0,0.0
3,70,0.096791,-0.707243,0.834876,0.782409,1.2856,0.722576,-1.156751,no visible symptoms,1,-0.048788,0.0,1.0
4,52,1.182509,0.729951,-0.335322,-2.277486,-1.520086,-1.413116,0.571377,fluffy white patches in vision general vision ...,1,0.463701,0.0,1.0
