<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cnvPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used by the following cells resolve (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
import re

# Load dataset
data_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_CNV_Detection_Dataset.csv'
cnv_dataset = pd.read_csv(data_path)

# 1. Handle Missing Values
# Every column that contains at least one null is filled with that column's
# most frequent value (its mode). The filled Series is assigned back to the
# frame instead of calling `fillna(..., inplace=True)` on `cnv_dataset[column]`:
# that chained in-place form acts on an intermediate object, emits a
# FutureWarning on pandas 2.x and leaves the frame unchanged once
# copy-on-write is active.
for column in cnv_dataset.columns:
    if cnv_dataset[column].isnull().any():
        most_frequent_category = cnv_dataset[column].mode()[0]
        cnv_dataset[column] = cnv_dataset[column].fillna(most_frequent_category)

# 2. Remove "Visual Acuity Test Results" Column
cnv_dataset = cnv_dataset.drop(columns=['Visual Acuity Test Results'])

# 3. Convert "Smoking Status" to Binary (1 = Smoker, 0 = Non-smoker)
# Values other than the two labels in the mapping become NaN.
cnv_dataset['Smoking Status'] = cnv_dataset['Smoking Status'].map({'Smoker': 1, 'Non-smoker': 0})

# 4. Process "Visual Symptoms" into Individual Binary Columns

def format_symptom_column(symptom):
    """Return the indicator-column name used for ``symptom``.

    The name is the fixed prefix ``"Visual Symptoms_"`` (exactly one
    underscore) followed by the symptom text as given.
    """
    prefix = "Visual Symptoms_"
    return prefix + symptom

# Symptom phrases (all lower-case) that get their own binary column
visual_symptoms_list = [
    "occasional blurred vision",
    "blurred vision",
    "seeing dark spots",
    "temporary vision disturbances",
    "sudden vision loss",
    "colors appear faded",
    "distorted vision",
    "lines appear wavy",
    "no visible symptoms",
    "mild eye strain",
    "difficulty reading",
    "loss of central vision",
]

# Create one indicator column per phrase, named via format_symptom_column
# and pre-filled with 0
for symptom in visual_symptoms_list:
    cnv_dataset[format_symptom_column(symptom)] = 0

def process_visual_symptoms(symptoms, idx):
    """Set the binary symptom columns of ``cnv_dataset`` for one record.

    Each phrase in ``visual_symptoms_list`` is looked up, as a
    case-insensitive substring, in ``symptoms``. Every phrase found sets the
    corresponding ``format_symptom_column(phrase)`` cell of row ``idx`` to 1.
    When no phrase is found, the "no visible symptoms" indicator of that row
    is set to 1 instead.

    Args:
        symptoms: Symptom description text of the record (must support
            ``.lower()``, i.e. a ``str``).
        idx: Index label of the record's row in ``cnv_dataset``.

    Returns:
        None. The module-level ``cnv_dataset`` is updated in place.
    """
    symptoms = symptoms.lower()
    found_any = False
    # NOTE(review): matching is by substring, so a record containing
    # "occasional blurred vision" also sets the "blurred vision" column --
    # confirm that this overlap is intended.
    for symptom in visual_symptoms_list:
        if symptom in symptoms:
            cnv_dataset.at[idx, format_symptom_column(symptom)] = 1
            found_any = True
    if not found_any:
        # Build the fallback name with the same helper as every other column.
        # The previous hard-coded "Visual Symptom_no visible symptoms" (no "s")
        # did not match any initialised column, so `.at` created a stray
        # column and the real indicator stayed 0.
        cnv_dataset.at[idx, format_symptom_column("no visible symptoms")] = 1

# Encode every record: hand each row's symptom text, together with its index
# label, to process_visual_symptoms
for idx, symptom_text in cnv_dataset["Visual Symptoms"].items():
    process_visual_symptoms(symptom_text, idx)


# The raw text column is now represented by the indicator columns, so remove it
cnv_dataset.drop('Visual Symptoms', axis=1, inplace=True)

# 5. One-Hot Encoding for Other Categorical Features
# drop='first' removes the first category of each feature to avoid the
# dummy variable trap
categorical_data = cnv_dataset[['Optical Coherence Tomography (OCT) Results', 'Fluorescein Angiography Results']]
encoder = OneHotEncoder(drop='first')
categorical_encoded = encoder.fit_transform(categorical_data).toarray()
categorical_df = pd.DataFrame(
    categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_data.columns),
)

# Encode target variable 'Diagnosis': the column is overwritten with the
# encoder's fitted labels
label_encoder = LabelEncoder()
encoded_diagnosis = label_encoder.fit_transform(cnv_dataset['Diagnosis'])
cnv_dataset['Diagnosis'] = encoded_diagnosis

# Concatenate all preprocessed parts side by side: the three leading columns,
# then the one-hot columns, then cnv_dataset without the two raw categorical
# columns
leading_part = cnv_dataset[['Age', 'Diagnosis', 'Smoking Status']]
remaining_part = cnv_dataset.drop(columns=['Optical Coherence Tomography (OCT) Results',
                                           'Fluorescein Angiography Results'])
final_preprocessed_data = pd.concat([leading_part, categorical_df, remaining_part], axis=1)

# remaining_part repeats the leading columns; keep only the first occurrence
# of every column name
is_repeated = final_preprocessed_data.columns.duplicated()
final_preprocessed_data = final_preprocessed_data.loc[:, ~is_repeated]

# 6. Save Preprocessed Data (CSV without the index column)
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_CNV_Detection_Dataset.csv'
final_preprocessed_data.to_csv(preprocessed_file_path, index=False)

# Display the head of the preprocessed dataset
final_preprocessed_data.head()


Unnamed: 0,Age,Diagnosis,Smoking Status,Optical Coherence Tomography (OCT) Results_Early CNV,Optical Coherence Tomography (OCT) Results_Normal,Optical Coherence Tomography (OCT) Results_Scarred/End-stage CNV,Fluorescein Angiography Results_Early Neovascularization,Fluorescein Angiography Results_No Neovascularization,Visual Symptoms_occasional blurred vision,Visual Symptoms_blurred vision,Visual Symptoms_seeing dark spots,Visual Symptoms_temporary vision disturbances,Visual Symptoms_sudden vision loss,Visual Symptoms_colors appear faded,Visual Symptoms_distorted vision,Visual Symptoms_lines appear wavy,Visual Symptoms_no visible symptoms,Visual Symptoms_mild eye strain,Visual Symptoms_difficulty reading,Visual Symptoms_loss of central vision
0,67,1,0,0.0,1.0,0.0,0.0,1.0,1,1,0,0,0,0,0,0,0,0,0,0
1,57,0,1,0.0,1.0,0.0,0.0,1.0,0,1,1,0,0,0,0,0,0,0,0,0
2,69,1,0,0.0,1.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0
3,82,0,0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0
4,56,0,0,1.0,0.0,0.0,0.0,1.0,0,1,1,0,0,0,0,0,0,0,0,0
