<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cnvPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# addressed by ordinary file paths. Relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load dataset
data_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_CNV_Detection_Dataset.csv'
cnv_dataset = pd.read_csv(data_path)

# 1. Handle Missing Values
# Per-column count of nulls; columns with a non-zero count are imputed with
# their most frequent value (the first entry returned by Series.mode()).
missing_values = cnv_dataset.isnull().sum()
for column in cnv_dataset.columns:
    if missing_values[column] > 0:
        most_frequent_category = cnv_dataset[column].mode()[0]
        # Assign the filled column back instead of calling
        # `cnv_dataset[column].fillna(..., inplace=True)`: that form is a
        # chained in-place update on a temporary Series, which pandas warns
        # about and which does not modify `cnv_dataset` under Copy-on-Write.
        cnv_dataset[column] = cnv_dataset[column].fillna(most_frequent_category)

# 2. Text Processing for 'Visual Symptoms' - using TF-IDF Vectorization
def clean_text(text):
    """Return ``text`` lower-cased with punctuation stripped.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is removed; whitespace itself is left as-is.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise the free-text symptom column (overwrites it in `cnv_dataset`).
cnv_dataset['Visual Symptoms'] = cnv_dataset['Visual Symptoms'].apply(clean_text)

# Fit a TF-IDF vectorizer on the cleaned text: English stop words are dropped
# and the vocabulary is capped at 100 features.
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
visual_symptoms_tfidf = tfidf.fit_transform(cnv_dataset['Visual Symptoms'])

# Densify the sparse matrix into a DataFrame with one column per term, each
# column name carrying a 'VS_' prefix.
visual_symptoms_df = pd.DataFrame(
    visual_symptoms_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
).add_prefix('VS_')

# 3. One-Hot Encoding for other categorical data
# Select the four categorical columns, one-hot encode them, and wrap the dense
# result in a DataFrame whose column names come from the fitted encoder.
categorical_data = cnv_dataset[[
    'Optical Coherence Tomography (OCT) Results',
    'Fluorescein Angiography Results',
    'Visual Acuity Test Results',
    'Smoking Status',
]]
encoder = OneHotEncoder()
categorical_encoded = encoder.fit_transform(categorical_data).toarray()
categorical_df = pd.DataFrame(
    data=categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_data.columns),
)

# Encode target variable 'Diagnosis'
# LabelEncoder (from sklearn.preprocessing, imported with the other imports at
# the top of this cell) replaces each distinct diagnosis label with an integer
# code. The fitted `label_encoder` is kept so that `label_encoder.classes_`
# can be used to map the codes back to the original labels.
label_encoder = LabelEncoder()
label_encoder.fit(cnv_dataset['Diagnosis'])
cnv_dataset['Diagnosis'] = label_encoder.transform(cnv_dataset['Diagnosis'])

# Concatenate all preprocessed parts
# Column-wise join of: the numeric 'Age' and encoded 'Diagnosis' columns, the
# one-hot encoded categoricals, and the TF-IDF symptom features.
preprocessed_data = pd.concat(
    [
        cnv_dataset[['Age', 'Diagnosis']],
        categorical_df,
        visual_symptoms_df,
    ],
    axis='columns',
)

# 4. Save Preprocessed Data
# Written without the index so the CSV contains only feature/target columns.
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_CNV_Detection_Dataset.csv'
preprocessed_data.to_csv(preprocessed_file_path, index=False)

# Display the head of the preprocessed dataset
preprocessed_data.head()


Unnamed: 0,Age,Diagnosis,Optical Coherence Tomography (OCT) Results_Active CNV,Optical Coherence Tomography (OCT) Results_Early CNV,Optical Coherence Tomography (OCT) Results_Normal,Optical Coherence Tomography (OCT) Results_Scarred/End-stage CNV,Fluorescein Angiography Results_Active Neovascularization,Fluorescein Angiography Results_Early Neovascularization,Fluorescein Angiography Results_No Neovascularization,Visual Acuity Test Results_Blindness,...,VS_reading,VS_seeing,VS_spots,VS_strain,VS_sudden,VS_symptoms,VS_temporary,VS_visible,VS_vision,VS_wavy
0,67,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316588,0.0
1,57,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.514695,0.514695,0.0,0.0,0.0,0.0,0.0,0.217145,0.0
2,69,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.678554,0.0,0.281297,0.0
3,82,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.479115,0.0,0.0,0.0,0.197099,0.0
4,56,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.514695,0.514695,0.0,0.0,0.0,0.0,0.0,0.217145,0.0
