<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cnvPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import re

# Load dataset
data_path = '/content/drive/MyDrive/DSGP PROJECT 29/DATASETS/Synthetic_CNV_Detection_Dataset.csv'
cnv_dataset = pd.read_csv(data_path)

# 1. Handle Missing Values
# Per-column count of missing cells, taken before imputation so it can be inspected.
missing_values = cnv_dataset.isnull().sum()
# Replace missing values with the most frequent value (mode) of each column.
for column in cnv_dataset.columns:
    if cnv_dataset[column].isnull().any():
        column_modes = cnv_dataset[column].mode()
        # mode() skips NaN by default, so a column with no observed values
        # has no mode; leave such a column untouched instead of raising.
        if column_modes.empty:
            continue
        most_frequent_category = column_modes.iloc[0]
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on the cnv_dataset[column] selection is chained
        # assignment: pandas warns about it and, with Copy-on-Write enabled,
        # the frame itself is never updated.
        cnv_dataset[column] = cnv_dataset[column].fillna(most_frequent_category)

# 2. Feature Engineering
# Bucket 'Age' into the labelled ranges below. Bins are closed on the right
# (e.g. 40 falls in '0-40'); include_lowest=True also places an age of exactly
# 0 in '0-40' as the label implies. Ages above 100 fall outside every bin and
# therefore get a missing 'Age Group'.
cnv_dataset['Age Group'] = pd.cut(
    cnv_dataset['Age'],
    bins=[0, 40, 60, 80, 100],
    labels=['0-40', '41-60', '61-80', '81-100'],
    include_lowest=True,
)
# 3. Text Processing
# Process 'Visual Symptoms' by removing stop words, tokenizing, and stemming
def clean_text(text):
    """Normalize a free-text description into space-separated word stems.

    The text is lower-cased, stripped of punctuation, split on whitespace,
    filtered against a small stop-word list, and each remaining token has one
    trailing 'ing', 's' or 'ed' removed (a deliberately crude stemmer).

    Args:
        text: The raw description. ``None`` and float NaN are treated as
            empty text; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned stems joined by single spaces, or '' when nothing remains.
    """
    # Treat missing values (None, or float NaN which is the only float that
    # is not equal to itself) as empty text instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lower case
    text = str(text).lower()
    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize on whitespace and remove stop words
    stop_words = {'a', 'the', 'and', 'of', 'in', 'to', 'for'}
    tokens = [word for word in text.split() if word not in stop_words]
    # Stemming - simple approach by stripping a single trailing suffix
    stems = (re.sub(r'ing$|s$|ed$', '', token) for token in tokens)
    # A token consisting only of a suffix (e.g. 's', 'ed') stems to ''; drop
    # those so the result never contains doubled or leading/trailing spaces.
    return ' '.join(stem for stem in stems if stem)

# Normalize every 'Visual Symptoms' entry with clean_text.
cnv_dataset['Visual Symptoms'] = cnv_dataset['Visual Symptoms'].map(clean_text)

# 4. Encode Categorical Data
# One LabelEncoder per object-dtype column; the fitted encoders are kept in
# label_encoders (keyed by column name) so the integer codes can be mapped back.
# NOTE(review): only object-dtype columns are selected, so the categorical
# 'Age Group' column produced by pd.cut is left as its string labels.
object_columns = list(cnv_dataset.select_dtypes(include=['object']).columns)
label_encoders = {name: LabelEncoder() for name in object_columns}
for name, encoder in label_encoders.items():
    cnv_dataset[name] = encoder.fit_transform(cnv_dataset[name])

# 5. Normalize/Scale Numerical Data
# Standardize 'Age' with a StandardScaler fitted on the full column.
scaler = StandardScaler()
scaled_age = scaler.fit_transform(cnv_dataset[['Age']])
cnv_dataset['Age'] = scaled_age

# 6. Save Preprocessed Data
preprocessed_file_path = '/content/drive/MyDrive/DSGP PROJECT 29/FINAL MODEL/Preprocessed_CNV_Detection_Dataset.csv'
cnv_dataset.to_csv(preprocessed_file_path, index=False)

# Show the first rows of the preprocessed dataset as the cell output.
cnv_dataset.head()


Unnamed: 0,Age,Optical Coherence Tomography (OCT) Results,Fluorescein Angiography Results,Visual Acuity Test Results,Smoking Status,Visual Symptoms,Diagnosis,Age Group
0,0.503567,2,2,3,0,5,1,61-80
1,-0.165083,2,2,1,1,0,0,41-60
2,0.637297,2,0,3,0,7,1,61-80
3,1.506541,2,1,1,0,6,0,81-100
4,-0.231948,1,2,1,0,0,0,41-60
