<a href="https://colab.research.google.com/github/DinurakshanRavichandran/Visio-Glance/blob/Pre-Processed-Datasets-NLP/cnvPPFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE

# Load dataset
data_path = '/content/drive/MyDrive/PROJECT 29/DATASETS/Synthetic_CNV_Detection_Dataset.csv'
cnv_dataset = pd.read_csv(data_path)

# Remove duplicate rows
cnv_dataset.drop_duplicates(inplace=True)

# Print class distribution before balancing
print("Before balancing:")
print(cnv_dataset['Diagnosis'].value_counts())

# Print duplicate count
print("Duplicate rows removed:", cnv_dataset.duplicated().sum())

# Print missing values
print("Missing values before handling:")
print(cnv_dataset.isnull().sum())

# 1. Handle Missing Values
cnv_dataset.fillna(cnv_dataset.mode().iloc[0], inplace=True)

# 2. Remove "Visual Acuity Test Results" Column if it exists
if 'Visual Acuity Test Results' in cnv_dataset.columns:
    cnv_dataset.drop(columns=['Visual Acuity Test Results'], inplace=True)

# 3. Convert "Smoking Status" to Binary (1 = Smoker, 0 = Non-smoker)
cnv_dataset['Smoking Status'] = cnv_dataset['Smoking Status'].map({'Smoker': 1, 'Non-smoker': 0})

# 4. Process "Visual Symptoms" into Individual Binary Columns
visual_symptoms_list = [
    "occasional blurred vision", "blurred vision", "seeing dark spots",
    "temporary vision disturbances", "sudden vision loss", "colors appear faded",
    "distorted vision", "lines appear wavy", "no visible symptoms",
    "mild eye strain", "difficulty reading", "loss of central vision"
]

# Initialize binary columns for visual symptoms
for symptom in visual_symptoms_list:
    cnv_dataset["Visual Symptoms_" + symptom] = cnv_dataset['Visual Symptoms'].str.contains(symptom, case=False, na=False).astype(int)

# Convert all visual symptoms columns to integer type
visual_symptom_cols = [col for col in cnv_dataset.columns if col.startswith("Visual Symptoms_")]
cnv_dataset[visual_symptom_cols] = cnv_dataset[visual_symptom_cols].astype(int)


# Drop original "Visual Symptoms" column
cnv_dataset.drop(columns=['Visual Symptoms'], inplace=True)

# 5. One-Hot Encoding for Other Categorical Features
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_data = cnv_dataset[['Optical Coherence Tomography (OCT) Results', 'Fluorescein Angiography Results']]
categorical_encoded = encoder.fit_transform(categorical_data)
categorical_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(categorical_data.columns)).astype(int)


# Encode target variable 'Diagnosis'
label_encoder = LabelEncoder()
cnv_dataset['Diagnosis'] = label_encoder.fit_transform(cnv_dataset['Diagnosis'])  # 0 = No CNV, 1 = CNV Present

# Concatenate all preprocessed parts
final_preprocessed_data = pd.concat([
    cnv_dataset.drop(columns=['Optical Coherence Tomography (OCT) Results', 'Fluorescein Angiography Results', 'Diagnosis']),
    categorical_df,
    cnv_dataset[['Diagnosis']]
], axis=1)

# Ensure no duplicate columns exist
final_preprocessed_data = final_preprocessed_data.loc[:, ~final_preprocessed_data.columns.duplicated()]

# Define features and target
features = final_preprocessed_data.drop(columns=['Diagnosis'])
target = final_preprocessed_data['Diagnosis']

# Ensure no NaN values exist before applying SMOTE
features.fillna(features.mode().iloc[0], inplace=True)  # Fill missing feature values
target.fillna(target.mode().iloc[0], inplace=True)  # Fill missing target values

# Define target sample size per class (approx. 5,000 per class)
desired_class_size = 5000

# Adjust the sampling strategy: Only oversample the minority class (CNV Present = 1)
# Automatically determine the correct sampling strategy
current_class_counts = target.value_counts()
minority_class = current_class_counts.idxmin()  # Identify the minority class
majority_class = current_class_counts.idxmax()  # Identify the majority class

# Define the new sample size based on the majority class
desired_samples = current_class_counts[majority_class]  # Make both classes equal

# Apply SMOTE only to the minority class
smote = SMOTE(sampling_strategy={minority_class: desired_samples}, random_state=42)
features_balanced, target_balanced = smote.fit_resample(features, target)

# Convert all columns to integer type
df_balanced = pd.DataFrame(features_balanced, columns=features.columns).astype(int)
df_balanced['Diagnosis'] = target_balanced.astype(int)  # Ensure target column is also integer

# Print class distribution after balancing
print("\nAfter balancing:")
print(df_balanced['Diagnosis'].value_counts())

# Save Preprocessed Data
preprocessed_file_path = '/content/drive/MyDrive/PROJECT 29/FINAL MODEL/Preprocessed_CNV_Detection_Dataset.csv'
df_balanced.to_csv(preprocessed_file_path, index=False)

# Display the head of the preprocessed dataset
print(df_balanced.head())


Before balancing:
Diagnosis
No CNV         4019
CNV Present    3985
Name: count, dtype: int64
Duplicate rows removed: 0
Missing values before handling:
Age                                           0
Optical Coherence Tomography (OCT) Results    0
Fluorescein Angiography Results               0
Visual Acuity Test Results                    0
Smoking Status                                0
Visual Symptoms                               0
Diagnosis                                     0
dtype: int64

After balancing:
Diagnosis
1    5375
0    5375
Name: count, dtype: int64
   Age  Smoking Status  Visual Symptoms_occasional blurred vision  \
0   67               0                                          1   
1   57               1                                          0   
2   69               0                                          0   
3   82               0                                          0   
4   56               0                                          0   

   Visual 