In [2]:
"""
Notebook for preprocessing the data
"""

'\nNotebook for preprocessing the data\n'

In [3]:
import pandas as pd
import openpyxl
import sys
sys.path.append('../../')
from src.data_preprocessing.isolation_forest import remove_outliers
from src.data_preprocessing.train_test_splitter import split_data
from src.data_preprocessing.z_score_normalization import normalize_data

In [4]:
# Read the raw dataset from the Excel workbook into a dataframe
raw_data_path = '../../data/raw/raw_data.xlsx'
df = pd.read_excel(raw_data_path)

In [5]:

# Sanity check: show the top five rows of the freshly loaded dataframe
df.head(n=5)

Unnamed: 0,case,AGE,BMI,density
0,1,38,21.8,40
1,0,43,32.3,5
2,0,46,23.0,45
3,0,52,19.6,40
4,0,59,26.2,40


In [6]:
# Column names: one target column and the list of feature columns
X_columns = ['AGE', 'BMI', 'density']  # feature column names
y_column = 'case'  # target column name

# Separate the features X and the target variable y from the dataframe
X = df.loc[:, X_columns]
y = df.loc[:, y_column]

In [7]:
# Join the target and the features side by side into one dataframe
# (target column first, followed by the feature columns)
data = pd.concat(objs=[y, X], axis="columns")

In [8]:

# split_data hands back three partitions: train, cross-validation and test
train_data, cv_data, test_data = split_data(data)

# Report the number of examples in every partition as a check on the split
for heading, partition in (
    ("Train data number of examples:", train_data),
    ("CV data number of examples:", cv_data),
    ("Test data number of examples:", test_data),
):
    print(heading)
    print(len(partition))

# Show the first rows of the training partition for a visual check
train_data.head()

Train data number of examples:
639
CV data number of examples:
213
Test data number of examples:
213


Unnamed: 0,case,AGE,BMI,density
956,0,45,19.1,90
213,0,48,33.3,75
843,1,60,25.7,45
550,0,45,31.2,70
624,0,54,30.4,30


In [9]:
# Run remove_outliers (imported from the isolation_forest module) on each
# partition in turn. Every call returns the filtered partition together with
# the rows that were dropped from it.
# NOTE(review): this cell rebinds train_data / cv_data / test_data, so running
# it a second time filters the already-filtered partitions again - re-run from
# the split cell instead.
(train_data, removed_data1), (cv_data, removed_data2), (test_data, removed_data3) = [
    remove_outliers(partition) for partition in (train_data, cv_data, test_data)
]

# Gather the dropped rows of all three partitions in one dataframe
removed_data = pd.concat(objs=[removed_data1, removed_data2, removed_data3], axis=0)

In [10]:
# Report how many rows were removed as outliers from each partition
for heading, dropped_rows in (
    ("Train data number of outliers:", removed_data1),
    ("CV data number of outliers:", removed_data2),
    ("Test data number of outliers:", removed_data3),
):
    print(heading)
    print(len(dropped_rows))

Train data number of outliers:
32
CV data number of outliers:
11
Test data number of outliers:
11


In [11]:
# Scale the features of every partition with normalize_data (imported from the
# z_score_normalization module), one call per partition, in train/cv/test order.
# NOTE(review): each partition is passed to normalize_data on its own. If the
# helper derives mean/std from the frame it receives, cv and test end up scaled
# with their own statistics rather than the training ones - TODO confirm
# against the helper's implementation.
train_data, cv_data, test_data = [
    normalize_data(partition) for partition in (train_data, cv_data, test_data)
]

# Show the first rows of the normalized training partition
train_data.head()

Unnamed: 0,case,AGE,BMI,density
956,0,-0.842398,-1.710404,1.503131
213,0,-0.316333,1.551384,1.005639
843,1,1.78793,-0.194361,0.010655
550,0,-0.842398,1.069007,0.839808
624,0,0.735799,0.885245,-0.486838


In [16]:
# Write each preprocessed partition, plus the rows removed as outliers, to its
# own Excel workbook in the current working directory (row index not written)
excel_outputs = {
    'train_data.xlsx': train_data,
    'cv_data.xlsx': cv_data,
    'test_data.xlsx': test_data,
    'removed_data.xlsx': removed_data,
}
for workbook_name, frame in excel_outputs.items():
    frame.to_excel(workbook_name, index=False)