In [1]:
import pandas as pd
import openpyxl
import sys
sys.path.append("../../src/")
from data_preprocessing.isolation_forest import remove_outliers
from data_preprocessing.train_test_splitter import split_data
from data_preprocessing.z_score_normalization import normalize_data



In [2]:
# Read the raw dataset from the Excel workbook into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
raw_data_path = '../../data/raw/raw_data.xlsx'
df = pd.read_excel(raw_data_path)


In [3]:

# Preview the first five rows to sanity-check the loaded columns and values
df.head(n=5)


Unnamed: 0,case,AGE,BMI,density
0,1,38,21.8,40
1,0,43,32.3,5
2,0,46,23.0,45
3,0,52,19.6,40
4,0,59,26.2,40


In [4]:

# Names of the feature columns and of the target (label) column in the raw dataframe
X_columns = ['AGE', 'BMI', 'density']
y_column = 'case'

# Separate the features X and the target variable y from the dataframe
X = df[X_columns]
y = df[y_column]
# NOTE(review): in the cells visible in this notebook only X is passed on to
# split_data; y is not referenced again -- confirm the labels are handled elsewhere.


In [5]:

# Partition the feature dataframe into train, cross-validation and test subsets
train_data, cv_data, test_data = split_data(X)

# Report how many examples landed in each subset (label on one line, count on the next)
print("Train data number of examples:", len(train_data), sep="\n")
print("CV data number of examples:", len(cv_data), sep="\n")
print("Test data number of examples:", len(test_data), sep="\n")


Train data number of examples:
639
CV data number of examples:
213
Test data number of examples:
213


In [6]:

# Run the project's Isolation Forest helper on each subset independently.
# Each call returns the filtered subset plus the rows it removed.
# NOTE(review): the subsets are reassigned in place, so re-running this cell
# applies remove_outliers again to the already-filtered data.
train_data, removed_data1 = remove_outliers(train_data)
cv_data, removed_data2 = remove_outliers(cv_data)
test_data, removed_data3 = remove_outliers(test_data)

# Stack the removed rows from all three subsets into a single dataframe
removed_data = pd.concat((removed_data1, removed_data2, removed_data3), axis=0)


In [8]:

# Report how many rows were removed as outliers from each subset
# (label on one line, count on the next)
print("Train data number of outliers:", len(removed_data1), sep="\n")
print("CV data number of outliers:", len(removed_data2), sep="\n")
print("Test data number of outliers:", len(removed_data3), sep="\n")


Train data number of outliers:
32
CV data number of outliers:
11
Test data number of outliers:
11


In [None]:

# Scale the features of every subset with the project's Z-score normalization helper.
# NOTE(review): each subset is passed to normalize_data on its own; if the helper
# derives mean/std from its argument, the CV and test sets are scaled with their
# own statistics rather than the training set's -- confirm this is intended.
train_data, cv_data, test_data = (
    normalize_data(train_data),
    normalize_data(cv_data),
    normalize_data(test_data),
)


In [None]:

# Write each preprocessed subset, and the combined removed outliers, to its own
# Excel file in the current working directory (the row index is not written).
for filename, frame in (
    ('train_data.xlsx', train_data),
    ('cv_data.xlsx', cv_data),
    ('test_data.xlsx', test_data),
    ('removed_data.xlsx', removed_data),
):
    frame.to_excel(filename, index=False)
