## MODELING (KNN IMPUTATION VERSION)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import flaml
import sys
import os

from sklearn.model_selection import train_test_split

# Add the parent directory to the system path to import the custom module
sys.path.append(os.path.abspath(os.path.join('..')))

from src.utils.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [None]:
# Load the KNN-imputed dataset (read once; the frame is reused by every step below).
# NOTE(review): this is an absolute, machine-specific path -- consider resolving it
# relative to the project root so the notebook runs on other machines.
file_path = r'C:\unibo-dtm-ml-2526-cervical-cancer-predictor\data\data_after_imputation\knn_imputed.csv'
df = pd.read_csv(file_path)

# Repeat the data profiling pipeline for the newly cleaned data.
print("\nDataset Info: \n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Check whether everything went smoothly at the data cleaning stage:
# every per-column count of missing values is expected to be zero after imputation.
print("\nMissing Values: \n")
print(df.isnull().sum())

print("\nDescriptive Statistics:")
print(df.describe(include='all'))


In [8]:
# Separate the target columns from the feature matrix.
targets = ['Biopsy', 'Hinselmann', 'Schiller', 'Citology']
y = df[targets]
X = df.drop(columns=targets)

# Use MultilabelStratifiedShuffleSplit (msss) to split the data into train and
# test sets so that the distribution of the target variables is preserved in
# both sets. This matters in multilabel classification, where each instance can
# belong to several classes at once, and it is needed for such an imbalanced
# dataset, where some classes may be underrepresented.
msss = MultilabelStratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# The splitter is configured for five shuffles, but a single train/test partition
# is needed. Keep the LAST generated split explicitly: this is the partition the
# previous overwrite-in-a-loop code ended up with, so results stay reproducible.
# NOTE(review): if one split is all that is ever needed, n_splits=1 would avoid
# generating the four discarded partitions (at the cost of changing the partition).
*_, (train_index, test_index) = msss.split(X, y)

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

# Count the rows in which at least one of the target columns is positive.
train_positive_rows = len(y_train[y_train.sum(axis=1) > 0])
test_positive_rows = len(y_test[y_test.sum(axis=1) > 0])

print(f"Training samples: {X_train.shape[0]} ({train_positive_rows} with at least one positive target)")
print(f"Testing samples:  {X_test.shape[0]} ({test_positive_rows} with at least one positive target)")

X_train: (668, 19)
X_test: (167, 19)
y_train: (668, 4)
y_test: (167, 4)
Training samples: 668 (80 with at least one positive target)
Testing samples:  167 (21 with at least one positive target)
