In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp

In [None]:
# Load the raw dataset from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='dataset_insilico.csv')

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head()

In [None]:
# Keep every column from the fourth one onward
# (presumably the first three are identifiers/metadata — confirm against the CSV).
columns_to_keep = df.columns[3:]
# Take an explicit copy so that later column assignments modify an independent
# frame rather than a slice of `df` (avoids pandas' SettingWithCopyWarning).
df_parameters = df[columns_to_keep].copy()

In [None]:
def _percent_to_fraction(column):
    """Convert one column of percentage values into fractions.

    Values are cast to strings, any '%' character is removed, and the result is
    parsed as a number and divided by 100. Entries that cannot be parsed become
    NaN (``errors='coerce'``).
    """
    stripped = column.astype(str).str.replace('%', '', regex=False)
    return pd.to_numeric(stripped, errors='coerce') / 100.0


# Rebuild the numeric frame from the untouched `df` columns instead of mutating
# `df_parameters` in place: the cell is then idempotent (re-running it does not
# divide by 100 a second time) and never assigns into a slice of `df`.
df_parameters = df[columns_to_keep].apply(_percent_to_fraction)

In [None]:
# Preprocessing pipeline: KNN imputation of missing values, then standardization.
# The step names ('imputer', 'scaler') are the prefixes used by the parameter grid.
pipeline = Pipeline(
    steps=[
        ('imputer', KNNImputer()),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Hyperparameter grid for the 'imputer' step of the pipeline.
# KNNImputer only accepts 'nan_euclidean' (or a callable) as its metric because
# the distance has to tolerate missing values; 'manhattan' is rejected at fit
# time, so it must not be part of the grid.
param_grid = {
    'imputer__n_neighbors': [2, 3, 5, 7, 10],
    'imputer__weights': ['uniform', 'distance'],
    'imputer__metric': ['nan_euclidean'],
}

In [None]:
def masked_imputation_score(estimator, X, y=None, mask_fraction=0.1, seed=0):
    """Score a fitted imputation pipeline by how well it recovers hidden values.

    The pipeline has no ``predict`` method and there is no target ``y``, so a
    built-in regression scorer such as 'neg_mean_squared_error' cannot be
    evaluated. Instead, a random subset of the *observed* entries of the
    validation fold is hidden, the fitted 'imputer' step fills them back in, and
    the negative mean squared error against the true values is returned
    (higher is better, as GridSearchCV expects).

    Parameters
    ----------
    estimator : fitted Pipeline containing a step named 'imputer'.
    X : validation fold (DataFrame or 2-D array) that may contain NaN.
    y : ignored; present so the function matches the scorer call signature.
    mask_fraction : approximate share of observed entries to hide.
    seed : seed for the mask, fixed so every candidate sees the same hidden cells.

    Returns
    -------
    float
        Negative MSE over the hidden entries, or NaN if no entry was hidden.

    Raises
    ------
    ValueError
        If the imputer returns a different shape than ``X`` (KNNImputer drops
        columns that were entirely missing in the training fold).
    """
    fold = pd.DataFrame(X)
    truth = fold.to_numpy(dtype=float)

    rng = np.random.default_rng(seed)
    hidden = ~np.isnan(truth) & (rng.random(truth.shape) < mask_fraction)
    if not hidden.any():
        return np.nan  # nothing to evaluate on this fold

    # Hide the selected entries and let only the imputer step reconstruct them,
    # so the comparison happens on the original (unscaled) value range.
    recovered = np.asarray(estimator.named_steps['imputer'].transform(fold.mask(hidden)))
    if recovered.shape != truth.shape:
        raise ValueError(
            "Imputer output shape differs from the input; a column was probably "
            "entirely missing in the training fold."
        )
    return -float(np.mean((recovered[hidden] - truth[hidden]) ** 2))


grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=masked_imputation_score)
grid_search.fit(df_parameters)

In [None]:
# Report the parameter combination that the grid search selected.
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

In [None]:
# Run the refit best pipeline (imputation followed by standardization) over all
# rows. The transform returns a bare array, so restore both the column labels
# and the row index to keep every row aligned with `df_parameters`.
df_filled_scaled = pd.DataFrame(
    grid_search.best_estimator_.transform(df_parameters),
    columns=df_parameters.columns,
    index=df_parameters.index,
)

In [None]:
# Preview only the first rows of the imputed and standardized frame.
df_filled_scaled.head()

In [None]:
def check_missing_values(data):
    """Print the number of missing values per column and fail if any remain.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Raises
    ------
    AssertionError
        If at least one missing value is found in ``data``.
    """
    nan_counts = data.isnull().sum()

    print("Missing Values After Imputation:")
    print(nan_counts)

    total_missing = nan_counts.sum()
    assert total_missing == 0, "There are still missing values in the dataset!"

# Verify that the pipeline output contains no missing values.
check_missing_values(data=df_filled_scaled)

In [None]:

def compare_distributions(original, imputed, column):
    """Overlay the histograms (with KDE curves) of one column before and after imputation.

    Parameters
    ----------
    original : DataFrame with the raw values; missing entries of ``column`` are
        dropped before plotting.
    imputed : DataFrame with the filled-in values.
    column : label of the column to plot; also used in the figure title.
    """
    _, ax = plt.subplots(figsize=(10, 5))

    observed_values = original[column].dropna()
    sns.histplot(observed_values, color='blue', label='Original', kde=True, ax=ax)
    sns.histplot(imputed[column], color='orange', label='Imputed', kde=True, ax=ax)

    ax.set_title(f'Distribution of {column} Before and After Imputation')
    ax.legend()
    plt.show()

# `df_filled_scaled` is the output of a pipeline that ends with a StandardScaler,
# so its columns are standardized while `df_parameters` is on the raw scale.
# Undo the scaling first; otherwise the two histograms differ because of the
# rescaling rather than because of the imputation.
fitted_scaler = grid_search.best_estimator_.named_steps['scaler']
df_filled_unscaled = pd.DataFrame(
    fitted_scaler.inverse_transform(df_filled_scaled.to_numpy()),
    columns=df_filled_scaled.columns,
    index=df_filled_scaled.index,
)

for col in df_parameters.columns:
    compare_distributions(df_parameters, df_filled_unscaled, col)

In [None]:
def check_consistency(original, imputed):
    """Compare per-column mean and variance before and after imputation.

    Parameters
    ----------
    original : pandas.DataFrame
        Raw data; pandas skips missing values in ``mean``/``var`` by default.
    imputed : pandas.DataFrame
        Data after imputation, with the same column labels.

    Returns
    -------
    pandas.DataFrame
        One row per column with 'Mean_Original', 'Mean_Imputed',
        'Variance_Original' and 'Variance_Imputed'. The table is also printed.
        Returning it (like ``ks_test`` does with its result) lets callers
        inspect or store the numbers instead of only reading the printout.
    """
    consistency_df = pd.DataFrame({
        'Mean_Original': original.mean(),
        'Mean_Imputed': imputed.mean(),
        'Variance_Original': original.var(),
        'Variance_Imputed': imputed.var(),
    })

    print("Consistency Check (Mean and Variance):")
    print(consistency_df)
    return consistency_df

# `df_filled_scaled` comes out of a pipeline that ends with a StandardScaler, so
# its mean/variance are ~0/1 by construction. Undo the scaling so the comparison
# with the raw `df_parameters` reflects the imputation only.
fitted_scaler = grid_search.best_estimator_.named_steps['scaler']
df_filled_unscaled = pd.DataFrame(
    fitted_scaler.inverse_transform(df_filled_scaled.to_numpy()),
    columns=df_filled_scaled.columns,
    index=df_filled_scaled.index,
)

# Keep whatever the function returns so the figures can be reused later.
consistency_results = check_consistency(df_parameters, df_filled_unscaled)

In [None]:
def ks_test(original, imputed):
    """Run a two-sample Kolmogorov-Smirnov test per column, before vs. after imputation.

    Parameters
    ----------
    original : DataFrame with the raw values; missing entries are dropped per
        column before testing.
    imputed : DataFrame with the filled-in values and the same column labels.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``original`` with the columns 'KS Statistic' and
        'p-value'. The table is also printed.
    """
    def _compare(name):
        # Test the observed values of one column against its imputed counterpart.
        outcome = ks_2samp(original[name].dropna(), imputed[name])
        return {'KS Statistic': outcome.statistic, 'p-value': outcome.pvalue}

    per_column = {name: _compare(name) for name in original.columns}
    ks_df = pd.DataFrame(per_column).T

    print("Kolmogorov-Smirnov Test Results:")
    print(ks_df)
    return ks_df

# `df_filled_scaled` is standardized (the pipeline ends with a StandardScaler),
# whereas `df_parameters` is on the raw scale. A KS test between the two would
# flag a difference caused by the rescaling alone, so undo the scaling first.
fitted_scaler = grid_search.best_estimator_.named_steps['scaler']
df_filled_unscaled = pd.DataFrame(
    fitted_scaler.inverse_transform(df_filled_scaled.to_numpy()),
    columns=df_filled_scaled.columns,
    index=df_filled_scaled.index,
)

ks_test_results = ks_test(df_parameters, df_filled_unscaled)

In [None]:
def correlation_analysis(original, imputed):
    """Plot the correlation matrices of the original and imputed data side by side.

    Parameters
    ----------
    original : DataFrame with the raw values.
    imputed : DataFrame with the filled-in values.

    Each matrix comes from ``DataFrame.corr()`` and is drawn as an annotated
    heatmap; the original data is on the left, the imputed data on the right.
    """
    _, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (left_ax, original.corr(), 'Original Data Correlation'),
        (right_ax, imputed.corr(), 'Imputed Data Correlation'),
    )
    for ax, corr_matrix, title in panels:
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        ax.set_title(title)

    plt.show()

# Compare the correlation structure of the raw data with that of the pipeline output.
correlation_analysis(original=df_parameters, imputed=df_filled_scaled)

In [None]:
# Persist the imputed and standardized frame, leaving the row index out of the file.
df_filled_scaled.to_csv(path_or_buf="filled_with_knn.csv", index=False)