In [None]:
from pathlib import Path

import numpy as np
import openml
import pandas as pd

In [None]:
# Download OpenML dataset 4134 and split it into the feature table and the
# target column; the two remaining return values of get_data() are discarded.
dataset = openml.datasets.get_dataset("4134")
target_name = dataset.default_target_attribute
bioresponse, bioresponse_labels, _, _ = dataset.get_data(target=target_name)


In [None]:
# Distinct values that occur in the target column.
np.unique(bioresponse_labels)

In [None]:
# Total number of missing cells across the whole feature table (expected: 0).
bioresponse.isna().sum().sum()

In [None]:
# (rows, columns) of the feature table at this point in the notebook.
bioresponse.shape

In [None]:
# Drop constant columns: a column is kept only if at least one of its values
# differs from the value it has in the first row.
first_row = bioresponse.iloc[0]
varies = bioresponse.ne(first_row).any(axis=0)
bioresponse = bioresponse.loc[:, varies]
bioresponse.shape

In [None]:
# Global minimum and maximum, then the ten largest per-column maxima (ascending).
column_max = bioresponse.max()
print(bioresponse.min().min(), column_max.max())
np.sort(column_max)[-10:]

In [None]:
from matplotlib import pyplot as plt

# Box plots of the first five feature columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
feature_names = [f"Feature {i}" for i in range(1, 6)]
ax.boxplot(bioresponse.iloc[:, :5], labels=feature_names)
ax.set_title("Boxplot of First 5 Scaled Features")
plt.show()

# Label Distribution: number of samples per class.
train_label_counts = bioresponse_labels.value_counts()
print("\nTraining Label Distribution:\n", train_label_counts)

In [None]:
def remove_highly_correlated_features(df, threshold=0.7):
    """Drop every column that is strongly correlated with an earlier column.

    Absolute pairwise correlations are computed with ``df.corr()``. Only the
    part of the matrix strictly above the diagonal is inspected, so each pair
    is considered once and the column that comes later in ``df`` is the one
    that gets removed.

    Args:
        df: DataFrame whose columns are the candidate features.
        threshold: A column is dropped when its absolute correlation with any
            preceding column is strictly greater than this value.

    Returns:
        A new DataFrame without the dropped columns; ``df`` itself is left
        unchanged. The number of dropped columns is printed as a side effect.
    """
    abs_corr = df.corr().abs()

    # True strictly above the diagonal, False on and below it.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)

    # Masked cells are NaN and therefore never compare greater than the threshold.
    exceeds = (pairwise > threshold).any(axis=0)
    to_drop = [name for name, flagged in exceeds.items() if flagged]

    print(f"Removing {len(to_drop)} highly correlated features.")
    return df.drop(columns=to_drop)

In [None]:
# Drop features whose absolute pairwise correlation with an earlier column
# exceeds the function's default threshold (0.7). Note: this is a plain
# correlation filter, not a VIF computation.
bioresponse = remove_highly_correlated_features(bioresponse)

In [None]:
# Report how many feature columns remain after the correlation filter.
n_remaining = bioresponse.shape[1]
print(f"Number of features left: {n_remaining}")

In [None]:
# Keep the filtered feature table under its own name (a copy, not a reference).
bioresponse_cleaned = bioresponse.copy()

In [None]:
# Pad the feature table with permuted "probe" columns until it has at least
# n_obs // 2 columns. Each probe is a copy of a randomly chosen real column
# with its rows shuffled: the values are the same, but their row order no
# longer matches the rest of the table. (These are not dummy/one-hot variables.)
n_obs, n_features = bioresponse_cleaned.shape
min_features = n_obs // 2

# Fixed seed so that Restart & Run All regenerates exactly the same probes.
rng = np.random.default_rng(42)

new_columns = {}
for i in range(max(0, min_features - n_features)):
    col_to_copy = rng.choice(bioresponse_cleaned.columns)
    # The running counter keeps names unique when a column is picked twice.
    new_columns[f"{col_to_copy}_perm_{i}"] = rng.permutation(bioresponse_cleaned[col_to_copy].to_numpy())

# Build the probes on the same index as the real features: with a default
# RangeIndex the column-wise concat would misalign rows (NaNs / extra rows)
# whenever bioresponse_cleaned does not carry a plain 0..n-1 index.
probe_features = pd.DataFrame(new_columns, index=bioresponse_cleaned.index)
df = pd.concat([bioresponse_cleaned, probe_features], axis=1)

df.shape


In [None]:
# Recode the class labels from {'0', '1'} to {-1, +1} and name the series 'label'.
# Values are compared as strings and '-1' is accepted as well, so re-running
# this cell on already recoded labels is a no-op instead of silently turning
# every label into NaN.
label_codes = {'1': 1, '0': -1, '-1': -1}
recoded_labels = bioresponse_labels.astype(str).map(label_codes)

# Fail loudly on anything outside the expected label set rather than writing
# NaN labels to disk later on.
if recoded_labels.isna().any():
    unexpected = sorted(set(bioresponse_labels.astype(str)) - set(label_codes))
    raise ValueError(f"Unexpected label values: {unexpected}")

bioresponse_labels = recoded_labels.astype(int).rename('label')

In [None]:
# Write the padded feature table and the recoded labels as two CSV files.
# Rows are matched by position only, since neither file stores an index.
# The output directory is created first: to_csv() does not create missing
# directories, so a fresh checkout without data/ would otherwise fail here.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / 'preprocessed_bioresponse.csv', index=False)
bioresponse_labels.to_csv(output_dir / 'bioresponse_labels.csv', index=False)