In [None]:
from pathlib import Path

import numpy as np
import openml
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Fetch OpenML dataset "41026" and split it into features and labels,
# using the dataset's declared default target attribute as the label column.
# The two trailing return values of get_data() are not needed here.
dataset = openml.datasets.get_dataset("41026")
target_column = dataset.default_target_attribute
gisette, gisette_labels, _, _ = dataset.get_data(target=target_column)


In [None]:
# Inspect which distinct values occur in the labels
np.unique(gisette_labels)

In [None]:
# Total number of missing entries over all feature columns (0 means none are missing)
gisette.isnull().sum().sum()

In [None]:
# (rows, columns) of the feature matrix before any column is removed
gisette.shape

In [None]:
# Drop constant features: a column is kept only if at least one of its rows
# differs from the value it holds in the first row.
reference_row = gisette.iloc[0]
is_non_constant = (gisette != reference_row).any()
gisette = gisette.loc[:, is_non_constant]
gisette.shape

In [None]:
# Global value range of the features, followed by the ten largest per-column maxima
column_minima = gisette.min()
column_maxima = gisette.max()
print(column_minima.min(), column_maxima.max())
np.sort(column_maxima)[-10:]

In [None]:
from matplotlib import pyplot as plt

# Box plots of the first five columns of the feature matrix.
# NOTE(review): the title says "Scaled", but no scaling step is visible before
# this cell — confirm the wording.
fig, ax = plt.subplots(figsize=(10, 6))
first_five = gisette.iloc[:, :5]
tick_names = [f"Feature {idx}" for idx in range(1, 6)]
ax.boxplot(first_five, labels=tick_names)
ax.set_title("Boxplot of First 5 Scaled Features")
plt.show()

# Label Distribution: number of observations per label value
train_label_counts = gisette_labels.value_counts()
print("\nTraining Label Distribution:\n", train_label_counts)

In [None]:
def remove_highly_correlated_features(df, threshold=0.7):
    """Return ``df`` without the columns that are highly correlated with an earlier column.

    The absolute correlation matrix of ``df`` is computed and only its strict
    upper triangle is inspected, so every pair of columns is looked at once and
    a column is only ever compared against the columns that precede it.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it is not modified.
    threshold : float, default 0.7
        A column is dropped when its absolute correlation with any preceding
        column is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the flagged columns removed.

    Notes
    -----
    Prints how many columns are being removed.
    """
    abs_corr = df.corr().abs()

    # True strictly above the diagonal, False on and below it
    strict_upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(strict_upper_mask)

    redundant = []
    for name in pairwise.columns:
        # Masked-out entries are NaN and never compare greater than the threshold
        if any(pairwise[name] > threshold):
            redundant.append(name)

    print(f"Removing {len(redundant)} highly correlated features.")
    return df.drop(columns=redundant)

In [None]:
# Drop features whose absolute correlation with an earlier column exceeds the
# function's default threshold of 0.7. This is a pairwise-correlation filter;
# no VIF is computed here.
gisette = remove_highly_correlated_features(gisette)

In [None]:
# Report how many feature columns remain after the filtering steps
print(f"Number of features left: {len(gisette.columns)}")

In [None]:
# Keep a copy of the filtered feature matrix under its own name for the steps below
gisette_cleaned = gisette.copy()

In [None]:
# Pad the feature matrix with dummy features: each one is a randomly chosen
# existing column whose values are randomly permuted across the rows.
# Columns are added until the frame has at least n_obs // 2 columns in total.
n_obs, n_features = gisette_cleaned.shape
min_features = n_obs // 2

# Seeded generator so that Restart & Run All reproduces the same dummy columns
rng = np.random.default_rng(42)

i = 0
new_columns = {}
while n_features + len(new_columns) < min_features:
    col_to_copy = rng.choice(gisette_cleaned.columns)
    # The running counter keeps names unique when the same column is drawn twice
    new_columns[f"{col_to_copy}_perm_{i}"] = rng.permutation(gisette_cleaned[col_to_copy].values)
    i += 1

# Build the dummy frame on the index of the real features: concat(axis=1)
# aligns on the index, so a fresh 0..n-1 index could misalign rows and
# introduce NaNs if gisette_cleaned is indexed differently.
dummy_features = pd.DataFrame(new_columns, index=gisette_cleaned.index)
df = pd.concat([gisette_cleaned, dummy_features], axis=1)

df.shape


In [None]:
# Encode the labels as -1/+1 in a dense Series named 'label'.

# Convert to dense format (regular pandas Series). The `.sparse` accessor
# raises AttributeError on non-sparse data, so it is only used when the labels
# are actually sparse.
if isinstance(gisette_labels.dtype, pd.SparseDtype):
    gisette_labels_dense = gisette_labels.sparse.to_dense()
else:
    gisette_labels_dense = gisette_labels

# Perform the mapping. -1 maps to itself so that re-running this cell on labels
# that were already encoded leaves them unchanged.
label_mapping = {0: -1, 1: 1, -1: -1}
gisette_labels_dense = gisette_labels_dense.map(label_mapping)

# Series.map yields NaN for values missing from the mapping; fail loudly
# instead of silently exporting NaN labels.
if gisette_labels_dense.isna().any():
    raise ValueError("gisette_labels contains missing values or values other than 0, 1 and -1")

# Name the Series 'label' (this becomes the column header when written to CSV)
gisette_labels_dense = gisette_labels_dense.rename('label')

# Replace the original variable with the encoded labels
gisette_labels = gisette_labels_dense


In [None]:
# Write the feature matrix and the labels to two CSV files under data/.
# Both files are written without the index, so rows are matched by position
# only; make sure the two objects have the same number of rows.
assert len(df) == len(gisette_labels), "features and labels have different numbers of rows"

# to_csv does not create missing parent directories
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / 'preprocessed_gisette.csv', index=False)
gisette_labels.to_csv(output_dir / 'gisette_labels.csv', index=False)