In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

In [2]:
secom = pd.read_csv('secom/secom.data', sep=' ', header=None)
secom_labels = pd.read_csv('secom/secom_labels.data', sep=' ', header=None)[0]

In [3]:
# check for missing values
secom.isnull().sum(axis=0)

0       6
1       7
2      14
3      14
4      14
       ..
585     1
586     1
587     1
588     1
589     1
Length: 590, dtype: int64

In [4]:
# fill missing values with median
secom = secom.fillna(secom.median())

In [5]:
# remove columns with constant values
secom = secom.loc[:, (secom != secom.iloc[0]).any()]

In [6]:
def remove_highly_correlated_features(df, threshold=0.7):
    corr_matrix = df.corr().abs()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]
    
    print(f"Removing {len(to_drop)} highly correlated features.")
    return df.drop(columns=to_drop)

In [7]:
# remove very highly correlated features before calculating VIF
secom = remove_highly_correlated_features(secom)

Removing 260 highly correlated features.


In [8]:
def vif(X):
    X = np.asarray(X)
    vif_scores = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
    return np.array(vif_scores)

def remove_high_vif_features(df, threshold=10):
    df = df.copy()
    while True:
        vif_scores = vif(df)
        max_vif = np.max(vif_scores)

        if max_vif < threshold:
            break
        
        feature_to_remove = df.columns[np.argmax(vif_scores)]
        print(f"Dropping {feature_to_remove} with VIF: {max_vif}")

        df.drop(columns=[feature_to_remove], inplace=True)
    
    return df

In [9]:
# remove features with VIF > 10
secom = remove_high_vif_features(secom)

Dropping 12 with VIF: 6172021.957747664
Dropping 131 with VIF: 183020.05062763335
Dropping 56 with VIF: 149341.35134765547
Dropping 121 with VIF: 124419.0911097322
Dropping 111 with VIF: 83694.35906938057
Dropping 37 with VIF: 74945.33353973534
Dropping 57 with VIF: 62222.93504736552
Dropping 38 with VIF: 55952.023586762014
Dropping 109 with VIF: 43121.62562668744
Dropping 55 with VIF: 40114.109303712976
Dropping 133 with VIF: 27756.098290591188
Dropping 582 with VIF: 25649.768544194
Dropping 45 with VIF: 16971.487220808107
Dropping 116 with VIF: 16729.78688091883
Dropping 119 with VIF: 14623.922975324243
Dropping 85 with VIF: 13670.550440455583
Dropping 44 with VIF: 12177.252043300092
Dropping 87 with VIF: 11999.640413912539
Dropping 11 with VIF: 10860.053039735196
Dropping 110 with VIF: 10691.309908981679
Dropping 20 with VIF: 10349.415146394163
Dropping 53 with VIF: 10165.587389906068
Dropping 547 with VIF: 9132.898006695152
Dropping 61 with VIF: 8848.823706071486
Dropping 2 with VI

In [10]:
# check number of features left
print(f"Number of features left: {secom.shape[1]}")

Number of features left: 100


In [11]:
secom_cleaned = secom.copy()

In [12]:
# create dummy variables
n_obs, n_features = secom_cleaned.shape
min_features = n_obs // 2

i = 0
new_columns = {}
while secom_cleaned.shape[1] + len(new_columns) < min_features:
    col_to_copy = np.random.choice(secom_cleaned.columns)
    new_columns[f"{col_to_copy}_perm_{i}"] = np.random.permutation(secom_cleaned[col_to_copy].values)
    i += 1

df = pd.concat([secom_cleaned, pd.DataFrame(new_columns)], axis=1)

df.shape


(1567, 783)

In [13]:
secom_labels.rename('label', inplace=True)
secom_labels = secom_labels.map({1: 1, -1: 0})
secom_labels

0       0
1       0
2       1
3       0
4       0
       ..
1562    0
1563    0
1564    0
1565    0
1566    0
Name: label, Length: 1567, dtype: int64

In [14]:
df.to_csv('data/preprocessed_secom.csv', index=False)
secom_labels.to_csv('data/secom_labels.csv', index=False)