In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the raw sonar table (the file has no header row), then split it:
# the last column becomes the label series, the remaining columns the features.
sonar = pd.read_csv('connectionist+bench+sonar+mines+vs+rocks/sonar.all-data', header=None)
label_col = sonar.columns[-1]
sonar_labels = sonar[label_col]
sonar = sonar.drop(columns=label_col)

In [3]:
# total number of missing cells across the whole frame (0 means none)
sonar.isna().sum().sum()

np.int64(0)

In [4]:
# (rows, columns) of the feature frame after the label column was split off
sonar.shape

(208, 60)

In [5]:
# Keep only columns that vary: a column is constant when every row equals
# the value in the first row.
first_row = sonar.iloc[0]
column_varies = sonar.ne(first_row).any(axis=0)
sonar = sonar.loc[:, column_varies]
sonar.shape

(208, 60)

In [6]:
def remove_highly_correlated_features(df, threshold=0.7):
    """Drop every column that is strongly correlated with an earlier column.

    Args:
        df: DataFrame of features; correlations come from ``df.corr()``.
        threshold: Absolute correlation above which the later column of a
            pair is dropped.

    Returns:
        A new DataFrame without the dropped columns. The number of dropped
        columns is printed.
    """
    abs_corr = df.corr().abs()

    # Keep only the entries strictly above the diagonal so each pair is
    # inspected once and a column is only compared with columns before it.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    to_drop = []
    for name in pairwise.columns:
        if any(pairwise[name] > threshold):
            to_drop.append(name)

    print(f"Removing {len(to_drop)} highly correlated features.")
    return df.drop(columns=to_drop)

In [7]:
def vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Args:
        X: 2-D array-like (e.g. a DataFrame) with one column per feature.

    Returns:
        1-D numpy array with one VIF per column, in column order.

    NOTE(review): ``X`` is handed to statsmodels as-is; no constant column is
    added here. Confirm that is intended, since VIFs are usually computed on a
    design matrix that includes an intercept.
    """
    design = np.asarray(X)
    scores = []
    for col_idx in range(design.shape[1]):
        scores.append(variance_inflation_factor(design, col_idx))
    return np.array(scores)

def remove_high_vif_features(df, threshold=10):
    """Iteratively drop the feature with the largest VIF.

    On each pass the VIF of every remaining column is recomputed, and the
    column with the largest value is removed if that value is at or above
    ``threshold``. The loop stops once every VIF is below ``threshold`` or
    fewer than two columns remain.

    Args:
        df: DataFrame of numeric features. It is not modified.
        threshold: Columns are dropped while the largest VIF is >= this value.

    Returns:
        A new DataFrame containing only the retained columns. Each removal is
        printed together with the VIF that triggered it.
    """
    df = df.copy()

    # A VIF needs at least one other column to regress on, and taking the max
    # of an empty score array raises, so stop below two columns.
    while df.shape[1] > 1:
        vif_scores = vif(df)
        worst_idx = int(np.argmax(vif_scores))
        max_vif = vif_scores[worst_idx]

        if max_vif < threshold:
            break

        feature_to_remove = df.columns[worst_idx]
        print(f"Dropping {feature_to_remove} with VIF: {max_vif}")

        # Rebind instead of mutating in place; df is already a private copy.
        df = df.drop(columns=[feature_to_remove])

    return df

In [8]:
# Drop the highest-VIF feature repeatedly until every remaining VIF is below
# the function's default threshold of 10 (a VIF of exactly 10 is still dropped).
sonar = remove_high_vif_features(sonar)

Dropping 25 with VIF: 285.09204734556056
Dropping 22 with VIF: 226.25073394063602
Dropping 19 with VIF: 167.2967434849508
Dropping 27 with VIF: 160.15866373955336
Dropping 17 with VIF: 139.67408046678466
Dropping 29 with VIF: 123.80966195646305
Dropping 15 with VIF: 90.06272812594621
Dropping 35 with VIF: 76.74327427310514
Dropping 32 with VIF: 70.93046710005616
Dropping 10 with VIF: 66.93872990891367
Dropping 23 with VIF: 63.42262208294637
Dropping 20 with VIF: 51.884919220787594
Dropping 37 with VIF: 51.64830191501168
Dropping 12 with VIF: 50.363782177181946
Dropping 45 with VIF: 47.76121603042287
Dropping 42 with VIF: 43.14993362346412
Dropping 47 with VIF: 40.25778316561585
Dropping 30 with VIF: 34.1205798316173
Dropping 39 with VIF: 33.70334486024496
Dropping 13 with VIF: 29.81693051480178
Dropping 8 with VIF: 28.15252938912821
Dropping 33 with VIF: 27.17376200525844
Dropping 26 with VIF: 23.564509010376742
Dropping 18 with VIF: 19.95250766790957
Dropping 40 with VIF: 18.953960329

In [9]:
# report how many feature columns remain in `sonar`
n_remaining = len(sonar.columns)
print(f"Number of features left: {n_remaining}")

Number of features left: 28


In [10]:
# independent copy of the filtered features; later cells build on this frame
sonar_cleaned = sonar.copy()

In [11]:
# Pad the feature matrix with noise features until it has n_obs // 2 columns.
# Each noise feature is a random permutation of a randomly chosen real column:
# it keeps that column's values but shuffles which row each value belongs to.
PERMUTATION_SEED = 42  # fixed seed so Restart & Run All yields the same padded dataset
rng = np.random.default_rng(PERMUTATION_SEED)

n_obs, n_features = sonar_cleaned.shape
min_features = n_obs // 2
n_missing = max(min_features - n_features, 0)

new_columns = {}
for i in range(n_missing):
    col_to_copy = rng.choice(sonar_cleaned.columns)
    # the running index keeps names unique when the same source column is picked twice
    new_columns[f"{col_to_copy}_perm_{i}"] = rng.permutation(sonar_cleaned[col_to_copy].to_numpy())

# Build the noise frame on the same index so concat places the columns side
# by side even if sonar_cleaned does not carry a default RangeIndex.
permuted = pd.DataFrame(new_columns, index=sonar_cleaned.index)
df = pd.concat([sonar_cleaned, permuted], axis=1)

df.shape


(208, 104)

In [12]:
# Encode the class labels numerically: 'M' -> 1, 'R' -> -1.
label_codes = {'M': 1, 'R': -1}
encoded_labels = sonar_labels.map(label_codes)

# Series.map turns any value missing from the dict into NaN. That also happens
# when this cell is re-run on already-encoded labels, so fail loudly instead
# of letting NaN labels reach the saved CSV.
if encoded_labels.isna().any():
    raise ValueError(
        "sonar_labels contains values other than 'M'/'R' "
        "(was this cell already run?); reload the data before encoding."
    )

sonar_labels = encoded_labels.rename('label')
sonar_labels

0     -1
1     -1
2     -1
3     -1
4     -1
      ..
203    1
204    1
205    1
206    1
207    1
Name: label, Length: 208, dtype: int64

In [13]:
# Write features and labels to separate CSV files without the index column;
# rows in the two files line up by position.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories
df.to_csv(output_dir / 'preprocessed_sonar.csv', index=False)
sonar_labels.to_csv(output_dir / 'sonar_labels.csv', index=False)