In [1]:

# Preparing Data
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error,mean_squared_log_error, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, log_loss
# Load the rock mines dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(url)
column_names = ["sensor_" + str(i) for i in range(1, 61)] + ["target"]
df.columns = column_names
df = df.reset_index()
df = df.rename(columns={"index": "id"})
df['target'] = df['target'].map({'M': 1, 'R': 0})
x_data = df.iloc[:, :5]
x_data = x_data.drop('id', axis=1)
y_data = df['target']
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import*
# KFold with random_state for reproducibility
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
##### Get Base Scoring
# Cross Validation
accuracies = []
for train_index, test_index in kf.split(x_data):
    x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print('Baseline')
print(f'Mean accuracy: {mean_accuracy:.4f}, Standard deviation of accuracy: {std_accuracy:.4f}')


Baseline
Mean accuracy: 0.5990, Standard deviation of accuracy: 0.0932


In [5]:

from itertools import combinations
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
def determine_optimal_clusters(data, max_clusters=6):
    inertia = []
    #silhouette_avg = []
    
    for n in range(2, max_clusters):
        kmeans = KMeans(n_clusters=n, n_init=10, random_state=42)
        kmeans.fit(data)
        inertia.append(kmeans.inertia_)
    
    # Elbow Method: Find the "elbow" point
    optimal_clusters = np.diff(inertia, 2).argmin() + 2
    
    # Silhouette Analysis: Find the maximum silhouette score
    #optimal_clusters_silhouette = np.argmax(silhouette_avg) + 2
    # Combine both methods to determine the optimal number of clusters
    #optimal_clusters = max(elbow_point, optimal_clusters_silhouette)
    return optimal_clusters
from itertools import combinations
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
# Generate combinations of columns with 1 to 3 columns
columns = x_data.columns
combs = [list(combinations(columns, i)) for i in range(1, 4)]
combs = [item for sublist in combs for item in sublist]
# Initialize DataFrames to store cluster labels
data_cluster = pd.DataFrame()
# Set the number of clusters and K-Fold parameters
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Process each combination of columns
for idx, comb in enumerate(combs, start=1):
    print(f"Processing combination {idx}/{len(combs)}", end="\r")
    
    # Initialize lists to store fold predictions
    #fold_predictions_train = []
    fold_predictions_test = []
    
    for train_index, test_index in kf.split(x_data):
        x_train_split = x_data.iloc[train_index].reset_index(drop=False)
        x_test_split = x_data.iloc[test_index].reset_index(drop=False)
        # Set the original index back
        x_train_split.set_index('index', inplace=True)
        x_test_split.set_index('index', inplace=True)   
        #x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
        x_train_subset = x_train_split[list(comb)]
        x_test_subset = x_test_split[list(comb)]
        
        # Determine the optimal number of clusters
        optimal_clusters = determine_optimal_clusters(x_train_subset)
        
        # Initialize and fit the K-Means model
        kmeans = KMeans(n_clusters=optimal_clusters, n_init=10)
        kmeans.fit(x_train_subset)
        
        # Store the cluster labels
        fold_predictions_test.append(pd.Series(kmeans.predict(x_test_subset), index=x_test_split.index))
    
    # Concatenate fold predictions
    feature_name = "_".join(comb) + "_cluster"
    #data_cluster_train[feature_name] = pd.concat(fold_predictions_train).sort_index()
    data_cluster[feature_name] = pd.concat(fold_predictions_test)
## Reevaluate with Cluster Data only
accuracies = []
for train_index, test_index in kf.split(data_cluster):
    x_train, x_test = data_cluster.iloc[train_index], data_cluster.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print('')
print('Cluster Data')
print(f'Mean accuracy: {mean_accuracy:.4f}, Standard deviation of accuracy: {std_accuracy:.4f}')
## Reevaluate with Cluster Data and Original Data
x_data_enc_cluster = pd.merge(data_cluster, x_data, left_index=True, right_index=True)
accuracies = []
for train_index, test_index in kf.split(x_data_enc_cluster):
    x_train, x_test = x_data_enc_cluster.iloc[train_index], x_data_enc_cluster.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print('Original Data + Cluster Data')
print(f'Mean accuracy: {mean_accuracy:.4f}, Standard deviation of accuracy: {std_accuracy:.4f}')


Processing combination 14/14
Cluster Data
Mean accuracy: 0.8836, Standard deviation of accuracy: 0.0660
Original Data + Cluster Data
Mean accuracy: 0.9121, Standard deviation of accuracy: 0.0620
