In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
from pathlib import Path
import openpyxl
import itertools
import torch

# tools
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances, make_scorer, precision_score, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, friedmanchisquare, wilcoxon
from scipy.spatial import distance
from joblib import dump, load

# models
from xgboost import XGBClassifier, XGBRFClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from ctgan import CTGAN

### Read Datasets

In [None]:
# Load the original training data, the four oversampled variants and the
# held-out test features.
# NOTE: DATA_ROOT is machine-specific; change it here to relocate the project.
DATA_ROOT = Path("D:/ml/undersampling_data/data/unsw")
ENCODED_DIR = DATA_ROOT / "encoded_normalized"

original_df = pd.read_csv(ENCODED_DIR / "original_data_normalized.csv")
mix_df = pd.read_csv(ENCODED_DIR / "mix_data_normalized.csv")
smote_df = pd.read_csv(ENCODED_DIR / "smote_data_normalized.csv")
gan_df = pd.read_csv(ENCODED_DIR / "gan_data_normalized.csv")
borderline_df = pd.read_csv(ENCODED_DIR / "borderline_data_normalized.csv")

X_test = pd.read_csv(DATA_ROOT / "test" / "X_test_norm.csv")

# Class balance of the original data
class_counts = original_df["target"].value_counts()
print(class_counts)

# count1: number of labelled rows in the original data
count1 = class_counts.sum()
# count2: absolute difference between the class-0 and class-1 row counts.
# Later cells use it as the number of class-1 rows to keep, so it is stored
# as a plain Python int.
count2 = int(abs((original_df["target"] == 0).sum() - (original_df["target"] == 1).sum()))

# The first value is the class difference, not a "before undersampling" size.
print(f"Class imbalance (samples to select): {count2}")
print(f"After number of samples: {count1+count2}")

In [None]:
# Pool the three single-method oversampled frames and the original frame
# into one table with a fresh 0..n-1 index.
sum_all_data = pd.concat(
    [smote_df, gan_df, borderline_df, original_df],
    axis=0,
    ignore_index=True,
)

# Full frames (features + "target" + "source"), keyed by data set name.
compare = {
    "mix": mix_df,
    "smote": smote_df,
    "GAN": gan_df,
    "borderline": borderline_df,
}

# (features, labels) pair per data set: features are the frame without the
# "target" and "source" columns, labels are the "target" column.
data = {
    key: (frame.drop(columns=["target", "source"]), frame["target"])
    for key, frame in compare.items()
}

# Individual names kept for cells that address a single data set directly.
X_mix, y_mix = data["mix"]
X_smote, y_smote = data["smote"]
X_GAN, y_GAN = data["GAN"]
X_borderline, y_borderline = data["borderline"]

### Undersampling

In [None]:
# KMeans centroids: for every oversampled data set, cluster its class-1 rows
# into `count2` clusters, use the cluster centroids as the reduced class-1
# sample and append them to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&centroids")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible
n_keep = int(count2)     # number of clusters == number of centroids kept

df_ = {}

# sorted(): a set of strings has no stable iteration order between kernel runs
for name in sorted(data.keys() & compare.keys()):
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_keep, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model (overwritten on every run; the seed keeps it stable)
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # One row per centroid, labelled as class 1 and tagged as a centroid
    df_majority = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    df_majority["target"] = 1
    df_majority["source"] = "centroid"

    # Preview only; printing the full frame floods the notebook output
    print(df_majority.head())

    # Align to the original column layout; columns the centroids lack get 0.0,
    # columns the original lacks are dropped
    df_majority = df_majority.reindex(columns=original_df.columns, fill_value=0.0)

    # Centroids followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_KM_centroids.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans + NN

In [None]:
# KMeans + nearest neighbour (Euclidean): cluster the class-1 rows of every
# oversampled data set into `count2` clusters and keep, per cluster, the
# existing row closest to the centroid. Kept rows are appended to the
# original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&nn")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible
n_keep = int(count2)     # number of clusters == rows kept per data set

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}     # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                              # name -> selected rows (features only)
df_ = {}                                   # name -> final frame written to disk

results_KM_NN_ = {n: {} for n in names}    # name -> cluster id -> chosen row

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_keep, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            # Nothing to pick from; skip instead of raising on iloc[[0]]
            continue
        # Euclidean distance of every member to its centroid in one call;
        # argmin keeps the first of tied rows, and a single-row cluster
        # trivially yields position 0
        gaps = np.linalg.norm(rows.to_numpy(dtype=float) - centers[i], axis=1)
        results_KM_NN_[name][i] = rows.iloc[[int(np.argmin(gaps))]]

    # Concatenate once, after all clusters are processed
    results_[name] = pd.concat(list(results_KM_NN_[name].values()), ignore_index=True)

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_KM_nn.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans + Cosine Similarity

In [None]:
# KMeans + cosine similarity: cluster the class-1 rows of every oversampled
# data set into `count2` clusters and keep, per cluster, the existing row
# with the highest cosine similarity to the centroid. Kept rows are appended
# to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&cos")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible
n_keep = int(count2)     # number of clusters == rows kept per data set

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}      # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                               # name -> selected rows (features only)
df_ = {}                                    # name -> final frame written to disk

results_KM_COS_ = {n: {} for n in names}    # name -> cluster id -> chosen row

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_keep, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            # Nothing to pick from; skip instead of raising on iloc[[0]]
            continue
        # Cosine similarity of every member to its centroid in one call;
        # argmax keeps the first of tied rows, and a single-row cluster
        # trivially yields position 0
        sims = cosine_similarity(rows.to_numpy(dtype=float), centers[i].reshape(1, -1)).ravel()
        results_KM_COS_[name][i] = rows.iloc[[int(np.argmax(sims))]]

    # Concatenate once, after all clusters are processed
    results_[name] = pd.concat(list(results_KM_COS_[name].values()), ignore_index=True)

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_KM_cos.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans + cos + Mahalanobis distance

In [None]:
# KMeans + cosine + Mahalanobis: cluster the class-1 rows of every
# oversampled data set into `count2` clusters and keep, per cluster, the
# existing row with the smallest combined score
#     alfa * cosine_distance + (1 - alfa) * mahalanobis_distance
# measured against the centroid. Kept rows are appended to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&cos&mal")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible
n_keep = int(count2)     # number of clusters == rows kept per data set

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

results_KM_COS_MAN_ = {n: {} for n in names}    # name -> cluster id -> chosen row

alfa = 0.5  # weight of the cosine term; (1 - alfa) weights the Mahalanobis term

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_keep, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Pseudo-inverse of the feature covariance depends only on X_train, so it
    # is computed once per data set instead of once per candidate row
    inv_cov = np.linalg.pinv(np.cov(X_train.to_numpy(dtype=float), rowvar=False))

    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            # Nothing to pick from; skip instead of raising on iloc[[0]]
            continue

        members = rows.to_numpy(dtype=float)
        center = centers[i]

        # Cosine *distance* (1 - similarity): both terms must be
        # "smaller is closer" for a minimised sum to be meaningful
        cos_dist = 1.0 - cosine_similarity(members, center.reshape(1, -1)).ravel()

        # Mahalanobis distance sqrt(d^T VI d); clip guards against tiny
        # negative values from floating-point rounding
        delta = members - center
        mahal = np.sqrt(np.clip(((delta @ inv_cov) * delta).sum(axis=1), 0.0, None))

        # NOTE: the two terms are not rescaled to a common range
        comb_score = alfa * cos_dist + (1 - alfa) * mahal

        # Results go to this cell's own dictionary (not results_KM_COS_)
        results_KM_COS_MAN_[name][i] = rows.iloc[[int(np.argmin(comb_score))]]

    # Concatenate once, after all clusters are processed
    results_[name] = pd.concat(list(results_KM_COS_MAN_[name].values()), ignore_index=True)

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_KM_cos_mal.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### HDBSCAN + Euclidean distance

In [None]:
# HDBSCAN + Euclidean distance: cluster the class-1 rows of every oversampled
# data set with HDBSCAN, rank the members of each cluster by Euclidean
# distance to the stored cluster centroid, then pick rows round-robin across
# clusters (closest first) until `count2` rows are selected. Rows labelled as
# noise (-1) are never selected. Selected rows are appended to the original
# data. The number of clusters found is recorded in `labels_`.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/hdbscan&nn")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

results_HDBSCAN_DIST_ = {n: {} for n in names}
labels_ = {}                                    # name -> number of HDBSCAN clusters

target = int(count2)                            # rows to select per data set

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply HDBSCAN clustering (single fit; labels_ and centroids_ are read
    # from the fitted estimator instead of refitting with fit_predict)
    hdbscan = HDBSCAN(store_centers="centroid")
    hdbscan_res = hdbscan.fit(X_majority)

    # Save HDBSCAN model
    file_path2 = base2 / f"{name}_hdbscan_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(hdbscan, f)

    # Number of clusters; label -1 marks noise and is not counted
    centroids_hdbscan = hdbscan_res.labels_
    unique_lables = np.unique(centroids_hdbscan)
    labels = int((unique_lables >= 0).sum())
    labels_[name] = labels
    print(f"{name}: {labels_[name]}")

    # Centroids from HDBSCAN
    centroids_ = pd.DataFrame(hdbscan_res.centroids_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Per cluster: member index labels ordered by ascending distance.
    # Initialised before the loop so it exists even when no cluster is found.
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = X_majority[centroids_hdbscan == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            per_cluster_sorted[i] = []
            continue
        gaps = np.linalg.norm(rows.to_numpy(dtype=float) - centers[i], axis=1)
        per_cluster_sorted[i] = rows.index[np.argsort(gaps, kind="stable")].tolist()

    # Round-robin choice up to `target`: pass `rank` takes the rank-th closest
    # member of every cluster that still has one
    selected_idx = []
    seen = set()                     # O(1) membership test
    deepest = max((len(lst) for lst in per_cluster_sorted.values()), default=0)
    rank = 0
    while len(selected_idx) < target and rank < deepest:
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            if rank < len(lst) and lst[rank] not in seen:
                seen.add(lst[rank])
                selected_idx.append(lst[rank])
                if len(selected_idx) >= target:
                    break
        rank += 1

    if len(selected_idx) < target:
        # Clusters ran out of rows before the target was reached
        print(f"{name}: only {len(selected_idx)} of {target} rows could be selected")

    # Selected class-1 rows, in order of selection
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)
    results_[name] = maj_selected

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1.0)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_HDBSCAN_NN.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### HDBSCAN + Cosine Similarity

In [None]:
# HDBSCAN + cosine distance: cluster the class-1 rows of every oversampled
# data set with HDBSCAN, rank the members of each cluster by cosine distance
# to the stored cluster centroid, then pick rows round-robin across clusters
# (closest first) until `count2` rows are selected. Rows labelled as noise
# (-1) are never selected. Selected rows are appended to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/hdbscan&cos")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

results_HDBSCAN_COS_ = {n: {} for n in names}

target = int(count2)                            # rows to select per data set

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    # Apply HDBSCAN clustering (single fit; labels_ and centroids_ are read
    # from the fitted estimator instead of refitting with fit_predict)
    hdbscan = HDBSCAN(store_centers="centroid")
    hdbscan_res = hdbscan.fit(X_majority)

    # Save HDBSCAN model
    file_path2 = base2 / f"{name}_hdbscan_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(hdbscan, f)

    # Number of clusters; label -1 marks noise and is not counted
    centroids_hdbscan = hdbscan_res.labels_
    unique_lables = np.unique(centroids_hdbscan)
    labels = int((unique_lables >= 0).sum())

    # Centroids from HDBSCAN
    centroids_ = pd.DataFrame(hdbscan_res.centroids_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Per cluster: member index labels ordered by ascending cosine distance.
    # Initialised before the loop so it exists even when no cluster is found.
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = X_majority[centroids_hdbscan == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            per_cluster_sorted[i] = []
            continue
        gaps = pairwise_distances(
            rows.to_numpy(dtype=float), centers[i].reshape(1, -1), metric="cosine"
        ).ravel()
        per_cluster_sorted[i] = rows.index[np.argsort(gaps, kind="stable")].tolist()

    # Round-robin choice up to `target`: pass `rank` takes the rank-th closest
    # member of every cluster that still has one
    selected_idx = []
    seen = set()                     # O(1) membership test
    deepest = max((len(lst) for lst in per_cluster_sorted.values()), default=0)
    rank = 0
    while len(selected_idx) < target and rank < deepest:
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            if rank < len(lst) and lst[rank] not in seen:
                seen.add(lst[rank])
                selected_idx.append(lst[rank])
                if len(selected_idx) >= target:
                    break
        rank += 1

    if len(selected_idx) < target:
        # Clusters ran out of rows before the target was reached
        print(f"{name}: only {len(selected_idx)} of {target} rows could be selected")

    # Selected class-1 rows, in order of selection
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)
    results_[name] = maj_selected

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1.0)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_HDBSCAN_cos.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans (n_clusters from HDBSCAN) + NN

In [None]:
# KMeans (cluster count taken from HDBSCAN) + Euclidean distance: cluster the
# class-1 rows of every oversampled data set with KMeans, using the cluster
# count stored in `labels_[name]` by the HDBSCAN + Euclidean cell (run it
# first). Members of each cluster are ranked by Euclidean distance to the
# centroid and picked round-robin across clusters (closest first) until
# `count2` rows are selected. Selected rows are appended to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&hdbscan&nn")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

results_KMEANS_HDBSCAN_DIST_ = {n: {} for n in names}

target = int(count2)                            # rows to select per data set

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    n_clusters = int(labels_[name])
    if n_clusters < 1:
        # KMeans cannot run with zero clusters
        print(f"{name}: no HDBSCAN clusters recorded, skipped")
        continue

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Per cluster: member index labels ordered by ascending distance
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            per_cluster_sorted[i] = []
            continue
        gaps = np.linalg.norm(rows.to_numpy(dtype=float) - centers[i], axis=1)
        per_cluster_sorted[i] = rows.index[np.argsort(gaps, kind="stable")].tolist()

    # Round-robin choice up to `target`: pass `rank` takes the rank-th closest
    # member of every cluster that still has one
    selected_idx = []
    seen = set()                     # O(1) membership test
    deepest = max((len(lst) for lst in per_cluster_sorted.values()), default=0)
    rank = 0
    while len(selected_idx) < target and rank < deepest:
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            if rank < len(lst) and lst[rank] not in seen:
                seen.add(lst[rank])
                selected_idx.append(lst[rank])
                if len(selected_idx) >= target:
                    break
        rank += 1

    # Selected class-1 rows, in order of selection
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)
    results_[name] = maj_selected

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1.0)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_kmeans&hdbscan_nn.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans (n_clusters from HDBSCAN) + Cosine Similarity

In [None]:
# KMeans (cluster count taken from HDBSCAN) + cosine distance: cluster the
# class-1 rows of every oversampled data set with KMeans, using the cluster
# count stored in `labels_[name]` by the HDBSCAN + Euclidean cell (run it
# first). Members of each cluster are ranked by cosine distance to the
# centroid and picked round-robin across clusters (closest first) until
# `count2` rows are selected. Selected rows are appended to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&hdbscan&cos")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

results_KMEANS_HDBSCAN_DIST_ = {n: {} for n in names}

target = int(count2)                            # rows to select per data set

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    n_clusters = int(labels_[name])
    if n_clusters < 1:
        # KMeans cannot run with zero clusters
        print(f"{name}: no HDBSCAN clusters recorded, skipped")
        continue

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Per cluster: member index labels ordered by ascending cosine distance
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            per_cluster_sorted[i] = []
            continue
        gaps = pairwise_distances(
            rows.to_numpy(dtype=float), centers[i].reshape(1, -1), metric="cosine"
        ).ravel()
        per_cluster_sorted[i] = rows.index[np.argsort(gaps, kind="stable")].tolist()

    # Round-robin choice up to `target`: pass `rank` takes the rank-th closest
    # member of every cluster that still has one
    selected_idx = []
    seen = set()                     # O(1) membership test
    deepest = max((len(lst) for lst in per_cluster_sorted.values()), default=0)
    rank = 0
    while len(selected_idx) < target and rank < deepest:
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            if rank < len(lst) and lst[rank] not in seen:
                seen.add(lst[rank])
                selected_idx.append(lst[rank])
                if len(selected_idx) >= target:
                    break
        rank += 1

    # Selected class-1 rows, in order of selection
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)
    results_[name] = maj_selected

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1.0)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_kmeans&hdbscan_cos.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans (n_clusters from HDBSCAN) + Cosine Similarity + Mahalanobis distance

In [None]:
# KMeans (cluster count taken from HDBSCAN) + cosine + Mahalanobis: cluster
# the class-1 rows of every oversampled data set with KMeans, using the
# cluster count stored in `labels_[name]` by the HDBSCAN + Euclidean cell
# (run it first). Members of each cluster are ranked by the combined score
#     alfa * cosine_distance + (1 - alfa) * mahalanobis_distance
# to the centroid (ascending) and picked round-robin across clusters until
# `count2` rows are selected. Selected rows are appended to the original data.
base = Path("D:/ml/undersampling_data/data/unsw/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&hdbscan&cos&mal")
# Create the output folders on first run instead of failing on open().
base.mkdir(parents=True, exist_ok=True)
base2.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # fixed seed so the clustering is reproducible

# sorted(): a set of strings has no stable iteration order between kernel runs
names = sorted(data.keys() & compare.keys())

cluster_data_ = {n: {} for n in names}          # name -> cluster id -> member rows
centroids_ = {}

results_ = {}                                   # name -> selected rows (features only)
df_ = {}                                        # name -> final frame written to disk

# name -> cluster id -> best-scoring row of that cluster
results_KMEANS_HDBSCAN_COS_NN_ = {n: {} for n in names}

alfa = 0.5  # weight of the cosine term; (1 - alfa) weights the Mahalanobis term

target = int(count2)                            # rows to select per data set

for name in names:
    # Features / labels of this data set
    X_train, y_train = data[name]

    # Rows labelled 1 are the ones being reduced
    X_majority = X_train[y_train == 1]

    n_clusters = int(labels_[name])
    if n_clusters < 1:
        # KMeans cannot run with zero clusters
        print(f"{name}: no HDBSCAN clusters recorded, skipped")
        continue

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit(X_majority)

    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)

    # Centroid table, built once per data set (not once per cluster)
    centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    centers = centroids_.to_numpy(dtype=float)

    # Pseudo-inverse of the feature covariance depends only on X_train, so it
    # is computed once per data set instead of once per candidate row
    inv_cov = np.linalg.pinv(np.cov(X_train.to_numpy(dtype=float), rowvar=False))

    # Per cluster: member index labels ordered by ascending combined score.
    # Every non-empty cluster gets an entry, including single-row clusters,
    # so none of them is left out of the round-robin selection.
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = X_majority[kmeans.labels_ == i]
        cluster_data_[name][i] = rows
        if rows.empty:
            per_cluster_sorted[i] = []
            continue

        members = rows.to_numpy(dtype=float)
        center = centers[i]

        # Cosine *distance* (1 - similarity): both terms must be
        # "smaller is closer" for the combined ranking to be meaningful
        cos_dist = 1.0 - cosine_similarity(members, center.reshape(1, -1)).ravel()

        # Mahalanobis distance sqrt(d^T VI d); clip guards against tiny
        # negative values from floating-point rounding
        delta = members - center
        mahal = np.sqrt(np.clip(((delta @ inv_cov) * delta).sum(axis=1), 0.0, None))

        # NOTE: the two terms are not rescaled to a common range
        comb_score = alfa * cos_dist + (1 - alfa) * mahal

        order = np.argsort(comb_score, kind="stable")    # best (lowest) first
        per_cluster_sorted[i] = rows.index[order].tolist()
        results_KMEANS_HDBSCAN_COS_NN_[name][i] = rows.iloc[[int(order[0])]]

    # Round-robin choice up to `target`: pass `rank` takes the rank-th best
    # member of every cluster that still has one
    selected_idx = []
    seen = set()                     # O(1) membership test
    deepest = max((len(lst) for lst in per_cluster_sorted.values()), default=0)
    rank = 0
    while len(selected_idx) < target and rank < deepest:
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            if rank < len(lst) and lst[rank] not in seen:
                seen.add(lst[rank])
                selected_idx.append(lst[rank])
                if len(selected_idx) >= target:
                    break
        rank += 1

    # Selected class-1 rows, in order of selection
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)
    results_[name] = maj_selected

    # Label the kept rows as class 1 and recover the remaining columns
    # (e.g. "source") by matching on all feature columns + target
    df_majority = results_[name].assign(target=1.0)
    columns_ = list(df_majority.columns)
    # Keep one row per key on the right side: duplicate key rows in
    # sum_all_data would otherwise multiply the kept rows in the merge
    source_lookup = sum_all_data.drop_duplicates(subset=columns_)
    df_majority = df_majority.merge(source_lookup, on=columns_, how="left")
    assert len(df_majority) == len(results_[name]), "merge changed the number of kept rows"

    # Kept class-1 rows followed by every row of the original data
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)

    # info() prints by itself; wrapping it in print() only adds a stray "None"
    df_[name].info()

    # Save to CSV if file does not exist (an existing file is never refreshed)
    file_path = base / f"{name}_kmeans&hdbscan_cos&mal.csv"
    if file_path.exists():
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")