In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.spatial import distance
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
#from ctgan import CTGAN

In [2]:
#oversampling data
original_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\original_data.csv")    
original_data = original_data.drop(columns=["Unnamed: 0"])
smote_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote_data.csv")
GAN_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN_data.csv")
borderline_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline_data.csv")
smote2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote3_data.csv")
GAN2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN3_data.csv")    
borderline2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline3_data.csv")

# test data
X_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\X_test.csv")
y_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\y_test.csv")

#Before undersampling
print(original_data["target"].value_counts())
count1=original_data["target"].value_counts().sum()
print(count1)
count2=abs((original_data['target']==0).sum() - (original_data['target']==1).sum())
print(f"Before undersampling: {count2}")
print(f"After number of samples: {count1+count2}")

target
0    151
1     46
Name: count, dtype: int64
197
Before undersampling: 105
After number of samples: 302


In [3]:
#mixed data
mix_data = pd.concat([GAN_data, smote_data, borderline_data], axis=0, ignore_index=True)         
mix_data = mix_data.reset_index(drop=True)

#data with one oversampling method and original data e.g.(smote+original)
smote_data = pd.concat([smote_data, smote2_data], axis=0, ignore_index=True)
smote_data = smote_data.reset_index(drop=True)
borderline_data = pd.concat([borderline_data, borderline2_data], axis=0, ignore_index=True)
borderline_data = borderline_data.reset_index(drop=True)
GAN_data = pd.concat([GAN_data, GAN2_data], axis=0, ignore_index=True)
GAN_data = GAN_data.reset_index(drop=True)

#convert data types to float64
int_cols = mix_data.select_dtypes(include=["int"]).columns
mix_data[int_cols] = mix_data[int_cols].astype("float64")
int_cols = smote_data.select_dtypes(include=["int"]).columns
smote_data[int_cols] = smote_data[int_cols].astype("float64")
int_cols = borderline_data.select_dtypes(include=["int"]).columns
borderline_data[int_cols] = borderline_data[int_cols].astype("float64")
int_cols = GAN_data.select_dtypes(include=["int"]).columns
GAN_data[int_cols] = GAN_data[int_cols].astype("float64")

sum_all_data = pd.concat([smote_data, GAN_data, borderline_data, original_data], axis=0, ignore_index=True)
sum_all_data = sum_all_data.drop_duplicates()


#Split data
X_mix, y_mix = mix_data.drop(columns=["target", "source"]), mix_data["target"]
X_smote, y_smote = smote_data.drop(columns=["target", "source"]), smote_data["target"]
X_GAN, y_GAN = GAN_data.drop(columns=["target", "source"]), GAN_data["target"]
X_borderline, y_borderline = borderline_data.drop(columns=["target", "source"]), borderline_data["target"]

#dodac standrazycje 

#Dictionary
data = {}
data["mix"] = (X_mix, y_mix)
data["smote"] = (X_smote, y_smote)
data["GAN"] = (X_GAN, y_GAN)
data["borderline"] = (X_borderline, y_borderline)

compare = {}
compare["mix"] = mix_data
compare["smote"] = smote_data
compare["GAN"] = GAN_data
compare["borderline"] = borderline_data

### HDBSCAN + Euclidean distance

In [None]:
hdbscan = HDBSCAN(store_centers="centroid")

#rows_in_cluster = {}
cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_HDBSCAN_DIST_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
labels_={}

for name in data.keys() & compare.keys():
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    X_majority = X_train[y_train == 1]
    #X_minority = X_train[y_train == 0]
    print(X_majority.shape)
    #print(X_minority.shape)
    print(name)
    
    hdbscan_res = hdbscan.fit(X_majority)
    labels = hdbscan_res.labels_
    unique_lables = np.unique(labels)
    labels_[name] = len(unique_lables[unique_lables >=0])
    labels = len(unique_lables[unique_lables >=0])
    
    print(f"{name}: {labels_[name]}")
    centroids_ = pd.DataFrame(hdbscan_res.centroids_, columns=X_train.columns)
    #print(hdbscan_res.centroids_)
    
    centroids_hdbscan = hdbscan.fit_predict(X_majority)
    #print(centroids_hdbscan)

    #centroids
    for i in range(labels):
        rows_in_cluster = X_majority[hdbscan.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        #print(f"Cluster {i}:")
        #print(cluster_data_[name][i])
        
        
        target = count2
        #print(f"Target: {target}")
        
        per_cluster_sorted = {}
        #calculate the nieghbors for each centroid (centroid -> rows_in_cluster)
    for i in range(len(centroids_)):
        rows = cluster_data_[name][i]
        if len(rows) == 0:
            per_cluster_sorted[i] = []
            continue
        if len(rows) == 1:
            per_cluster_sorted[i] = [rows.index[0]]
            continue

        centroid = centroids_.iloc[i].to_numpy()
        # policz dystanse do centroidu
        dists = rows.apply(lambda r: euclidean(centroid, r.to_numpy()), axis=1)
        order = dists.sort_values().index.tolist()  # indeksy X_majority w kolejności rosnących dystansów
        per_cluster_sorted[i] = order

    # Round-robin wybór do target_n
    selected_idx = []
    ptr = {i: 0 for i in range(len(centroids_))}                     # wskaźnik który „numer” brać teraz z klastra
    while len(selected_idx) < target:
        progressed = False
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            j = ptr[i]
            if j < len(lst):
                idx = lst[j]
                if idx not in selected_idx:
                    selected_idx.append(idx)
                ptr[i] += 1
                progressed = True
                if len(selected_idx) >= target:
                    break
        if not progressed:  # wszystkie klastry wyczerpane
            break

    # Złóż wybrane rekordy większości (kolejność wg selekcji)
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)   # <-- CHANGED
    results_[name] = maj_selected   
    #print(results_[name])
    #print("ilosc duplikaotow",results_[name].duplicated().sum())
        
    df_y_majority = pd.Series([1.0] * len(results_[name]), name="target")
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
        
    df_[name] = pd.concat([df_majority, original_data], axis=0).reset_index(drop=True)  
    #print(df_[name])
    
    
    #copy source from sum_all_data 
    #columns_ = list(df_[name].columns.values)
    #df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])
    print(df_[name].duplicated().sum())

    #df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_HDBSCAN_NN_data.csv", index=False)

(315, 12)
borderline
borderline: 23
     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0    30.0         1.0         1.0      0.0       0.0             13.0   
1    37.0         1.0         1.0      0.0       0.0              0.0   
2    35.0         1.0         1.0      0.0       0.0              0.0   
3    25.0         1.0         1.0      0.0       0.0             11.0   
4     8.0         1.0         1.0      0.0       0.0              0.0   
..    ...         ...         ...      ...       ...              ...   
297  30.0         1.0         1.0      1.0       1.0              0.0   
298  30.0         1.0         0.0      0.0       1.0              0.0   
299  24.0         1.0         0.0      0.0       1.0              0.0   
300  18.0         1.0         0.0      0.0       1.0              0.0   
301  30.0         1.0         0.0      0.0       1.0              0.0   

     ip_failure  ip_success  no_failure  first      td        ts  target  \
0          

In [6]:
print(labels_["smote"])

13


### HDBSCAN + cos

In [None]:
hdbscan = HDBSCAN(store_centers="centroid")

#rows_in_cluster = {}
cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_HDBSCAN_DIST_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}


for name in data.keys() & compare.keys():
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    X_majority = X_train[y_train == 1]
    #X_minority = X_train[y_train == 0]
    print(X_majority.shape)
    #print(X_minority.shape)
    print(name)
    
    hdbscan_res = hdbscan.fit(X_majority)
    labels = hdbscan_res.labels_
    unique_lables = np.unique(labels)
    labels = len(unique_lables[unique_lables >=0])
    print(labels)
    centroids_ = pd.DataFrame(hdbscan_res.centroids_, columns=X_train.columns)
    #print(hdbscan_res.centroids_)
    
    centroids_hdbscan = hdbscan.fit_predict(X_majority)
    #print(centroids_hdbscan)

    #centroids
    for i in range(labels):
        rows_in_cluster = X_majority[hdbscan.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        #print(f"Cluster {i}:")
        #print(cluster_data_[name][i])
        
        
        target = count2
        #print(f"Target: {target}")
        
        per_cluster_sorted = {}
        #calculate the nieghbors for each centroid (centroid -> rows_in_cluster)
    for i in range(len(centroids_)):
        rows = cluster_data_[name][i]
        if len(rows) == 0:
            per_cluster_sorted[i] = []
            continue
        if len(rows) == 1:
            per_cluster_sorted[i] = [rows.index[0]]
            continue

        centroid = centroids_.iloc[i].to_numpy().reshape(1, -1)
        # policz dystanse do centroidu
        dists = pairwise_distances(rows.values, centroid, metric="cosine").ravel()
        order = rows.index[np.argsort(dists)].tolist()  # indeksy X_majority w kolejności rosnących dystansów
        per_cluster_sorted[i] = order

    # Round-robin wybór do target_n
    selected_idx = []
    ptr = {i: 0 for i in range(len(centroids_))}                     # wskaźnik który „numer” brać teraz z klastra
    while len(selected_idx) < target:
        progressed = False
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            j = ptr[i]
            if j < len(lst):
                idx = lst[j]
                if idx not in selected_idx:
                    selected_idx.append(idx)
                ptr[i] += 1
                progressed = True
                if len(selected_idx) >= target:
                    break
        if not progressed:  # wszystkie klastry wyczerpane
            break

    # Złóż wybrane rekordy większości (kolejność wg selekcji)
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)   # <-- CHANGED
    results_[name] = maj_selected   
    #print(results_[name])
    #print("ilosc duplikaotow",results_[name].duplicated().sum())
        
    df_y_majority = pd.Series([1.0] * len(results_[name]), name="target")
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
        
    df_[name] = pd.concat([df_majority, original_data], axis=0).reset_index(drop=True)  
    #print(df_[name])
    
    
    #copy source from sum_all_data 
    #columns_ = list(df_[name].columns.values)
    #df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])
    print(df_[name].duplicated().sum())
       
    #df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_HDBSCAN_COS_data.csv", index=False)

(315, 12)
borderline
23
     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0    31.0         1.0         1.0      0.0       0.0             11.0   
1    34.0         1.0         1.0      0.0       0.0              5.0   
2    34.0         1.0         1.0      0.0       0.0              1.0   
3    31.0         1.0         1.0      0.0       0.0             11.0   
4     8.0         1.0         1.0      0.0       0.0              0.0   
..    ...         ...         ...      ...       ...              ...   
297  30.0         1.0         1.0      1.0       1.0              0.0   
298  30.0         1.0         0.0      0.0       1.0              0.0   
299  24.0         1.0         0.0      0.0       1.0              0.0   
300  18.0         1.0         0.0      0.0       1.0              0.0   
301  30.0         1.0         0.0      0.0       1.0              0.0   

     ip_failure  ip_success  no_failure  first      td        ts  target  \
0          12.0        

### KMeans (samples=HDBSCAN()) + Euclidean distance

In [None]:
KM = KMeans(n_clusters=labels_[name])

#rows_in_cluster = {}
cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_KMEANS_HDBSCAN_DIST_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

for name in data.keys() & compare.keys():
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    KM = KMeans(n_clusters=labels_[name])
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    #X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range (labels_[name]):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    print(f"{name}")
    print(centroids_)
    #results_KM_SWAP_ = {}
    target = count2
    
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = cluster_data_[name][i]
        if len(rows) == 0:
            per_cluster_sorted[i] = []
            continue
        if len(rows) == 1:
            per_cluster_sorted[i] = [rows.index[0]]
            continue
             
        centroid = centroids_.iloc[i].to_numpy()
        # policz dystanse do centroidu
        dists = rows.apply(lambda r: euclidean(centroid, r.to_numpy()), axis=1)
        order = dists.sort_values().index.tolist()  # indeksy X_majority w kolejności rosnących dystansów
        per_cluster_sorted[i] = order
        
    selected_idx = []
    ptr = {i: 0 for i in range(len(centroids_))}                     # wskaźnik który „numer” brać teraz z klastra
    while len(selected_idx) < target:
        progressed = False
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            j = ptr[i]
            if j < len(lst):
                idx = lst[j]
                if idx not in selected_idx:
                    selected_idx.append(idx)
                ptr[i] += 1
                progressed = True
                if len(selected_idx) >= target:
                    break
        if not progressed:  # wszystkie klastry wyczerpane
            break
        
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)   # <-- CHANGED
    results_[name] = maj_selected     
    
    df_y_majority = pd.Series([1.0] * len(results_[name]), name="target")
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
        
    df_[name] = pd.concat([df_majority, original_data], axis=0).reset_index(drop=True) 
    print(df_[name]) 
    
    #df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_HDBSCAN_NN_data.csv", index=False)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


borderline
         user  is_private  is_failure       is_root      is_valid  \
0   30.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
1   30.705882    0.397059         1.0  1.470588e-01  9.558824e-01   
2   34.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
3   23.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
4   31.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
5   24.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
6   31.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
7   35.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
8   36.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
9   30.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
10  11.400000    1.000000         1.0  2.775558e-17 -1.665335e-16   
11  32.250000    1.000000         1.0  0.000000e+00  0.000000e+00   
12  32.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
13  35.430233    0.3720

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


### KMeans (samples=HDBSCAN()) + cosinus

In [16]:
KM = KMeans(n_clusters=labels_[name])

#rows_in_cluster = {}
cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_KMEANS_HDBSCAN_DIST_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

for name in data.keys() & compare.keys():
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    KM = KMeans(n_clusters=labels_[name])
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    #X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range (labels_[name]):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    print(f"{name}")
    print(centroids_)
    #results_KM_SWAP_ = {}
    target = count2
    
    per_cluster_sorted = {}
    for i in range(len(centroids_)):
        rows = cluster_data_[name][i]
        if len(rows) == 0:
            per_cluster_sorted[i] = []
            continue
        if len(rows) == 1:
            per_cluster_sorted[i] = [rows.index[0]]
            continue
             
        centroid = centroids_.iloc[i].to_numpy().reshape(1, -1)
        # policz dystanse do centroidu
        dists = pairwise_distances(rows.values, centroid, metric="cosine").ravel()
        order = rows.index[np.argsort(dists)].tolist()  # indeksy X_majority w kolejności rosnących dystansów
        per_cluster_sorted[i] = order
        
    selected_idx = []
    ptr = {i: 0 for i in range(len(centroids_))}                     # wskaźnik który „numer” brać teraz z klastra
    while len(selected_idx) < target:
        progressed = False
        for i in range(len(centroids_)):
            lst = per_cluster_sorted[i]
            j = ptr[i]
            if j < len(lst):
                idx = lst[j]
                if idx not in selected_idx:
                    selected_idx.append(idx)
                ptr[i] += 1
                progressed = True
                if len(selected_idx) >= target:
                    break
        if not progressed:  # wszystkie klastry wyczerpane
            break
        
    maj_selected = X_majority.loc[selected_idx].reset_index(drop=True)   # <-- CHANGED
    results_[name] = maj_selected     
    
    df_y_majority = pd.Series([1.0] * len(results_[name]), name="target")
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
        
    df_[name] = pd.concat([df_majority, original_data], axis=0).reset_index(drop=True) 
    print(df_[name]) 
    
    df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_HDBSCAN_COS_data.csv", index=False)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


borderline
         user  is_private  is_failure       is_root      is_valid  \
0   33.177778    0.177778         1.0  2.444444e-01  9.777778e-01   
1   35.400000    1.000000         1.0  0.000000e+00 -5.551115e-17   
2   24.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
3   31.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
4   30.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
5   30.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
6   34.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
7   34.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
8   11.689655    1.000000         1.0  5.551115e-17 -1.665335e-16   
9   31.500000    1.000000         1.0  0.000000e+00  0.000000e+00   
10  36.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
11  31.666667    1.000000         1.0  0.000000e+00  0.000000e+00   
12  22.000000    1.000000         1.0  0.000000e+00  0.000000e+00   
13  34.009615    0.4711

  super()._check_params_vs_input(X, default_n_init=10)


#### -------------------

In [None]:
#KM = KMeans(n_clusters=(int)((count1+count2)/2), init="k-means++")

centroids_rows_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

centroids_ = {}

results_KM_COS_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

results_ = {}
df_ = {}

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    X_minority = X_train[y_train == 0]
    print(X_minority.shape)
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range ((int)((count1+count2)/2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        centroids_rows_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    #results_KM_COS_ = {}
    
    
    for i in range(len(centroids_)):                #dla kazdego z centroidow
        if (len(centroids_rows_[name][i])>1):       #sprawdzam czy jest wiecej niz jeden wiersz w klastrze
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            centroid = centroid.values.reshape(1,-1)
            for j in range(len(centroids_rows_[name][i])):
                index_ = list(centroids_rows_[name][i].index)
                row = centroids_rows_[name][i].iloc[j]
                row = row.values.reshape(1,-1)
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = cosine_similarity(centroid, row)         #tworze slwonik wartosci
                
            min_key = min(dist_, key=dist_.get)
            results_KM_COS_[name][i] = centroids_rows_[name][i].iloc[[min_key]]
            
        else:
            results_KM_COS_[name][i] = centroids_rows_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_COS_[name].values(), ignore_index=True)    
        
    df_y_majority = pd.Series([1] * (int)((count1+count2)/2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    print(df_majority.shape)  
     
    df_X_minority = X_minority.reset_index(drop=True)
    df_y_minority = pd.Series([0] * len(X_minority), name="target")
    df_miniority = pd.concat([df_X_minority, df_y_minority], axis=1).reset_index(drop=True)
    print(df_miniority.shape)

    df_[name] = pd.concat([df_majority, df_miniority], axis=0).reset_index(drop=True)  
    print(df_[name])
    
    #copy source from sum_all_data 
    columns_ = list(df_[name].columns.values)
    df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])  
    print(df_[name].dtypes)
    
    df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_COS_data.csv", index=False)
    
    print(f"Num duplicates: {df_[name].duplicated().sum()}")
    

In [None]:
#KM = KMeans(n_clusters=(int)((count1+count2)/2))

centroids_rows_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

centroids_ = {}

results_KM_SWAP_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

results_ = {}
df_ = {}

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range ((int)((count1+count2)/2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        centroids_rows_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    #results_KM_SWAP_ = {}
    
    
    for i in range(len(centroids_)):
        if (len(centroids_rows_[name][i])>1):
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            for j in range(len(centroids_rows_[name][i])):
                index_ = list(centroids_rows_[name][i].index)
                row = centroids_rows_[name][i].iloc[j]
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = euclidean(centroid, row)         #tworze slwonik wartosci
                
            min_key = min(dist_, key=dist_.get)
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[min_key]]
            
        else:
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_SWAP_[name].values(), ignore_index=True)    
        
    df_y_majority = pd.Series([1] * (int)((count1+count2)/2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
        
     
    df_X_minority = X_minority.reset_index(drop=True)
    df_y_minority = pd.Series([0] * len(X_minority), name="target")
    df_miniority = pd.concat([df_X_minority, df_y_minority], axis=1).reset_index(drop=True)

    df_[name] = pd.concat([df_majority, df_miniority], axis=0).reset_index(drop=True)  
    
    #copy source from sum_all_data 
    columns_ = list(df_[name].columns.values)
    df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])  
    
    df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_NN_data.csv", index=False)

In [None]:
    for i in range(len(centroids_)):
        
        if (len(cluster_data_[name][i])>1):
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            for j in range(len(cluster_data_[name][i])):
                index_ = list(cluster_data_[name][i].index)
                row = cluster_data_[name][i].iloc[j]
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = euclidean(centroid, row)         #tworze slwonik wartosci
                
            min_key = min(dist_, key=dist_.get)
            results_HDBSCAN_DIST_[name][i] = cluster_data_[name][i].iloc[[min_key]]
            
        else:
            results_HDBSCAN_DIST_[name][i] = cluster_data_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_HDBSCAN_DIST_[name].values(), ignore_index=True)
        print(results_[name])
        print("ilosc duplikaotow",results_[name].duplicated().sum())