In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
#from ctgan import CTGAN

In [2]:
#oversampling data
original_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\original_data.csv")    
original_data = original_data.drop(columns=["Unnamed: 0"])
smote_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote_data.csv")
GAN_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN_data.csv")
borderline_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline_data.csv")
smote2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote3_data.csv")
GAN2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN3_data.csv")    
borderline2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline3_data.csv")

# test data
X_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\X_test.csv")
y_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\y_test.csv")

#Before undersampling
print(original_data["target"].value_counts())
count1=original_data["target"].value_counts().sum()
print(count1)
count2=abs((original_data['target']==0).sum() - (original_data['target']==1).sum())
print(f"Before undersampling: {count2}")
print(f"After number of samples: {count1+count2}")

target
0    151
1     46
Name: count, dtype: int64
197
Before undersampling: 105
After number of samples: 302


### Preprocessing Data

In [3]:
#mixed data
mix_data = pd.concat([GAN_data, smote_data, borderline_data, original_data], axis=0, ignore_index=True)         
mix_data = mix_data.reset_index(drop=True)

#data with one oversampling method and original data e.g.(smote+original)
smote_data = pd.concat([smote_data, smote2_data, original_data], axis=0, ignore_index=True)
smote_data = smote_data.reset_index(drop=True)
borderline_data = pd.concat([borderline_data, borderline2_data, original_data], axis=0, ignore_index=True)
borderline_data = borderline_data.reset_index(drop=True)
GAN_data = pd.concat([GAN_data, GAN2_data, original_data], axis=0, ignore_index=True)
GAN_data = GAN_data.reset_index(drop=True)

sum_all_data = pd.concat([smote_data, GAN_data, borderline_data, original_data], axis=0, ignore_index=True)
sum_all_data = sum_all_data.drop_duplicates()


#Split data
X_mix, y_mix = mix_data.drop(columns=["target", "source"]), mix_data["target"]
X_smote, y_smote = smote_data.drop(columns=["target", "source"]), smote_data["target"]
X_GAN, y_GAN = GAN_data.drop(columns=["target", "source"]), GAN_data["target"]
X_borderline, y_borderline = borderline_data.drop(columns=["target", "source"]), borderline_data["target"]

#Dictionary
data = {}
data["mix"] = (X_mix, y_mix)
data["smote"] = (X_smote, y_smote)
data["GAN"] = (X_GAN, y_GAN)
data["borderline"] = (X_borderline, y_borderline)

compare = {}
compare["mix"] = mix_data
compare["smote"] = smote_data
compare["GAN"] = GAN_data
compare["borderline"] = borderline_data

### Undersampling

#### K-means + centroids

In [10]:
KM = KMeans(n_clusters=(int)((count1+count2)/2))

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
    
    X_majority = X_train[y_train == 1]
    X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    X_majority_reduced = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    y_majority_reduced = pd.Series([1] * (int)((count1+count2)/2), name="target") 
    
    X_minority = X_minority.reset_index(drop=True)
    y_minority = pd.Series([0] * len(X_minority), name="target")
    
    X_final = pd.concat([X_majority_reduced, X_minority], axis=0).reset_index(drop=True) 
    y_final = pd.concat([y_majority_reduced, y_minority], axis=0).reset_index(drop=True)
    
    # Concat resampled data
    reduced_data = pd.concat([X_final, y_final], axis=1)
    
    reduced_data["source"] = None  # Initialize the source column with None
    
    # gdy target = 1 wtedy source = "centroid", inaczej source = "original"
    
    # Compare data to copy source column
    data_nosource = compare_df
    reduced_data_nosource = reduced_data

    # Iterate through the rows in reduced_data_nosource
    for index, row in reduced_data_nosource.iterrows():
        match = data_nosource.eq(row).all(axis=1)  # Check where rows are identical
        if match.any():  # If a match is found
            matched_index = match.idxmax()  # Get the first matching index
            reduced_data.loc[index, "source"] = compare_df.loc[matched_index, "source"]
            
    # Check for any rows that still have None in the source column
    missing_source = reduced_data[reduced_data["source"].isna()]
    if not missing_source.empty:
        reduced_data.loc[reduced_data["source"].isna(), "source"] = "centroid"       
    
    reduced_data.to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_centroids_data.csv", index=False)
    
    print(f"Data reduced for {name} data")
    print(reduced_data["target"].value_counts())

  super()._check_params_vs_input(X, default_n_init=10)


Data reduced for mix data
target
1    151
0    151
Name: count, dtype: int64


  super()._check_params_vs_input(X, default_n_init=10)


Data reduced for smote data
target
1    151
0    151
Name: count, dtype: int64


  super()._check_params_vs_input(X, default_n_init=10)


Data reduced for GAN data
target
1    151
0    151
Name: count, dtype: int64


  super()._check_params_vs_input(X, default_n_init=10)


Data reduced for borderline data
target
1    151
0    151
Name: count, dtype: int64


#### K-means + the nearest neighbour

In [None]:
#KM = KMeans(n_clusters=(int)((count1+count2)/2))

centroids_rows_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

centroids_ = {}

results_KM_SWAP_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

results_ = {}
df_ = {}

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range ((int)((count1+count2)/2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        centroids_rows_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    #results_KM_SWAP_ = {}
    
    
    for i in range(len(centroids_)):
        if (len(centroids_rows_[name][i])>1):
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            for j in range(len(centroids_rows_[name][i])):
                index_ = list(centroids_rows_[name][i].index)
                row = centroids_rows_[name][i].iloc[j]
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = euclidean(centroid, row)         #tworze slwonik wartosci
                
            min_key = min(dist_, key=dist_.get)
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[min_key]]
            
        else:
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_SWAP_[name].values(), ignore_index=True)    
        
    df_y_majority = pd.Series([1] * (int)((count1+count2)/2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
        
     
    df_X_minority = X_minority.reset_index(drop=True)
    df_y_minority = pd.Series([0] * len(X_minority), name="target")
    df_miniority = pd.concat([df_X_minority, df_y_minority], axis=1).reset_index(drop=True)

    df_[name] = pd.concat([df_majority, df_miniority], axis=0).reset_index(drop=True)  
    
    #copy source from sum_all_data 
    columns_ = list(df_[name].columns.values)
    df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])  
    
    df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_NN_data.csv", index=False)

  super()._check_params_vs_input(X, default_n_init=10)


     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0       4           1           1        0         0                8   
1      31           1           1        0         0               10   
2      15           1           1        0         0                6   
3      30           1           1        0         0               14   
4      26           1           1        0         0                0   
..    ...         ...         ...      ...       ...              ...   
297    30           1           1        1         1                0   
298    30           1           0        0         1                0   
299    24           1           0        0         1                0   
300    18           1           0        0         1                0   
301    30           1           0        0         1                0   

     ip_failure  ip_success  no_failure  first    td        ts  target  \
0             9           0          14      0   

  super()._check_params_vs_input(X, default_n_init=10)


     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0      11           1           1        0         0                0   
1      37           1           1        0         0                1   
2      37           1           1        0         0                0   
3      24           1           1        0         0                1   
4      31           0           1        1         1                0   
..    ...         ...         ...      ...       ...              ...   
297    30           1           1        1         1                0   
298    30           1           0        0         1                0   
299    24           1           0        0         1                0   
300    18           1           0        0         1                0   
301    30           1           0        0         1                0   

     ip_failure  ip_success  no_failure  first    td        ts  target  \
0            31           0          31      0   

  super()._check_params_vs_input(X, default_n_init=10)


     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0      18           1           1        0         0               15   
1      44           1           1        1         0               17   
2       3           1           1        0         0               21   
3      30           0           1        0         0               10   
4      28           1           1        0         0                2   
..    ...         ...         ...      ...       ...              ...   
297    30           1           1        1         1                0   
298    30           1           0        0         1                0   
299    24           1           0        0         1                0   
300    18           1           0        0         1                0   
301    30           1           0        0         1                0   

     ip_failure  ip_success  no_failure  first    td        ts  target  \
0            42           2          12      0   

  super()._check_params_vs_input(X, default_n_init=10)


     user  is_private  is_failure  is_root  is_valid  not_valid_count  \
0      31           1           1        0         0                7   
1      33           1           1        0         0                7   
2      30           1           1        0         0               13   
3      14           1           1        0         0                5   
4      28           1           1        0         0               10   
..    ...         ...         ...      ...       ...              ...   
297    30           1           1        1         1                0   
298    30           1           0        0         1                0   
299    24           1           0        0         1                0   
300    18           1           0        0         1                0   
301    30           1           0        0         1                0   

     ip_failure  ip_success  no_failure  first    td        ts  target  \
0             7           0           7      0   

#### K-means + cosinus

In [None]:
centroids_rows_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

centroids_ = {}

results_KM_SWAP_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

results_ = {}
df_ = {}

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
     # klasteryzacja dotyczy tylko jednego ze zbiorow drugi jest przepisywany
    X_majority = X_train[y_train == 1]
    X_minority = X_train[y_train == 0]
    
    kmeans = KM.fit(X_majority)
    
    #centroids
    for i in range ((int)((count1+count2)/2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        centroids_rows_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    #results_KM_SWAP_ = {}
    
    
    for i in range(len(centroids_)):
        if (len(centroids_rows_[name][i])>1):
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            for j in range(len(centroids_rows_[name][i])):
                index_ = list(centroids_rows_[name][i].index)
                row = centroids_rows_[name][i].iloc[j]
                #index_ = { "index_rows": j, "index_centroid": list(centroids_rows_[name][i].index) } #tworze slwonik wartosci
                index_map = {j: idx for j, idx in enumerate(index_)}
                
                #print(index_)
                dist_[j] = euclidean(centroid, row)         #tworze slwonik wartosci
                
            #print(index_map)    
            min_key = min(dist_, key=dist_.get)
            #print(min_key)
            #new_cent = index_.get
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[min_key]]
            
            #print(results_KM_SWAP_[i])
            
            #results_KM_SWAP_[i] = dist_.iloc[min_index]
            #print(dist_[0])
        else:
            results_KM_SWAP_[name][i] = centroids_rows_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_SWAP_[name].values(), ignore_index=True)    
        #results_KM_SWAP_[name][i]["target"] = 1      
        #results_KM_SWAP_[name][i]["source"] = None
        #print(results_KM_SWAP_[name])    
    
    
    """ for key in ["mix", "smote", "GAN", "borderline"]:
        df_[name] = pd.DataFrame.from_dict(results_KM_SWAP_[name], orient="index")        """   

    
        
    df_y_majority = pd.Series([1] * (int)((count1+count2)/2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
        
    #print(df_majority)        
    df_X_minority = X_minority.reset_index(drop=True)
    df_y_minority = pd.Series([0] * len(X_minority), name="target")
    df_miniority = pd.concat([df_X_minority, df_y_minority], axis=1).reset_index(drop=True)
    #print(df_miniority)
        #print(df_[name])
    df_[name] = pd.concat([df_majority, df_miniority], axis=0).reset_index(drop=True)  
    
    #copy source from sum_all_data 
    columns_ = list(df_[name].columns.values)
    df_[name] = df_[name].merge(sum_all_data, on=columns_, how="left")
    print(df_[name])  
    
    df_[name].to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_KM_cos_data.csv", index=False)

#### K-means + cosinus + distance

#### DBSCAN + distance

#### DBSCAN + cosinus

#### K-means (resampling = calculate by DBSCAN) + centroids

#### K-means (resampling = calculate by DBSCAN) + the nearest neighbour

#### K-means (resampling = calculate by DBSCAN) + cosinus

#### K-means (resampling = calculate by DBSCAN) + cosinus + distance