### Library

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
import gower
from pathlib import Path

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.spatial import distance
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
#from ctgan import CTGAN

### Read datasets after preprocessed

In [15]:
#oversampling data
original_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\original_data_normalized.csv")
mix_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\mix_data_normalized.csv")
smote_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\smote_data_normalized.csv") 
gan_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\gan_data_normalized.csv")
borderline_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\borderline_data_normalized.csv")

X_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\X_test_norm.csv")

In [16]:
#Before undersampling
print(original_df["target"].value_counts())
count1=original_df["target"].value_counts().sum()
count2=abs((original_df['target']==0).sum() - (original_df['target']==1).sum())
print(f"Before undersampling: {count2}")
print(f"After number of samples: {count1+count2}")

target
0.0    151
1.0     46
Name: count, dtype: int64
Before undersampling: 105
After number of samples: 302


In [17]:
#Sum up all df 
sum_all_data = pd.concat([smote_df, gan_df, borderline_df, original_df], axis=0, ignore_index=True)

#Split data
X_mix, y_mix = mix_df.drop(columns=["target", "source"]), mix_df["target"]
X_smote, y_smote = smote_df.drop(columns=["target", "source"]), smote_df["target"]
X_GAN, y_GAN = gan_df.drop(columns=["target", "source"]), gan_df["target"]
X_borderline, y_borderline = borderline_df.drop(columns=["target", "source"]), borderline_df["target"]

#Dictionary
data = {}
data["mix"] = (X_mix, y_mix)
data["smote"] = (X_smote, y_smote)
data["GAN"] = (X_GAN, y_GAN)
data["borderline"] = (X_borderline, y_borderline)

compare = {}
compare["mix"] = mix_df
compare["smote"] = smote_df
compare["GAN"] = gan_df
compare["borderline"] = borderline_df

### Undersampling df

#### Kmeans + centroids

In [None]:
base = Path("D:/ml/undersampling_data/data/ssh/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&centroids")

df_ = {}

for name in data.keys() & compare.keys():
    # Read data from dictionary
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    # Select majority class
    X_majority = X_train[y_train == 1]
    
    # Apply KMeans clustering
    KM = KMeans(n_clusters=(int)(count2))
    kmeans = KM.fit(X_majority)
    
    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)
    
    # Create a DataFrame for centroids
    X_majority_reduced = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    y_majority_reduced = pd.Series([1] * (int)(count2), name="target") 
    
    # Combine reduced majority class with original minority class
    df_majority = pd.concat([X_majority_reduced, y_majority_reduced], axis=1).reset_index(drop=True)
    
    # Add source column if not present
    df_majority["source"] = None
    missing_source = df_majority[df_majority["source"].isna()]
    if not missing_source.empty:
        df_majority.loc[df_majority["source"].isna(), "source"] = "centroid" 
    
    print(df_majority)
    df_majority = df_majority.reindex(columns=original_df.columns, fill_value=0.0)
    
    # Combine with original minority class
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)  
    
    print(df_[name].info())
    
    # Save to CSV if file does not exist
    file_path = base / f"{name}_KM_centroids.csv"

    if os.path.exists(file_path):
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

In [None]:
    user_cols = [
    'kamran', 'student', 'root', 'admins', 'phoenix', 'piglet',
    'rainbow', 'runner', 'sam', 'abc123', 'passwd', 'newpass',
    'notused', 'Hockey', 'internet', 'asshole', 'Maddock', 'computer',
    'Mickey', 'qwerty', 'fiction', 'orange', 'tigger', 'wheeling',
    'mustang', 'admin', 'jennifer', 'money', 'Justin', 'chris',
    'david', 'foobar', 'buster', 'harley', 'jordan', 'stupid',
    'apple', 'fred', 'summer', 'sunshine', 'andrew', 'osamac',
    'gta', 'adminx', 'gtta', 'osamax'
    ]
    df = df_[name]
    # policz sumę wartości 1 w każdym wierszu
    row_sums = df[user_cols].sum(axis=1)
    mask = row_sums == 1

    # jeśli chcesz sprawdzić globalnie:
    all_ok = mask.all()

    print("Czy każdy wiersz ma dokładnie jedno '1' w kolumnach 1–45? ->", all_ok)
    
    

### KMeans + The Nearest Neighbor

In [None]:
base = Path("D:/ml/undersampling_data/data/ssh/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&nn")

cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_KM_NN_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}


for name in data.keys() & compare.keys():
    # Read data from dictionary
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    # Select majority class (synthetic samples)
    X_majority = X_train[y_train == 1]
    
    # Apply KMeans clustering
    KM = KMeans(n_clusters=(int)(count2))
    kmeans = KM.fit(X_majority)
    
    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)
    
    # Centroids + Add rows to dictionary
    for i in range ((int)(count2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    # Original minority class
    for i in range(len(centroids_)):
        if (len(cluster_data_[name][i])>1):     # If more than one sample in the cluster
            dist_={}
            index_ = {}
            centroid = centroids_.iloc[i]
            for j in range(len(cluster_data_[name][i])):
                index_ = list(cluster_data_[name][i].index)
                row = cluster_data_[name][i].iloc[j]
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = euclidean(centroid, row)         # Colculate Euclidean distance
                
            min_key = min(dist_, key=dist_.get)
            results_KM_NN_[name][i] = cluster_data_[name][i].iloc[[min_key]]    # Select the nearest neighbor to the centroid
            
        else:           # If only one sample in the cluster
            results_KM_NN_[name][i] = cluster_data_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_NN_[name].values(), ignore_index=True)    
    
    # Combine reduced majority class with original df
    df_y_majority = pd.Series([1] * (int)(count2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
     
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True) 
    
    print(df_[name].info())
    
    # Save to CSV if file does not exist
    file_path = base / f"{name}_KM_nn.csv"

    if os.path.exists(file_path):
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

### KMeans + cosinus similarity         #nie działa

In [None]:
base = Path("D:/ml/undersampling_data/data/ssh/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&cos")

cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_KM_COS_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}


for name in data.keys() & compare.keys():
    # Read data from dictionary
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    # Select majority class (synthetic samples)
    X_majority = X_train[y_train == 1]
    
    # Apply KMeans clustering
    KM = KMeans(n_clusters=(int)(count2))
    kmeans = KM.fit(X_majority)
    
    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)
    
    # Centroids + Add rows to dictionary
    for i in range ((int)(count2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    # Original majority class
    for i in range(len(centroids_)):                
        if (len(cluster_data_[name][i])>1):       # If more than one sample in the cluster
            index_ = {}
            centroid = centroids_.iloc[i]
            centroid = centroid.values.reshape(1,-1)
            for j in range(len(cluster_data_[name][i])):
                index_ = list(cluster_data_[name][i].index)
                row = cluster_data_[name][i].iloc[j]
                row = row.values.reshape(1,-1)
                index_map = {j: idx for j, idx in enumerate(index_)}
                dist_[j] = cosine_similarity(centroid, row)         # Colculate Cosine similarity
                
            min_key = max(dist_, key=dist_.get)
            results_KM_COS_[name][i] = cluster_data_[name][i].iloc[[min_key]]   # Select the nearest neighbor to the centroid
            
        else:       # If only one sample in the cluster
            results_KM_COS_[name][i] = cluster_data_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_COS_[name].values(), ignore_index=True)       
    
    # Combine reduced majority class with original df
    df_y_majority = pd.Series([1] * (int)(count2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
     
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True) 
    
    print(df_[name].info())
    
    # Save to CSV if file does not exist
    file_path = base / f"{name}_KM_cos.csv"

    if os.path.exists(file_path):
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

KeyError: "None of [Index([10], dtype='int64')] are in the [index]"

### KMeans + cos + Mahalanobis distance

In [22]:
base = Path("D:/ml/undersampling_data/data/ssh/reduced")
base2 = Path("D:/ml/undersampling_data/models/kmeans&cos&mal")

cluster_data_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}
centroids_ = {}

results_ = {}
df_ = {}

results_KM_COS_MAN_ = {
    "mix": {},
    "smote": {},
    "GAN": {},
    "borderline": {}
}

alfa = 0.5  #wazenie

for name in data.keys() & compare.keys():
    # Read data from dictionary
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    # Select majority class (synthetic samples)
    X_majority = X_train[y_train == 1]
        
    # Apply KMeans clustering
    KM = KMeans(n_clusters=(int)(count2))
    kmeans = KM.fit(X_majority)
    
    # Save KMeans model
    file_path2 = base2 / f"{name}_kmeans_model.pkl"
    with open(file_path2, "wb") as f:
        pickle.dump(kmeans, f)
    
    # Centroids + Add rows to dictionary
    for i in range ((int)(count2)):
        rows_in_cluster = X_majority[kmeans.labels_ == i] 
        cluster_data_[name][i] = rows_in_cluster
        
        centroids_ = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    
    # Original majority class
    for i in range(len(centroids_)):               
        if (len(cluster_data_[name][i])>1):       # If more than one sample in the cluster
            cos_={}
            dist_={}
            comb_score_ = {} # results for cosine similarity and Mahalanobis distance
            index_ = {}
            centroid = centroids_.iloc[i]
            centroid = centroid.values.reshape(1,-1)
            for j in range(len(cluster_data_[name][i])):
                index_ = list(cluster_data_[name][i].index)
                row = cluster_data_[name][i].iloc[j]
                row = row.values.reshape(1,-1)
                index_map = {j: idx for j, idx in enumerate(index_)}
                cos_[j] = cosine_similarity(centroid, row)          # using Cosine similarity
                dist_[j] = distance.mahalanobis(centroid, row, np.linalg.inv(np.cov(X_train.T)))  # using Mahalanobis distance
                
                comb_score_[j] = (alfa*cos_[j] + (1-alfa)*dist_[j])
                
            min_key = min(comb_score_, key=comb_score_.get)
            results_KM_COS_[name][i] = cluster_data_[name][i].iloc[[min_key]]
            
        else:
            results_KM_COS_[name][i] = cluster_data_[name][i].iloc[[0]]
        
        results_[name] = pd.concat(results_KM_COS_[name].values(), ignore_index=True)   
    
    # Combine reduced majority class with original df
    df_y_majority = pd.Series([1] * (int)(count2), name="target") 
    df_majority = pd.concat([results_[name], df_y_majority], axis=1).reset_index(drop=True)
    columns_ = list(df_majority.columns.values)
    df_majority = df_majority.merge(sum_all_data, on=columns_, how="left")
     
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True) 
    
    print(df_[name].info())
    
    # Save to CSV if file does not exist
    file_path = base / f"{name}_KM_cos_mal.csv"

    if os.path.exists(file_path):
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

LinAlgError: Singular matrix