### Library

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
import gower
from pathlib import Path

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.spatial import distance
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
#from ctgan import CTGAN

### Read datasets after preprocessed

In [22]:
#oversampling data
original_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\original_data_normalized.csv")
mix_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\mix_data_normalized.csv")
smote_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\smote_data_normalized.csv") 
gan_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\GAN_data_normalized.csv")
borderline_df = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\encoded_normalized\\borderline_data_normalized.csv")

X_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\test\\X_test_norm.csv")

In [23]:
#Before undersampling
print(original_df["target"].value_counts())
count1=original_df["target"].value_counts().sum()
count2=abs((original_df['target']==0).sum() - (original_df['target']==1).sum())
print(f"Before undersampling: {count2}")
print(f"After number of samples: {count1+count2}")

target
0    151
1     46
Name: count, dtype: int64
Before undersampling: 105
After number of samples: 302


In [24]:
#Sum up all df 
sum_all_data = pd.concat([smote_df, gan_df, borderline_df, original_df], axis=0, ignore_index=True)

#Split data
X_mix, y_mix = mix_df.drop(columns=["target", "source"]), mix_df["target"]
X_smote, y_smote = smote_df.drop(columns=["target", "source"]), smote_df["target"]
X_GAN, y_GAN = gan_df.drop(columns=["target", "source"]), gan_df["target"]
X_borderline, y_borderline = borderline_df.drop(columns=["target", "source"]), borderline_df["target"]

#Dictionary
data = {}
data["mix"] = (X_mix, y_mix)
data["smote"] = (X_smote, y_smote)
data["GAN"] = (X_GAN, y_GAN)
data["borderline"] = (X_borderline, y_borderline)

compare = {}
compare["mix"] = mix_df
compare["smote"] = smote_df
compare["GAN"] = gan_df
compare["borderline"] = borderline_df

In [34]:
print(mix_df.info())
print(smote_df.info())
print(gan_df.info())
print(borderline_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 59 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   kamran           315 non-null    float64
 1   student          315 non-null    float64
 2   root             315 non-null    float64
 3   admins           315 non-null    float64
 4   phoenix          315 non-null    float64
 5   piglet           315 non-null    float64
 6   rainbow          315 non-null    float64
 7   runner           315 non-null    float64
 8   sam              315 non-null    float64
 9   abc123           315 non-null    float64
 10  passwd           315 non-null    float64
 11  newpass          315 non-null    float64
 12  notused          315 non-null    float64
 13  Hockey           315 non-null    float64
 14  internet         315 non-null    float64
 15  asshole          315 non-null    float64
 16  Maddock          315 non-null    float64
 17  computer        

### Undersampling df

#### Kmeans + centroids

In [25]:
def normalize_col(c):
    s = str(c).strip()
    # Jeśli to czysta liczba, ujednolić np. do STRINGA (lub do int – ale trzymaj się jednego wyboru wszędzie!)
    return s
def clean_df_cols(df: pd.DataFrame) -> pd.DataFrame:
    # usuń Unnamed
    df = df.loc[:, ~df.columns.astype(str).str.contains(r"^Unnamed", na=False)]
    # normalizuj nazwy
    df.columns = [normalize_col(c) for c in df.columns]
    return df
original_df = clean_df_cols(original_df)
print(original_df.columns)

Index(['kamran', 'root', 'admins', 'phoenix', 'piglet', 'runner', 'sam',
       'newpass', 'notused', 'internet', 'asshole', 'Maddock', 'computer',
       'Mickey', 'qwerty', 'fiction', 'orange', 'tigger', 'wheeling',
       'mustang', 'jennifer', 'money', 'chris', 'david', 'foobar', 'buster',
       'harley', 'jordan', 'stupid', 'apple', 'fred', 'summer', 'andrew',
       'osamac', 'gta', 'adminx', 'gtta', 'osamax', 'is_private', 'is_failure',
       'is_root', 'is_valid', 'not_valid_count', 'ip_failure', 'ip_success',
       'no_failure', 'first', 'td', 'ts', 'target', 'source'],
      dtype='object')


In [33]:
KM = KMeans(n_clusters=(int)(count2))
base = Path("D:/ml/undersampling_data/data/ssh/reduced")

df_ = {}

for name in data.keys() & compare.keys():
    # Read data from dictionary
    X_train, y_train = data[name]
    compare_df = compare[name]
    
    # Select majority class
    X_majority = X_train[y_train == 1]
    
    # Apply KMeans clustering
    kmeans = KM.fit(X_majority)
    
    # Create a DataFrame for centroids
    X_majority_reduced = pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns)
    y_majority_reduced = pd.Series([1] * (int)(count2), name="target") 
    
    # Combine reduced majority class with original minority class
    df_majority = pd.concat([X_majority_reduced, y_majority_reduced], axis=1).reset_index(drop=True)
    
    # Add source column if not present
    df_majority["source"] = None
    missing_source = df_majority[df_majority["source"].isna()]
    if not missing_source.empty:
        df_majority.loc[df_majority["source"].isna(), "source"] = "centroid" 
    
    print(df_majority)
    df_majority = df_majority.reindex(columns=original_df.columns, fill_value=0.0)
    
    # Combine with original minority class
    df_[name] = pd.concat([df_majority, original_df], axis=0).reset_index(drop=True)  
    
    print(df_[name].info())
    
    user_cols = [
    'kamran', 'student', 'root', 'admins', 'phoenix', 'piglet',
    'rainbow', 'runner', 'sam', 'abc123', 'passwd', 'newpass',
    'notused', 'Hockey', 'internet', 'asshole', 'Maddock', 'computer',
    'Mickey', 'qwerty', 'fiction', 'orange', 'tigger', 'wheeling',
    'mustang', 'admin', 'jennifer', 'money', 'Justin', 'chris',
    'david', 'foobar', 'buster', 'harley', 'jordan', 'stupid',
    'apple', 'fred', 'summer', 'sunshine', 'andrew', 'osamac',
    'gta', 'adminx', 'gtta', 'osamax'
    ]
    df = df_[name]
    # policz sumę wartości 1 w każdym wierszu
    row_sums = df[user_cols].sum(axis=1)
    mask = row_sums == 1

    # jeśli chcesz sprawdzić globalnie:
    all_ok = mask.all()

    print("Czy każdy wiersz ma dokładnie jedno '1' w kolumnach 1–45? ->", all_ok)
    
    
    # Save to CSV if file does not exist
    file_path = base / f"{name}_KM_centroids.csv"

    if os.path.exists(file_path):
        print("File exists.")
    else:
        df_[name].to_csv(file_path, index=False)
        print("File saved.")

  super()._check_params_vs_input(X, default_n_init=10)


     phoenix  rainbow  runner  sam  abc123  passwd  notused  internet  \
0        0.0      0.0     0.0  0.0     0.0     0.0      1.0       0.0   
1        0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
2        0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
3        0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
4        0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
..       ...      ...     ...  ...     ...     ...      ...       ...   
100      0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
101      0.0      0.0     0.0  0.0     0.0     0.0      0.0       1.0   
102      0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
103      0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   
104      0.0      0.0     0.0  0.0     0.0     0.0      0.0       0.0   

     asshole  Maddock  ...  is_valid  not_valid_count  ip_failure  ip_success  \
0        0.0      0.0  ...       0.0      

KeyError: "['student', 'rainbow', 'abc123', 'passwd', 'Hockey', 'admin', 'Justin', 'sunshine'] not in index"