In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datacompy
import os, sys
import numpy as np

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.tree import plot_tree

# modele + Smote
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler, CondensedNearestNeighbour, TomekLinks
from ctgan import CTGAN

### Data

In [45]:
# Load the SSH log table and drop the `user` and `ts` columns, which are not used as features.
df = pd.read_csv("D:/ml/xgboost-main/data/ssh_logs/SSH.csv")
df = df.drop(columns=["user", "ts"])
print(df.columns)
print(df.head())
df = df.drop_duplicates()
print(df.shape)

y = df["target"]
X = df.drop(columns=["target"])

# Seeded and stratified split: without `random_state` every run produced a different
# hold-out set (and therefore a different `df_cleaned`), and without `stratify` the
# class ratio of the small test set varied from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

df_data = df.copy()
dell = pd.concat([X_test, y_test], axis=1)      # hold-out rows: X_test joined with y_test
target_num = dell['target'].value_counts()
print(f'Klasyfikator: {target_num}')

# Anti-join: keep only the rows of the full table that are NOT in the hold-out set.
# `indicator=True` adds a `_merge` column; 'left_only' marks rows absent from `dell`.
# NOTE(review): a later cell reloads X_test / y_test from CSV files; confirm those files
# were produced by this same split, otherwise `df_cleaned` may overlap the test set.
df_cleaned = df_data.merge(dell, how='left', indicator=True)
df_cleaned = df_cleaned[df_cleaned['_merge'] == 'left_only'].drop('_merge', axis=1)

Index(['is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
       'ip_failure', 'ip_success', 'no_failure', 'first', 'td', 'target'],
      dtype='object')
   is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
0           1           1        0         1                0           1   
1           1           1        0         1                0           2   
2           1           0        0         1                0           0   
3           1           1        0         1                0           1   
4           1           1        0         1                0           2   

   ip_success  no_failure  first   td  target  
0           0           2      1    0       0  
1           0           3      0   18       0  
2           1           0      0  133       0  
3           1           2      0   58       0  
4           1           3      0   29       0  
(271, 11)
Klasyfikator: target
0    37
1    18
Name: count, dtype: int64


In [46]:
# Combine the original training rows with the three generated datasets, tag every row
# with the dataset it came from, then under-sample the result with four strategies.
file_path4 = "D:\\ml\\xgboost-main\\data\\processed\\generated_data_test.csv"        # target path for the optional export below
smote_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\smote_data.csv")
boarderline_smote_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\boarderlinesmote_data.csv")
gan_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\GAN_data.csv")

# Persisted hold-out set; the CSVs carry a saved index column named "Unnamed: 0".
y_test = pd.read_csv("D:\\ml\\xgboost-main\\data\\testframe\\y_test.csv")
X_test = pd.read_csv("D:\\ml\\xgboost-main\\data\\testframe\\X_test.csv")
X_test = X_test.drop(columns=["Unnamed: 0"])
y_test = y_test.drop(columns=["Unnamed: 0"])

# Label rows by position: the first len(df_cleaned) rows of the concatenation are the
# original data, the remainder come from the SMOTE file (same scheme for data2).
data1 = pd.concat([df_cleaned, smote_data])
data1['source'] = ['original'] * len(df_cleaned) + ['smote'] * (len(data1) - len(df_cleaned))
data2 = pd.concat([boarderline_smote_data, gan_data])
data2['source'] = ['boarderline'] * len(boarderline_smote_data) + ['gan'] * (len(data2) - len(boarderline_smote_data))

data = pd.concat([data1, data2])
# NOTE(review): `source` takes part in the duplicate check, so a row present in two
# input files under different `source` labels is kept twice - confirm this is intended.
data = data.drop_duplicates()
print(data)
#data.to_csv(file_path4,index=False)
target_num = data['target'].value_counts()
print(f'Klasyfikatory: {target_num}')
columns_list = data.columns
print('Kolumny:\n',columns_list)
# Fresh 0..n-1 index so that positional sample indices and index labels coincide.
data = data.reset_index(drop=True)


y_under = data['target']
X_under = data.loc[:, ['is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
       'ip_failure', 'ip_success', 'no_failure', 'first', 'td']]

# Under-sampling. RandomUnderSampler and CondensedNearestNeighbour draw random samples,
# so they are seeded to make the resampled sets (and every score derived from them)
# reproducible; NearMiss and TomekLinks take no seed.
RUS = RandomUnderSampler(random_state=42)
X_RUS, y_RUS = RUS.fit_resample(X_under, y_under)
NM = NearMiss()
X_NM, y_NM = NM.fit_resample(X_under, y_under)
CNN = CondensedNearestNeighbour(random_state=42)
X_CNN, y_CNN = CNN.fit_resample(X_under, y_under)
TL = TomekLinks()
X_TL, y_TL = TL.fit_resample(X_under, y_under)

     is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
0             1           1        0         1                0           1   
1             1           1        0         1                0           2   
2             1           0        0         1                0           0   
3             1           1        0         1                0           1   
4             1           1        0         1                0           2   
..          ...         ...      ...       ...              ...         ...   
105           1           1        0         1               20          14   
106           1           1        0         1                5          26   
107           0           1        1         0                8          30   
108           1           1        1         0               28           8   
109           1           1        0         0               29          38   

     ip_success  no_failure  first   td  target    

### Data Compare

In [4]:
# Compare two of the resampled feature frames with datacompy; any pair of the
# resampled frames can be substituted for the two positional arguments below.
columns_list = [
    'is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
    'ip_failure', 'ip_success', 'no_failure', 'first', 'td',
]

# All ten feature columns act as join keys.
compare_options = dict(
    join_columns=columns_list,
    abs_tol=0.0001,
    rel_tol=0,
    df1_name='tl',
    df2_name='rus',
)
compare = datacompy.Compare(X_TL, X_RUS, **compare_options)
print(compare.report())

### DANE do uczenia

In [47]:
# Resampled training sets, keyed by the under-sampling method that produced them.
data_resampled = {
    "RUS": (X_RUS, y_RUS),
    "NM": (X_NM, y_NM),
    "CNN": (X_CNN, y_CNN),
    "TL": (X_TL, y_TL),
}

# One estimator per (algorithm, under-sampling method) pair. Keys follow the pattern
# "<ALGORITHM>_<SAMPLER>"; the sampler suffix selects the training data in `fit_data`.
models = {
    # Logistic regression
    "LR_RUS": LogisticRegression(max_iter=100, random_state=0),
    "LR_NM": LogisticRegression(max_iter=100, random_state=0),
    "LR_CNN": LogisticRegression(max_iter=100, random_state=0),
    "LR_TL": LogisticRegression(max_iter=500, random_state=0),

    # Decision tree
    "DT_RUS": DecisionTreeClassifier(max_depth=4, min_samples_leaf=1, random_state=30),
    "DT_NM": DecisionTreeClassifier(max_depth=4, min_samples_leaf=1, random_state=0),
    "DT_CNN": DecisionTreeClassifier(max_depth=8, min_samples_leaf=1, random_state=0),
    "DT_TL": DecisionTreeClassifier(max_depth=8, min_samples_leaf=1, random_state=0),

    # Random forest
    "RF_RUS": RandomForestClassifier(max_depth=8, min_samples_leaf=1, n_estimators=20, random_state=0),
    "RF_NM": RandomForestClassifier(max_depth=4, min_samples_leaf=1, n_estimators=100, random_state=40),
    "RF_CNN": RandomForestClassifier(max_depth=4, min_samples_leaf=2, n_estimators=60, random_state=0),
    "RF_TL": RandomForestClassifier(max_depth=8, min_samples_leaf=1, n_estimators=20, random_state=0),

    # XGBoost (gradient boosting)
    "XGB_RUS": XGBClassifier(max_depth=4, subsample=1),
    "XGB_NM": XGBClassifier(max_depth=4, subsample=0.5),
    "XGB_CNN": XGBClassifier(max_depth=4, subsample=0.5),
    "XGB_TL": XGBClassifier(max_depth=8, subsample=0.7),

    # XGBoost random forest
    "XGB_RF_RUS": XGBRFClassifier(max_depth=4, n_estimators=20, random_state=20, subsample=0.5),
    "XGB_RF_NM": XGBRFClassifier(max_depth=8, n_estimators=20, random_state=20, subsample=0.6),
    "XGB_RF_CNN": XGBRFClassifier(max_depth=8, n_estimators=20, random_state=20, subsample=0.6),
    "XGB_RF_TL": XGBRFClassifier(max_depth=4, n_estimators=20, random_state=10, subsample=0.5),
}

# Training data for each model: the text after the last underscore of the model key
# ("RUS", "NM", "CNN" or "TL") names the resampled set it is trained on.
fit_data = {
    model_name: data_resampled[model_name.rsplit("_", 1)[-1]]
    for model_name in models
}

### Learn model

### GridSearch

In [6]:
# Hyper-parameter grids explored by `run_grid_search`.
params_LR = {                               # logistic regression
    'max_iter': [100, 500, 1000],           # iteration limit
    'random_state': [0, 10, 20, 30, 40],    # fixed seeds for repeatability
}

params_DT = {                               # decision tree
    'max_depth': [4, 8, 12, 16],            # maximum tree depth
    'random_state': [0, 10, 20, 30, 40],    # fixed seeds for repeatability
    'min_samples_leaf': [1, 2],             # minimum number of samples per leaf
}

params_RF = {                               # random forest
    'n_estimators': [20, 40, 60, 80, 100],  # forest size
    'max_depth': [4, 8, 12, 16],            # maximum tree depth
    'min_samples_leaf': [1, 2],             # minimum number of samples per leaf
    'random_state': [0, 10, 20, 30, 40],    # fixed seeds for repeatability
}

params_XGB = {                              # XGBoost
    'max_depth': [4, 8, 12, 16],            # maximum tree depth
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

params_XGB_RF = {                           # XGBoost random forest
    'max_depth': [4, 8, 12, 16],            # maximum tree depth
    'n_estimators': [20, 40, 60, 80, 100],  # forest size
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'random_state': [0, 10, 20, 30, 40],    # fixed seeds for repeatability
}


def run_grid_search(
    datasets,
    scorings=("precision", "f1", "recall", "accuracy"),
    output_dir="D:\\ml\\xgboost-main\\reports\\Results\\undersampling",
    n_jobs=-1,
):
    """Grid-search every algorithm on every resampled dataset and log the winners.

    This replaces a cell that was disabled by wrapping it in a string literal. The
    disabled code could not be re-enabled as written: it rebound `models` and
    `fit_data` to differently shaped objects than the ones the later cells expect,
    referenced an undefined `gs_rus`, and refitted the NM data once more for every
    model. Here the search is an explicit function that only runs when called and
    does not touch `models` or `fit_data`.

    Parameters
    ----------
    datasets : mapping of str -> (X, y)
        Training sets keyed by under-sampling method name, e.g. `data_resampled`.
    scorings : iterable of str
        scikit-learn scoring names; one report file is written per entry.
    output_dir : str
        Directory that receives "<scoring>_grid.txt"; results are appended.
    n_jobs : int
        Forwarded to `GridSearchCV`.

    Returns
    -------
    dict
        Maps (scoring, model_name) to (best_params, best_score), where
        model_name is "<ALGORITHM>_<dataset key>".
    """
    # Estimator classes are looked up here (not at module level) so that a fresh,
    # unfitted estimator is created for every search.
    estimators = {
        "LR": (LogisticRegression, params_LR),
        "DT": (DecisionTreeClassifier, params_DT),
        "RF": (RandomForestClassifier, params_RF),
        "XGB": (XGBClassifier, params_XGB),
        "XGB_RF": (XGBRFClassifier, params_XGB_RF),
    }

    best = {}
    for scoring in scorings:
        print(f'Scoring: {scoring}')
        report_path = os.path.join(output_dir, f"{scoring}_grid.txt")
        # One file handle per scoring instead of reopening the file for each model.
        with open(report_path, "a+") as f:
            for prefix, (estimator_cls, param_grid) in estimators.items():
                for sampler_name, (X_fit, y_fit) in datasets.items():
                    model_name = f"{prefix}_{sampler_name}"
                    print(model_name)
                    gs = GridSearchCV(estimator_cls(), param_grid=param_grid, scoring=scoring, n_jobs=n_jobs)
                    gs.fit(X_fit, y_fit)

                    print(f'Best parameters {model_name}: {gs.best_params_}', file=f)
                    print(f'Best score {model_name}: {gs.best_score_}', file=f)
                    best[(scoring, model_name)] = (gs.best_params_, gs.best_score_)
    return best


# The search is expensive, so it is not executed automatically. To run it:
# grid_results = run_grid_search(data_resampled)

## Wyniki

### Cross-Val

In [None]:
# Cross-validate every model on its resampled training set for each metric, append the
# per-fold scores to one text file per metric, and store the mean scores in an Excel sheet.
# TODO (from the original note): repeat the check for different data sizes (100 rows, 200 rows, ...).

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
goal = ['precision','f1','recall','accuracy']
path_files = ["D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\metrics\\precision_metrics.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\metrics\\f1_metrics.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\metrics\\recall_metrics.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\metrics\\accuracy_metrics.txt"]
excel_file = "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\metrics.xlsx"
name_sheet = "Arkusz2"
results = []

for metrix1, path_file in zip(goal, path_files):
    # Open each report once per metric rather than once per model.
    with open(path_file, "a+") as f:
        for model_name, model in models.items():
            # Training data that belongs to the current model.
            X_fit, y_fit = fit_data[model_name]
            cross_val_results = cross_val_score(model, X_fit, y_fit, cv=cv, scoring=metrix1, n_jobs=1)

            results.append({
                "Model": model_name,
                "Metric": metrix1,
                "Result": cross_val_results.mean()
            })

            # Text report: raw fold scores, their mean and standard deviation.
            print(f'{model_name} Cross-Validation Results {metrix1}:\n {cross_val_results}', file=f)
            print(f'Mean {metrix1}: {cross_val_results.mean()}', file=f)
            print(f'Dev: {cross_val_results.std()}', file=f)
            print("\n", file=f)

# Build the summary frames once, after all scores are collected, instead of rebuilding
# them and rewriting the workbook on every iteration.
df_results = pd.DataFrame(results)
df_save = df_results.pivot(index="Metric", columns="Model", values="Result")

# `pd.ExcelFile` is a reader; writing requires `pd.ExcelWriter`. The default write mode
# replaces the workbook, which matches what passing a plain path to `to_excel` would do.
# NOTE(review): the long-format `df_results` is written, as before; `df_save` (the
# Metric x Model pivot) is computed but not saved - confirm which one is wanted.
with pd.ExcelWriter(excel_file) as w:
    df_results.to_excel(w, sheet_name=name_sheet)
            
                   

In [39]:
# Export the RUS-resampled training set (features plus target) for inspection.

# If X_RUS is not a DataFrame (e.g. a NumPy matrix) wrap it in one. The previous
# `index=True` argument is not a valid index and made this conversion fail.
rus_features = X_RUS if isinstance(X_RUS, pd.DataFrame) else pd.DataFrame(X_RUS)

# Build a NEW frame with the target attached. Assigning the column onto X_RUS itself
# modified the object that `fit_data` / `data_resampled` hold, so every model trained
# afterwards on the RUS data would have received the target as an input feature.
rus_export = rus_features.assign(target=y_RUS)

# Resulting DataFrame
print(rus_export)

# `pd.ExcelFile` only reads workbooks; `pd.ExcelWriter` is the writing counterpart.
# NOTE(review): the file is named "NM_data.xlsx" although the RUS data is exported - confirm.
with pd.ExcelWriter("D:\\ml\\xgboost-main\\reports\\Results\\test\\NM_data.xlsx") as w:
    rus_export.to_excel(w, sheet_name=name_sheet)


### Wyliczenie % uzytych danych

In [48]:
# For every under-sampling method, report which `source` datasets the kept rows came
# from and what share of the resampled training set each source makes up.

# Fitted samplers keyed like `data_resampled`; their `sample_indices_` attribute holds
# the positions (within the frame passed to `fit_resample`) of the rows that were kept.
samplers = {"RUS": RUS, "NM": NM, "CNN": CNN, "TL": TL}

path_file = "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\data_used2.txt"

# Distribution of `source` in the full (pre-resampling) dataset; identical for every method.
original_source_distribution = data['source'].value_counts()

for data_name, (X_1, y_1) in data_resampled.items():
    data_cal = pd.concat([X_1, y_1], axis=1)
    print(data_name)
    print(data_cal.head())

    # Look the kept rows up in `data` by position. Attaching `source` by matching the
    # resampled frame's index against `data` (as before) is only correct if the sampler
    # preserves the original index; `sample_indices_` is correct either way. This relies
    # on the samplers having been fitted on columns of `data` after its index was reset.
    used_in_training = data.iloc[samplers[data_name].sample_indices_]
    assert len(used_in_training) == len(data_cal), "sampler output does not match data_resampled"

    # Distribution of `source` among the rows used for training.
    training_source_distribution = used_in_training['source'].value_counts()

    # Percentage share of each source within the training set.
    source_percentage = (training_source_distribution / training_source_distribution.sum() * 100).fillna(0)

    percent_original = source_percentage.get('original', 0)
    percent_gan = source_percentage.get('gan', 0)
    percent_boarderline = source_percentage.get('boarderline', 0)
    percent_smote = source_percentage.get('smote', 0)

    # Append the report for this method.
    with open (path_file, "a+") as f:
        print(f"Uzyty model undersamplingu: {data_name}", file=f)
        print("Rozklad `source` w oryginalnym zbiorze danych:", file=f)
        print(original_source_distribution, file=f)
        print(f"\nRozklad `source` w zbiorze treningowym {data_name}:", file=f)
        print(training_source_distribution, file=f)
        print(f"\nProcent uzycia danych do treningu {data_name} dla kazdej wartosci `source`:", file=f)
        print(f"Oryginal: {percent_original}", file=f)
        print(f"GAN: {percent_gan}", file=f)
        print(f"Boarderline SMOTE: {percent_boarderline}", file=f)
        print(f"SMOTE: {percent_smote}", file=f)
        print("\n", file=f)


RUS
   is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
0           1           1        0         1                0           1   
1           1           1        0         1                0           2   
2           1           0        0         1                0           0   
3           1           1        0         1                0           1   
4           1           1        0         1                0           2   

   ip_success  no_failure  first   td  target  
0           0           2      1    0       0  
1           0           3      0   18       0  
2           1           0      0  133       0  
3           1           2      0   58       0  
4           1           3      0   29       0  
NM
   is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
0           1           1        0         1                0           1   
1           1           1        0         1                0           2   
2          

#### Heatmap

In [None]:
# Fit every model on its resampled training set, predict the persisted hold-out set and
# save one confusion-matrix heatmap per model.

# The CSVs contain a saved index column ("Unnamed: 0"). It must be dropped, otherwise
# X_test has a column the models were not trained on and y_test has two columns.
y_test = pd.read_csv("D:\\ml\\xgboost-main\\data\\testframe\\y_test.csv").drop(columns=["Unnamed: 0"])
X_test = pd.read_csv("D:\\ml\\xgboost-main\\data\\testframe\\X_test.csv").drop(columns=["Unnamed: 0"])
y_true = y_test.to_numpy().ravel()

# Class order used for BOTH the matrix and the tick labels. `confusion_matrix` orders
# classes ascending (0, 1) by default, which did not match the "1", "0" tick labels.
class_labels = [1, 0]
tick_labels = [str(label) for label in class_labels]

for model_name, model in models.items():

    # Training data that belongs to the current model.
    X_fit, y_fit = fit_data[model_name]
    learned = model.fit(X_fit, y_fit)
    y_pred = learned.predict(X_test)
    conf_mat = confusion_matrix(y_true, y_pred, labels=class_labels)

    fig, ax = plt.subplots()
    # fmt="d": the cells are integer counts, so show them without scientific notation.
    sns.heatmap(conf_mat, annot=True, fmt="d", xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_ylabel("Test", fontsize=13)              # rows: true classes
    ax.set_title(f"Confusion Matrix: {model_name}", fontsize=15, pad=20)
    ax.xaxis.set_label_position("top")
    ax.set_xlabel("Symulacja", fontsize=13)         # columns: predicted classes
    ax.xaxis.tick_top()
    fig.savefig(f'D:\\ml\\xgboost-main\\reports\\conf matrix\\undersampling\\{model_name}.png', bbox_inches="tight", dpi=200)
    plt.show()
    plt.close(fig)