In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datacompy
import os
import numpy as np

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.tree import plot_tree

# modele + Smote
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler, CondensedNearestNeighbour, TomekLinks
from ctgan import CTGAN

In [2]:
df = pd.read_csv("D:/ml/xgboost-main/data/ssh_logs/SSH.csv")
df = df.drop(columns=["user", "ts"])
print(df.columns)
df.head()
df = df.drop_duplicates()
df.shape
y = df["target"]
X = df.drop(columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
df_data = df.copy()
dell = pd.concat([X_test, y_test], axis=1)      #polaczenie macierzy X_test oraz y_test
#print(dell.shape)                              #271-55=216
target_num = dell['target'].value_counts()
print(f'Klasyfikator: {target_num}')
df_cleaned = df_data.merge(dell, how='left', indicator=True)            # Dodaje kolumnę '_merge' dla identyfikacji
df_cleaned = df_cleaned[df_cleaned['_merge'] == 'left_only'].drop('_merge', axis=1)

Index(['is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
       'ip_failure', 'ip_success', 'no_failure', 'first', 'td', 'target'],
      dtype='object')
Klasyfikator: target
0    44
1    11
Name: count, dtype: int64


In [3]:
# concat all datasets
file_path4 = "D:\\ml\\xgboost-main\\data\\processed\\generated_data_test.csv"        #sciezka wraz z nazwa pod jaka wygenerowac plik
#df_cleaned = pd.read_csv()
smote_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\smote_data.csv")
boarderline_smote_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\boarderlinesmote_data.csv")
gan_data = pd.read_csv("D:\\ml\\xgboost-main\\data\\generated\\GAN_data.csv")

data1 = pd.concat([df_cleaned, smote_data])
data1['source'] = ['original' if i < len(df_cleaned) else 'smote' for i in range(len(data1))]
data2 = pd.concat([boarderline_smote_data, gan_data])
data2['source'] = ['boarderline' if i < len(boarderline_smote_data) else 'gan' for i in range(len(data2))]
data = pd.concat([data1, data2])
data = data.drop_duplicates()
num_duplicates = data.duplicated().sum()
print(num_duplicates)
print(data)
#data.to_csv(file_path4,index=False)
target_num = data['target'].value_counts()
print(f'Klasyfikatory: {target_num}')
columns_list = data.columns
print('Kolumny:\n',columns_list)
data = data.reset_index(drop=True)


y_under = data['target']
X_under = data.loc[:, ['is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
       'ip_failure', 'ip_success', 'no_failure', 'first', 'td']]

RUS = RandomUnderSampler()
X_RUS, y_RUS = RUS.fit_resample(X_under, y_under)
NM = NearMiss()
X_NM, y_NM = NM.fit_resample(X_under, y_under)
CNN = CondensedNearestNeighbour()
X_CNN, y_CNN = CNN.fit_resample(X_under, y_under)
TL = TomekLinks()
X_TL, y_TL = TL.fit_resample(X_under, y_under)

0
     is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
1             1           1        0         1                0           2   
3             1           1        0         1                0           1   
4             1           1        0         1                0           2   
5             1           1        1         1                0           3   
6             1           1        1         1                0           4   
..          ...         ...      ...       ...              ...         ...   
105           1           1        0         1               20          14   
106           1           1        0         1                5          26   
107           0           1        1         0                8          30   
108           1           1        1         0               28           8   
109           1           1        0         0               29          38   

     ip_success  no_failure  first   td  target  

In [4]:
models = {
    "RUS": RandomUnderSampler(),             #random under sampler
    "NM": NearMiss(),
    "CNN": CondensedNearestNeighbour(),
    "TL": TomekLinks()
}

fit_data = {
    "RUS":(X_RUS, y_RUS),
    "NM": (X_NM, y_NM),
    "CNN": (X_CNN, y_CNN),
    "TL": (X_TL, y_TL)
}

### GridSearch

In [7]:
params_RUS = {
    'random_state':[0,10,20,30,40,42] 
}

params_NM = {
    'n_neighbors':[1,2,3,4,5],
}

params_CNN = {
    'random_state':[0,10,20,30,40,42],
    'n_seeds_Sint':[1,2,3,4,5,6,7]
}

params_TL = {
    'random_state':[0,10,20,30,40,42],
}

call="RUS"
goal = ["precision", "f1","recall", "accuracy"] 
path_files = ["D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\precision_grid.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\f1_grid.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\recall_grid.txt", "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\accuracy_grid.txt"]

# Prezentacja wyników analizy
for metrix1, path_file in zip(goal, path_files):
    for model_name, model in models.items():
        with open(path_file, "a+") as f:
    
            model_RUS = RandomUnderSampler()
            model_NM = NearMiss()
            model_CNN = CondensedNearestNeighbour()
            model_TL = TomekLinks()

            X_train, y_train = fit_data[model_name]
    
            print(f'scoring: {metrix1}')
            model_RUS = GridSearchCV(model_RUS,param_grid=params_RUS,scoring=metrix1).fit(X_train, y_train)
            print('Best parameters RUS:', model_RUS.best_params_)
            print('Best score RUS:', model_RUS.best_score_)
            model_NM = GridSearchCV(model_NM,param_grid=params_NM,scoring=metrix1).fit(X_train, y_train)
            print('Best parameters NM:', model_NM.best_params_)
            print('Best score NM:', model_NM.best_score_)
            model_CNN = GridSearchCV(model_CNN,param_grid=params_CNN,scoring=metrix1).fit(X_train, y_train)
            print('Best parameters CNN:', model_CNN.best_params_)
            print('Best score CNN:', model_CNN.best_score_)
            model_TL = GridSearchCV(model_TL,param_grid=params_TL,scoring=metrix1).fit(X_train, y_train)
            print('Best parameters TL:', model_TL.best_params_)
            print('Best score TL:', model_TL.best_score_)

scoring: precision
Best parameters RUS: {'random_state': 0}
Best score RUS: nan
Best parameters NM: {'n_neighbors': 1}
Best score NM: nan


Traceback (most recent call last):
  File "c:\Users\mateu\anaconda3\envs\python8\lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\mateu\anaconda3\envs\python8\lib\site-packages\sklearn\metrics\_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "c:\Users\mateu\anaconda3\envs\python8\lib\site-packages\sklearn\metrics\_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "c:\Users\mateu\anaconda3\envs\python8\lib\site-packages\sklearn\metrics\_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "c:\Users\mateu\anaconda3\envs\python8\lib\site-packages\sklearn\utils\_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
AttributeError: 'RandomUnderSampler' object has no attribute 'predict'

Traceback (most recent

ValueError: Invalid parameter 'n_seeds_Sint' for estimator CondensedNearestNeighbour(). Valid parameters are: ['n_jobs', 'n_neighbors', 'n_seeds_S', 'random_state', 'sampling_strategy'].

In [None]:
#polaczenie dnaych dla rus nm itd i zrobienie przymiarki % 
data_cal = pd.concat([X_resampled, y_resampled], axis=1)
print(data_cal.head)

<bound method NDFrame.head of      is_private  is_failure  is_root  is_valid  not_valid_count  ip_failure  \
0             1           1        0         1                0           2   
1             1           0        0         1                0           0   
2             1           1        0         1                0           2   
3             1           1        1         1                0           3   
4             1           1        1         1                0           4   
..          ...         ...      ...       ...              ...         ...   
332           0           1        0         1                0           7   
396           1           1        0         1               11          33   
294           1           1        0         0               12          43   
96            1           1        0         0               14          45   
444           1           1        1         0                1          44   

     ip_success  no_f

In [None]:
data_cal = pd.merge(data_cal, data[['source']], how='left', left_index=True, right_index=True)          #Dopasowanie kolumn w `data` i `data_cal`
comparison = pd.merge(data_cal, data, how='outer', indicator=True)                  #Połączenie zbiorów danych w celu identyfikacji unikalnych wierszy

#rozklad danych
original_source_distribution = data['source'].value_counts()        #rozkładu `source` w oryginalnym zbiorze danych

used_in_training = comparison[comparison['_merge'] == 'both']       #wiersze obecne w obu zbiorach
training_source_distribution = used_in_training['source'].value_counts()


for 
source_percentage = (training_source_distribution / training_source_distribution.sum() * 100).fillna(0)     #procentowego udziału danych użytych do treningu

#procentowego udziału dla poszczególnych grup
percent_original = source_percentage.get('original', 0)
percent_gan = source_percentage.get('gan', 0)
percent_boarderline = source_percentage.get('boarderline', 0)
percent_smote = source_percentage.get('smote', 0)

path_file = "D:\\ml\\xgboost-main\\reports\\Results\\undersampling\\data_used.txt"
#wyświetlenie wyników
with open (path_file, "a+") as f:
    print("Rozklad `source` w oryginalnym zbiorze danych:", file=f)
    print(original_source_distribution, file=f)
    print("\nRozklad `source` w zbiorze treningowym:", file=f)
    print(training_source_distribution, file=f)
    print("\nProcent uzycia danych do treningu dla kazdej wartosci `source`:", file=f)
    print(f"Oryginal: {percent_original}", file=f)
    print(f"Gan: {percent_gan}", file=f)
    print(f"Boarderline: {percent_boarderline}", file=f)
    print(f"SMOTE: {percent_smote}", file=f)
