# Melhoria no tratamento de dados

In [99]:
'''
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.2.2
'''

'\n!pip uninstall -y scikit-learn\n!pip install scikit-learn==1.2.2\n'

In [100]:
import re
from textacy.preprocessing.replace import emails, urls
from textacy.preprocessing.normalize import quotation_marks
from imblearn.under_sampling import RandomUnderSampler, NearMiss, OneSidedSelection
from imblearn.over_sampling import RandomOverSampler, SMOTE
import numpy as np
from sklearn.metrics import matthews_corrcoef, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
import os
import joblib
from pprint import pprint
from sklearn.model_selection import train_test_split


# Importação dos algoritmos de ML 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [101]:
def replace_quoted_string(text):
    """
    Substitui tudo que está entre aspas por _STRING_
    """
    return re.sub(r"\"(.+?)\"|\'(.+?)\'", "_STRING_", text)

def replace_file_path(text):
    """Replace file name or directory name with _PATH_
    """
    #return re.sub(r"(~)*(/[^/ ]*)+", "_PATH_", text) # ((?:[^/]*/)*)(.*)
    return re.sub(r"((?<= )[^ ]*/)([^/ ]*)", r"_PATH_/\2", text) # Modificado para substituir apenas o diretório não o arquivo final

def replace_env_variables(text):
    """Replace $String with $_ENV and String= with _ENV_=.
    """
    text = re.sub(r"\$([A-Za-z_][A-Za-z0-9_]*)", r"$_ENV", text)
    text = re.sub(r"([A-Za-z_][A-Za-z0-9_]*)=", r"_ENV_=", text)
    return text

def replace_ip_address(text):
    """Replace IPv4 with _IP_
    """
    return re.sub("[0-9]+(?:\.[0-9]+){3}", "_IP_", text)

def refine_pipe(text):
    """Make clear seperation between different commands in pipe.
    E.g.: cmd1|cmd2 -> cmd1 | cmd2
    """
    return text.replace("|", " | ")

def normalize(line):
   
    # convert to lowercase
    line = line.lower()
    
    # Replace $String with $_ENV and String= with _ENV_=
    line = replace_env_variables(line)

    # normalize quoted mark
    line = quotation_marks(line) #Função do TEXTACY

    # refine pipe operator
    line = refine_pipe(line)

    # replace quoted text by _STRING_
    line = replace_quoted_string(line)

    # replace IP adress by '_IP_' token
    line = replace_ip_address(line)

    # replace file path by '_PATH_' token
    line = replace_file_path(line)

    # replace email by '_EMAIL_' token
    line = emails(line, "_EMAIL_") #Função do TEXTACY

    # replace URL by '_URL_' token
    line = urls(line, "_URL_") #Função do TEXTACY
    
    return line


# Leitura dos arquivos Z

In [102]:
benign_logs = joblib.load(f"./art_benign.z")
GTFOBins = joblib.load(f"./art_gtfo.z")

benign_labels = [0] * len(benign_logs)
GTFOBins_labels = [1] * len(GTFOBins)

print(len(benign_logs))
print(len(GTFOBins))

660
1457


In [103]:
len(benign_logs[0])

23

# Criação da Janela de 03 Comandos

In [104]:
def create_n_command_by_sliding_window(session_cmds, labels, window_size=3):
    cmds_flat = []
    labels_flat = []

    for sess_idx, cmds in enumerate(session_cmds):
        for i in range(len(cmds) + 1):
            start_idx = max(0, i - window_size)
            if start_idx == i:
                continue
            cmds_flat.append(" ".join(cmds[start_idx:i]))
            labels_flat.append(labels[sess_idx])

    return cmds_flat, labels_flat

In [105]:
WINDOW_SIZE = 3

cmds_flat, labels_flat = create_n_command_by_sliding_window(
    session_cmds=benign_logs+GTFOBins, labels=benign_labels+GTFOBins_labels, window_size=WINDOW_SIZE
)

print("N-commands", len(cmds_flat), len(labels_flat))

N-commands 213859 213859


In [106]:
pprint(cmds_flat[1000:1003])

['git pull git add . git commit -m _STRING_',
 'git add . git commit -m _STRING_ git push',
 'git commit -m _STRING_ git push git checkout']


In [107]:
# try to filter the unique n-commands
unique_indices = []
cmd_set = set()

for i, cmd in enumerate(cmds_flat):
    if cmd not in cmd_set:
        cmd_set.add(cmd)
        unique_indices.append(i)

print(len(cmd_set))

157045


In [108]:
X_unique = [cmds_flat[i] for i in unique_indices]
y_unique = [labels_flat[i] for i in unique_indices]
print(len(X_unique), len(y_unique))

157045 157045


# Classificção de 3-Comandos com TF-IDF

In [109]:
X, y = cmds_flat, labels_flat
print(len(X), len(y))

213859 213859


In [110]:
from sklearn.feature_extraction.text import CountVectorizer

VOCAB_SIZE = 128
N_GRAM = 3

vectorizer = CountVectorizer(ngram_range=(1, N_GRAM), max_features=VOCAB_SIZE)
vectorizer.fit(X)

In [111]:
X_train_encoded = vectorizer.transform(X)

In [112]:
# transform the count vector to TF-IDF

from sklearn.feature_extraction.text import TfidfTransformer

# tfidf_transformer = TfidfTransformer()
tf_transformer = TfidfTransformer().fit(X_train_encoded)

X_train_tfidf = tf_transformer.transform(X_train_encoded)

print(X_train_tfidf.shape)

(213859, 128)


In [113]:
# process tfidf feature with SVD

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50).fit(X_train_tfidf)

X_train_features = svd.transform(X_train_tfidf)

print(X_train_features.shape)

(213859, 50)


# Lidando com dados desbalanceados

Função de aplicação de diversos algoritmos de balanceamento de dados em conjuntos desbalanceados:

1. **Random UnderSampler (RUS):** Este algoritmo reduz a quantidade de exemplos da classe majoritária aleatoriamente, de modo a equilibrar a distribuição das classes no conjunto de dados. Ele remove aleatoriamente exemplos da classe majoritária até que o número de exemplos de cada classe seja mais equilibrado.

2. **NearMiss (NM):** O NearMiss é uma técnica que visa selecionar exemplos da classe majoritária com base na distância de seus vizinhos da classe minoritária. Ele procura pelos exemplos da classe majoritária que estão mais próximos dos exemplos da classe minoritária e os mantém, reduzindo assim a discrepância entre as classes.

3. **OneSidedSelection (OSS):** Este algoritmo utiliza um método de seleção baseado em vizinhos para remover exemplos da classe majoritária que estão longe dos exemplos da classe minoritária. Funciona de forma semelhante ao algoritmo de seleção KNN (K-Nearest Neighbors).

4. **Random OverSampler (ROS):** Ao contrário do UnderSampling, o OverSampling aumenta a quantidade de exemplos da classe minoritária. O ROS gera exemplos sintéticos aleatórios da classe minoritária para equilibrar as classes.

5. **SMOTE (Synthetic Minority Over-sampling Technique):** Similar ao ROS, o SMOTE também trabalha com OverSampling, mas em vez de gerar exemplos sintéticos aleatoriamente, cria novos exemplos sintéticos por meio da interpolação entre exemplos existentes da classe minoritária.

In [114]:
def criar_dataset_balanceado(X, Y):
    '''
    Função de aplicação de diversos algoritmos de balanceamento de dados em conjuntos desbalanceados:
    '''
    
    # Random Undersampler
    rus = RandomUnderSampler(random_state=32)
    X_rus_res, y_rus_res = rus.fit_resample(X, Y)

    # NearMiss
    nm = NearMiss(version=1)
    X_nm_res, y_nm_res = nm.fit_resample(X, Y)

    # OneSidedSelection (Algoritmo tipo KNN)
    oss = OneSidedSelection(random_state=32)
    X_oss_res, y_oss_res = oss.fit_resample(X, Y)

    # Random Oversampler
    ros = RandomOverSampler(random_state=32)
    X_ros_res, y_ros_res = ros.fit_resample(X, Y)

    # SMOTE
    smote = SMOTE(random_state=32)
    X_smote_res, y_smote_res = smote.fit_resample(X, Y)
    
    return X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res

Essa função `fit_predict` realiza uma tarefa de aprendizado supervisionado. Ela divide os dados fornecidos em conjuntos de treino e teste, treina um modelo de aprendizado de máquina usando os dados de treino e, em seguida, faz previsões nos dados de teste. Após as previsões, calcula três métricas de desempenho comuns: acurácia, F1-Score e Coeficiente de Correlação de Matthews (MCC). Essas métricas são então exibidas na tela e retornadas como uma tupla contendo os valores de acurácia, F1-Score e MCC, respectivamente. Este processo permite avaliar o desempenho do modelo usando diferentes métricas para entender sua eficácia na classificação dos dados de teste.

Retorna os dados de Acurácia, F1-Score e MCC: 

```python
Random Undersampler
Accuracy:  0.842946490618485
F1-Score:  0.8308383233532933
MCC:  0.6900884943956258
```

In [115]:
def fit_predict(model, X, Y):
    train_cmds, test_cmds, train_labels, test_labels = train_test_split(
        X, Y, test_size=0.3, random_state=42
    )

    # Treinando o modelo
    model.fit(train_cmds, train_labels)

    # Prevendo os resultados
    y_pred = model.predict(test_cmds)

    # Calculando as métricas
    acc = accuracy_score(test_labels, y_pred)*100
    f1 = f1_score(test_labels, y_pred)*100
    mcc = matthews_corrcoef(test_labels, y_pred)*100
    nota = (acc + 2*f1 + mcc)/4 # Nota final

    # Retornando as métricas
    print ("Accuracy: ", acc)
    print ("F1-Score: ", f1)
    print ("MCC: ", mcc)
    print("Nota: ", nota)

    return {"Accuracy": acc, "F1-Score": f1, "MCC": mcc, "Nota": nota}

Por fim a automação das funções anteriores: 

In [116]:
import pandas as pd

def treinar_modelo(nome_modelo,modelo, X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res):
    resultados = []
    tecnicas = ["Random Undersampler", "NearMiss", "OneSidedSelection", "Random Oversampler", "SMOTE"]

    # Executando fit_predict para cada técnica e coletando resultados
    for tecnica, X_res, y_res in zip(tecnicas, [X_rus_res, X_nm_res, X_oss_res, X_ros_res, X_smote_res], [y_rus_res, y_nm_res, y_oss_res, y_ros_res, y_smote_res]):
        print('###################')
        print(tecnica)
        metricas = fit_predict(modelo, X_res, y_res)
        metricas["Tecnica"] = tecnica
        metricas["Modelo"] = nome_modelo
        resultados.append(metricas)

    # Convertendo a lista de resultados em um DataFrame
    df_resultados = pd.DataFrame(resultados)

    print('###################')
    return df_resultados


## Criação dos dados balanceados

In [117]:
%%time 
X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res = criar_dataset_balanceado(X_train_features, y)

CPU times: user 3min 18s, sys: 4.22 s, total: 3min 22s
Wall time: 24.3 s


## KNN

Treinamento com KNN usando janela de 01 comando e BagOfWords. 

In [118]:
# Classify
from sklearn.neighbors import KNeighborsClassifier

In [119]:
%%time
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
DF_KNN = treinar_modelo("KNN", knn, X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  91.71084337349397
F1-Score:  91.70684667309546
MCC:  83.70857667353523
Nota:  89.70827834830503
###################
NearMiss
Accuracy:  92.04819277108433
F1-Score:  91.58592554818969
MCC:  84.08956364223661
Nota:  89.82740187742507
###################
OneSidedSelection
Accuracy:  98.97120945240286
F1-Score:  65.51724137931033
MCC:  65.03803627014175
Nota:  73.76093212029132
###################
Random Oversampler
Accuracy:  94.15329288192518
F1-Score:  94.10025018584093
MCC:  88.31074161091769
Nota:  92.66613371613118
###################
SMOTE
Accuracy:  94.05982161245862
F1-Score:  93.90111990370617
MCC:  88.20476374969024
Nota:  92.51670629239031
###################
CPU times: user 6min 3s, sys: 6.07 s, total: 6min 9s
Wall time: 42.9 s


## Árvore de decisão 

In [120]:
%%time
DT = DecisionTreeClassifier(criterion='entropy',random_state=0)
DF_DT = treinar_modelo("DT", DT,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  91.7590361445783
F1-Score:  91.91489361702128
MCC:  84.15950164999133
Nota:  89.93708125715305
###################
NearMiss
Accuracy:  92.09638554216868
F1-Score:  91.60696008188332
MCC:  84.2010168908956
Nota:  89.87783064920774
###################
OneSidedSelection
Accuracy:  99.03356039468146
F1-Score:  69.27651139742319
MCC:  68.79702944173007
Nota:  76.59590315781449
###################
Random Oversampler
Accuracy:  94.54935758305477
F1-Score:  94.75409586106474
MCC:  89.43035936237877
Nota:  93.37197716689076
###################
SMOTE
Accuracy:  94.8590801793381
F1-Score:  95.05388144557745
MCC:  90.05747815895198
Nota:  93.75608030736124
###################
CPU times: user 14.4 s, sys: 47.1 ms, total: 14.4 s
Wall time: 14.6 s


## Random Forest

In [121]:
%%time
RF = RandomForestClassifier(n_estimators=100,
                            random_state=0,
                            criterion='gini')
DF_RF= treinar_modelo("RF",RF,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  93.59036144578313
F1-Score:  93.62110311750598
MCC:  87.55720091714052
Nota:  92.0974421494839
###################
NearMiss
Accuracy:  92.53012048192771
F1-Score:  92.10392256749871
MCC:  85.0529369485173
Nota:  90.4477256413606
###################
OneSidedSelection
Accuracy:  99.09591133696009
F1-Score:  70.40816326530613
MCC:  69.95370974037351
Nota:  77.46648690198646
###################
Random Oversampler
Accuracy:  94.60639090001742
F1-Score:  94.80613591462811
MCC:  89.5360214132116
Nota:  93.43867103562131
###################
SMOTE
Accuracy:  94.97473107206793
F1-Score:  95.16330700497089
MCC:  90.28335704767966
Nota:  93.89617553242233
###################
CPU times: user 3min 12s, sys: 812 ms, total: 3min 13s
Wall time: 3min 38s


## Regressão Logística

In [122]:
%%time
RL = LogisticRegression(random_state=1, max_iter=500)
DF_RL=treinar_modelo("RL",RL,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  91.80722891566265
F1-Score:  91.5
MCC:  83.59575909912101
Nota:  89.60074700369591
###################
NearMiss
Accuracy:  91.51807228915662
F1-Score:  91.02040816326532
MCC:  83.02766386008513
Nota:  89.14663811894309
###################
OneSidedSelection
Accuracy:  98.68907143859211
F1-Score:  51.63887291546866
MCC:  51.50666361374082
Nota:  63.36837022081756
###################
Random Oversampler
Accuracy:  90.96814055544114
F1-Score:  90.92096252767028
MCC:  81.93615505754278
Nota:  88.68655516708111
###################
SMOTE
Accuracy:  91.07111737773482
F1-Score:  91.02290465420025
MCC:  82.1421768351273
Nota:  88.81477588031566
###################
CPU times: user 5.22 s, sys: 174 ms, total: 5.39 s
Wall time: 5.47 s


## SVM

In [123]:
%%time
SVM_bow = SVC(kernel="linear",
              random_state=1,
              C=2.0)
DF_SVC=treinar_modelo("SVC",SVM_bow,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  91.80722891566265
F1-Score:  91.33537206931702
MCC:  83.60431647400452
Nota:  89.5205723820753
###################
NearMiss
Accuracy:  91.66265060240963
F1-Score:  91.2316269640142
MCC:  83.30035104564857
Nota:  89.35656389402163
###################
OneSidedSelection
Accuracy:  98.66413106168068
F1-Score:  53.80053908355795
MCC:  53.25466508012291
Nota:  64.87996857722987
###################
Random Oversampler
Accuracy:  90.61326658322903
F1-Score:  90.35204845958445
MCC:  81.31123381325772
Nota:  88.15714932891392
###################
SMOTE
Accuracy:  91.12260578888167
F1-Score:  91.04135191091713
MCC:  82.24883924638503
Nota:  88.86353721427524
###################
CPU times: user 1h 7min 5s, sys: 21.6 s, total: 1h 7min 27s
Wall time: 1h 23min 39s


## Rede Neural

In [124]:
%%time
rede_neural = MLPClassifier(random_state=1,
                            tol=0.00000000001,
                            max_iter=1500,
                            hidden_layer_sizes=(64,64,64),
                            activation='tanh',
                            solver='adam',
                            verbose=False)
DF_NN=treinar_modelo("NN",rede_neural,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  92.86746987951807
F1-Score:  92.95908658420552
MCC:  86.25612642223616
Nota:  91.26044236754132
###################
NearMiss
Accuracy:  91.90361445783132
F1-Score:  91.46341463414635
MCC:  83.78876872221468
Nota:  89.65480311208468
###################
OneSidedSelection
Accuracy:  99.06941218649166
F1-Score:  71.08958837772397
MCC:  70.67645444794059
Nota:  77.98126084747005
###################
Random Oversampler
Accuracy:  94.40439790244135
F1-Score:  94.33965287905255
MCC:  88.8183900271031
Nota:  92.97552342191238
###################
SMOTE
Accuracy:  94.72600243975856
F1-Score:  94.67649598618351
MCC:  89.45685220666975
Nota:  93.38396165469882
###################
CPU times: user 7min, sys: 1min 4s, total: 8min 5s
Wall time: 9min 48s


In [125]:
df_resultados_concatenados = pd.concat([DF_KNN, DF_DT, DF_RF, DF_RL, DF_SVC, DF_NN])
df_resultados_concatenados['Metodo'] = 'TFIDF'
df_resultados_concatenados.to_csv("resultados_concatenados_com_TFIDF.csv", index=False)
df_resultados_concatenados


Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo,Metodo
0,91.710843,91.706847,83.708577,89.708278,Random Undersampler,KNN,TFIDF
1,92.048193,91.585926,84.089564,89.827402,NearMiss,KNN,TFIDF
2,98.971209,65.517241,65.038036,73.760932,OneSidedSelection,KNN,TFIDF
3,94.153293,94.10025,88.310742,92.666134,Random Oversampler,KNN,TFIDF
4,94.059822,93.90112,88.204764,92.516706,SMOTE,KNN,TFIDF
0,91.759036,91.914894,84.159502,89.937081,Random Undersampler,DT,TFIDF
1,92.096386,91.60696,84.201017,89.877831,NearMiss,DT,TFIDF
2,99.03356,69.276511,68.797029,76.595903,OneSidedSelection,DT,TFIDF
3,94.549358,94.754096,89.430359,93.371977,Random Oversampler,DT,TFIDF
4,94.85908,95.053881,90.057478,93.75608,SMOTE,DT,TFIDF
