# Melhoria no tratamento de dados

In [1]:
# Disabled setup step: the pip commands below sit inside a string literal, so nothing
# is uninstalled or installed; the cell merely evaluates to (and echoes) that string.
# To really pin scikit-learn 1.2.2, run the two commands outside the quotes.
'''
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.2.2
'''

'\n!pip uninstall -y scikit-learn\n!pip install scikit-learn==1.2.2\n'

In [2]:
import re
from textacy.preprocessing.replace import emails, urls
from textacy.preprocessing.normalize import quotation_marks
from imblearn.under_sampling import RandomUnderSampler, NearMiss, OneSidedSelection
from imblearn.over_sampling import RandomOverSampler, SMOTE
import numpy as np
from sklearn.metrics import matthews_corrcoef, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
import os
import joblib
from pprint import pprint
from sklearn.model_selection import train_test_split


# Importação dos algoritmos de ML 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [3]:
def replace_quoted_string(text):
    """Mask every quoted span in ``text`` with the ``_STRING_`` token.

    A span is a pair of double quotes or a pair of single quotes with at least
    one character between them; the quotes themselves are replaced as well.
    Empty quotes (``""`` / ``''``) are left untouched.

    Args:
        text: Command line to process.

    Returns:
        The text with each quoted span replaced by ``_STRING_``.
    """
    quoted_span = re.compile(r"\"(.+?)\"|\'(.+?)\'")
    return quoted_span.sub("_STRING_", text)

def replace_file_path(text):
    """Mask the directory part of each path with ``_PATH_``, keeping the last component.

    Only tokens that come right after a space are considered, so a path at the
    very start of ``text`` is left unchanged. Everything up to and including the
    last ``/`` of the token becomes ``_PATH_/``; the trailing file name (possibly
    empty) is preserved.

    Args:
        text: Command line to process.

    Returns:
        The text with directory prefixes replaced, e.g.
        ``"cat /etc/passwd"`` -> ``"cat _PATH_/passwd"``.
    """
    # Group 1: directory prefix (must follow a space); group 2: final component.
    path_pattern = re.compile(r"((?<= )[^ ]*/)([^/ ]*)")
    return path_pattern.sub(r"_PATH_/\2", text)

def replace_env_variables(text):
    """Mask environment-variable references and assignments.

    Two substitutions are applied in order:
      1. ``$NAME``  -> ``$_ENV``
      2. ``NAME=``  -> ``_ENV_=``

    ``NAME`` is a letter or underscore followed by letters, digits or underscores.

    Args:
        text: Command line to process.

    Returns:
        The text after both substitutions.
    """
    substitutions = (
        (r"\$([A-Za-z_][A-Za-z0-9_]*)", r"$_ENV"),
        (r"([A-Za-z_][A-Za-z0-9_]*)=", r"_ENV_="),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def replace_ip_address(text):
    """Replace every IPv4-looking dotted quad in ``text`` with the ``_IP_`` token.

    A match is four runs of digits separated by dots. Octet values are not
    range-checked, so a string such as ``999.1.1.1`` is replaced as well.

    Args:
        text: Command line to process.

    Returns:
        The text with each dotted quad replaced by ``_IP_``.
    """
    # Raw string on purpose: "\." inside a normal literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    return re.sub(r"[0-9]+(?:\.[0-9]+){3}", "_IP_", text)

def refine_pipe(text):
    """Put a space on each side of every ``|`` so piped commands tokenize separately.

    Example: ``cmd1|cmd2`` -> ``cmd1 | cmd2``. Every ``|`` character is padded,
    including ones that already have spaces around them.

    Args:
        text: Command line to process.

    Returns:
        The text with each ``|`` replaced by ``" | "``.
    """
    segments = text.split("|")
    return " | ".join(segments)

def normalize(line):
    """Normalize one command line by running a fixed sequence of masking steps.

    The order matters, because each step works on the previous step's output:
      1. lowercase the text
      2. mask environment variables (``$_ENV`` / ``_ENV_=``)
      3. normalize quotation marks (textacy ``quotation_marks``)
      4. pad pipe operators with spaces
      5. mask quoted text with ``_STRING_``
      6. mask IPv4 addresses with ``_IP_``
      7. mask directory prefixes with ``_PATH_``
      8. mask e-mail addresses with ``_EMAIL_`` (textacy ``emails``)
      9. mask URLs with ``_URL_`` (textacy ``urls``)

    Args:
        line: Raw command line.

    Returns:
        The normalized command line.
    """
    steps = (
        lambda s: s.lower(),
        replace_env_variables,
        quotation_marks,
        refine_pipe,
        replace_quoted_string,
        replace_ip_address,
        replace_file_path,
        lambda s: emails(s, "_EMAIL_"),
        lambda s: urls(s, "_URL_"),
    )

    for step in steps:
        line = step(line)

    return line


# Leitura dos arquivos Z

In [4]:
# Load the two session collections from joblib archives in the working directory.
# SECURITY: joblib.load unpickles its input and can therefore execute arbitrary
# code; only load .z files that come from a trusted source.
benign_logs = joblib.load("./art_benign.z")
GTFOBins = joblib.load("./art_gtfo.z")

# One label per session: 0 for benign sessions, 1 for GTFOBins sessions.
benign_labels = [0] * len(benign_logs)
GTFOBins_labels = [1] * len(GTFOBins)

# Report how many sessions each collection holds.
print(len(benign_logs))
print(len(GTFOBins))

660
1457


In [5]:
# Length of the first benign session (each element of benign_logs is later passed
# to the sliding-window builder as one session's sequence of commands).
len(benign_logs[0])

23

# Criação da Janela de 03 Comandos

In [6]:
def create_n_command_by_sliding_window(session_cmds, labels, window_size=3):
    """Flatten sessions into overlapping windows of consecutive commands.

    For every position in a session, the window ending at that position is
    emitted: the commands are joined with a single space and paired with the
    session's label. Windows near the start of a session are shorter than
    ``window_size`` (1, 2, ... commands) until enough commands are available.

    Args:
        session_cmds: Sequence of sessions, each a sequence of command strings.
        labels: Label for each session, indexed like ``session_cmds``.
        window_size: Maximum number of commands per window.

    Returns:
        ``(windows, window_labels)``: two parallel lists, one entry per window.
    """
    windows = []
    window_labels = []

    for session_no, session in enumerate(session_cmds):
        for end in range(len(session) + 1):
            begin = max(0, end - window_size)
            # An empty span (begin == end) carries no command; emit nothing for it.
            if begin != end:
                windows.append(" ".join(session[begin:end]))
                window_labels.append(labels[session_no])

    return windows, window_labels

In [7]:
# Maximum number of consecutive commands joined into one sample.
WINDOW_SIZE = 3

# Build windows over the benign sessions followed by the GTFOBins sessions; the
# label lists are concatenated in the same order so each session keeps its label.
cmds_flat, labels_flat = create_n_command_by_sliding_window(
    session_cmds=benign_logs+GTFOBins, labels=benign_labels+GTFOBins_labels, window_size=WINDOW_SIZE
)

# Both lengths must match: one label per generated window.
print("N-commands", len(cmds_flat), len(labels_flat))

N-commands 213859 213859


In [8]:
# Peek at three consecutive windows to see how neighbouring windows overlap.
pprint(cmds_flat[1000:1003])

['git pull git add . git commit -m _STRING_',
 'git add . git commit -m _STRING_ git push',
 'git commit -m _STRING_ git push git checkout']


In [9]:
# Find the first occurrence of every distinct window text.
# unique_indices collects positions into cmds_flat (in original order);
# cmd_set tracks the window texts that have already been seen.
unique_indices = []
cmd_set = set()

for i, cmd in enumerate(cmds_flat):
    if cmd not in cmd_set:
        cmd_set.add(cmd)
        unique_indices.append(i)

# Number of distinct window texts.
print(len(cmd_set))

157045


In [10]:
# De-duplicated samples: each distinct window text with the label of its first
# occurrence (benign sessions come first in cmds_flat, so a text that appears in
# both classes keeps label 0).
# NOTE(review): X_unique / y_unique are not referenced again in the visible cells;
# the Doc2Vec cells work on the full cmds_flat instead. Confirm this is intended.
X_unique = [cmds_flat[i] for i in unique_indices]
y_unique = [labels_flat[i] for i in unique_indices]
print(len(X_unique), len(y_unique))

157045 157045


# Classificação de 3-Comandos com Doc2Vec

In [11]:
# Work with every generated window, duplicates included.
# NOTE(review): X is not used again in the visible cells, and y is rebound later
# by the result of extract_features; confirm whether this cell is still needed.
X, y = cmds_flat, labels_flat
print(len(X), len(y))

213859 213859


In [12]:
# Create the tagged corpus: one TaggedDocument per window, holding the window's
# whitespace-separated tokens and the window's class label as its tag.

from gensim.models import doc2vec

all_corpus = []

for cmd, lbl in zip(cmds_flat, labels_flat):
    tokens = cmd.split()
    # The tag is the label rendered as a bare string ("0" or "1"), not a per-document id.
    # NOTE(review): gensim documents `tags` as a list of tags; a bare string is passed
    # here, and extract_features relies on that by calling float(tags). Wrapping it in
    # a list would require changing extract_features too. Confirm this is intended.
    all_corpus.append(doc2vec.TaggedDocument(tokens, str(lbl)))

In [13]:
# Build the Doc2Vec model.

# Dimensionality of the inferred document vectors.
VECTOR_SIZE = 128

# min_count is gensim's minimum token frequency for the vocabulary; all other
# hyper-parameters are left at the library defaults.
model = doc2vec.Doc2Vec(
    vector_size = VECTOR_SIZE,
    min_count = 2,
)

# Scan the tagged corpus once to build the vocabulary.
model.build_vocab(all_corpus)

# Train the model on the complete corpus (all benign and GTFOBins windows).
# NOTE(review): the train/test split only happens later, inside fit_predict, so
# windows that end up in a test split have already been seen by this embedding
# model. Confirm this is acceptable for the reported metrics.

model.train(all_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [14]:
# build feature

def extract_features(tagged_corpus, model):
    """Turn a tagged corpus into feature vectors and float labels.

    Each corpus item is unpacked as ``(words, tags)``. The vector comes from
    ``model.infer_vector(words)`` and the label from ``float(tags)``, so ``tags``
    must be something ``float`` accepts (e.g. the string ``"0"`` or ``"1"``).

    Args:
        tagged_corpus: Iterable of ``(words, tags)`` pairs.
        model: Object exposing ``infer_vector(words)``.

    Returns:
        ``(X, y)``: list of inferred vectors and list of float labels, in corpus order.
    """
    pairs = [
        (model.infer_vector(words), float(tags))
        for words, tags in tagged_corpus
    ]
    X = [vector for vector, _ in pairs]
    y = [label for _, label in pairs]
    return X, y

# Infer a vector for every window in the corpus. Despite the "train" in its name,
# X_train_features covers the whole corpus; fit_predict splits it later.
# This rebinds y to the float labels returned by extract_features.
X_train_features, y = extract_features(all_corpus, model)

# One feature vector per label.
print(len(X_train_features), len(y))

213859 213859


# Lidando com dados desbalanceados

Função de aplicação de diversos algoritmos de balanceamento de dados em conjuntos desbalanceados:

1. **Random UnderSampler (RUS):** Este algoritmo reduz a quantidade de exemplos da classe majoritária aleatoriamente, de modo a equilibrar a distribuição das classes no conjunto de dados. Ele remove aleatoriamente exemplos da classe majoritária até que o número de exemplos de cada classe seja mais equilibrado.

2. **NearMiss (NM):** O NearMiss é uma técnica que visa selecionar exemplos da classe majoritária com base na distância de seus vizinhos da classe minoritária. Ele procura pelos exemplos da classe majoritária que estão mais próximos dos exemplos da classe minoritária e os mantém, reduzindo assim a discrepância entre as classes.

3. **OneSidedSelection (OSS):** Este algoritmo de undersampling combina Tomek Links com a regra Condensed Nearest Neighbour (baseada em 1-NN): remove exemplos da classe majoritária que são ruidosos ou estão na fronteira entre as classes e também exemplos redundantes, distantes da fronteira de decisão. Por ser um método de limpeza, não garante que as classes fiquem com o mesmo número de exemplos.

4. **Random OverSampler (ROS):** Ao contrário do UnderSampling, o OverSampling aumenta a quantidade de exemplos da classe minoritária. O ROS replica aleatoriamente (amostragem com reposição) exemplos já existentes da classe minoritária até equilibrar as classes; ele não cria exemplos novos.

5. **SMOTE (Synthetic Minority Over-sampling Technique):** Similar ao ROS, o SMOTE também trabalha com OverSampling, mas em vez de duplicar exemplos existentes, cria novos exemplos sintéticos por meio da interpolação entre exemplos existentes da classe minoritária e seus vizinhos mais próximos.

In [15]:
def criar_dataset_balanceado(X, Y):
    """Resample ``(X, Y)`` with five class-balancing strategies.

    The samplers are applied independently to the same input, in this order:
    RandomUnderSampler, NearMiss (version 1), OneSidedSelection,
    RandomOverSampler and SMOTE. The samplers that take a seed use
    ``random_state=32``.

    Args:
        X: Feature matrix accepted by the samplers' ``fit_resample``.
        Y: Class labels aligned with ``X``.

    Returns:
        A 10-tuple ``(X_rus, y_rus, X_nm, y_nm, X_oss, y_oss, X_ros, y_ros,
        X_smote, y_smote)`` with the resampled features and labels of each
        strategy, in the order listed above.
    """
    samplers = (
        RandomUnderSampler(random_state=32),
        NearMiss(version=1),
        OneSidedSelection(random_state=32),
        RandomOverSampler(random_state=32),
        SMOTE(random_state=32),
    )

    resampled = []
    for sampler in samplers:
        X_res, y_res = sampler.fit_resample(X, Y)
        resampled += [X_res, y_res]

    return tuple(resampled)

Essa função `fit_predict` realiza uma tarefa de aprendizado supervisionado. Ela divide os dados fornecidos em conjuntos de treino (70%) e teste (30%), treina um modelo de aprendizado de máquina usando os dados de treino e, em seguida, faz previsões nos dados de teste. Após as previsões, calcula três métricas de desempenho comuns — acurácia, F1-Score e Coeficiente de Correlação de Matthews (MCC) —, todas multiplicadas por 100, além de uma nota final dada pela média ponderada `(Accuracy + 2*F1-Score + MCC)/4`. Os quatro valores são exibidos na tela e retornados em um dicionário com as chaves `Accuracy`, `F1-Score`, `MCC` e `Nota`. Este processo permite avaliar o desempenho do modelo usando diferentes métricas para entender sua eficácia na classificação dos dados de teste.

Retorna os dados de Acurácia, F1-Score e MCC: 

```python
Random Undersampler
Accuracy:  0.842946490618485
F1-Score:  0.8308383233532933
MCC:  0.6900884943956258
```

In [16]:
def fit_predict(model, X, Y):
    """Fit ``model`` on a 70/30 hold-out split and report its test metrics.

    The data is split with ``train_test_split(test_size=0.3, random_state=42)``,
    the model is fitted in place on the training part, and predictions on the
    test part are scored. All scores are multiplied by 100.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted by this call.
        X: Feature matrix.
        Y: Labels aligned with ``X``.

    Returns:
        Dict with keys ``"Accuracy"``, ``"F1-Score"``, ``"MCC"`` and ``"Nota"``,
        where ``Nota = (Accuracy + 2*F1-Score + MCC)/4``. The same four values
        are printed.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, Y, test_size=0.3, random_state=42
    )

    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)

    scores = {
        "Accuracy": accuracy_score(y_eval, predicted)*100,
        "F1-Score": f1_score(y_eval, predicted)*100,
        "MCC": matthews_corrcoef(y_eval, predicted)*100,
    }
    # Final grade: weighted mean that counts the F1-Score twice.
    scores["Nota"] = (scores["Accuracy"] + 2*scores["F1-Score"] + scores["MCC"])/4

    for label, value in scores.items():
        print(label + ": ", value)

    return scores

Por fim a automação das funções anteriores: 

In [17]:
import pandas as pd

def treinar_modelo(nome_modelo,modelo, X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res):
    """Evaluate one estimator on each of the five resampled datasets.

    For every balancing technique, ``fit_predict`` is called with the same
    ``modelo`` instance and that technique's resampled data; the returned
    metrics are tagged with the technique and model names.

    NOTE(review): the datasets arrive already resampled and ``fit_predict``
    splits them afterwards, so the test part is drawn from resampled data
    (for the oversamplers this can include copies or interpolations of
    training samples). Confirm this is the intended evaluation protocol.

    Args:
        nome_modelo: Label stored in the ``"Modelo"`` column.
        modelo: Estimator passed to ``fit_predict`` for every technique.
        X_rus_res, y_rus_res: Random Undersampler features / labels.
        X_nm_res, y_nm_res: NearMiss features / labels.
        X_oss_res, y_oss_res: OneSidedSelection features / labels.
        X_ros_res, y_ros_res: Random Oversampler features / labels.
        X_smote_res, y_smote_res: SMOTE features / labels.

    Returns:
        ``pandas.DataFrame`` with one row per technique: the metrics returned
        by ``fit_predict`` plus the ``"Tecnica"`` and ``"Modelo"`` columns.
    """
    separator = '###################'
    datasets = (
        ("Random Undersampler", X_rus_res, y_rus_res),
        ("NearMiss", X_nm_res, y_nm_res),
        ("OneSidedSelection", X_oss_res, y_oss_res),
        ("Random Oversampler", X_ros_res, y_ros_res),
        ("SMOTE", X_smote_res, y_smote_res),
    )

    rows = []
    for tecnica, X_res, y_res in datasets:
        print(separator)
        print(tecnica)
        row = fit_predict(modelo, X_res, y_res)
        row["Tecnica"] = tecnica
        row["Modelo"] = nome_modelo
        rows.append(row)

    # One row per technique, columns in the order the keys were inserted.
    summary = pd.DataFrame(rows)

    print(separator)
    return summary


## Criação dos dados balanceados

In [18]:
%%time 
# Build the five resampled versions of the Doc2Vec features once; every model
# cell that follows reuses these same ten variables.
X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res = criar_dataset_balanceado(X_train_features, y)

CPU times: user 7min 18s, sys: 6.91 s, total: 7min 24s
Wall time: 45.2 s


## KNN

Treinamento com KNN usando janela de 03 comandos e Doc2Vec.

In [19]:
# Classify
from sklearn.neighbors import KNeighborsClassifier

In [20]:
%%time
# k-nearest-neighbours classifier with k=5; n_jobs=-1 asks scikit-learn to use all cores.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
# Evaluate it on each resampled dataset; DF_KNN holds one row of metrics per technique.
DF_KNN = treinar_modelo("KNN", knn, X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  85.6867469879518
F1-Score:  86.16674429436422
MCC:  72.30255697364608
Nota:  82.58069813758158
###################
NearMiss
Accuracy:  94.3132530120482
F1-Score:  93.7033084311633
MCC:  89.14176644649741
Nota:  92.71540908021805
###################
OneSidedSelection
Accuracy:  99.03338642680907
F1-Score:  56.43912737508797
MCC:  60.80251974580607
Nota:  68.17854023069778
###################
Random Oversampler
Accuracy:  98.72625592116728
F1-Score:  98.73755613478629
MCC:  97.48435315492175
Nota:  98.4214303364154
###################
SMOTE
Accuracy:  91.76343847530933
F1-Score:  92.36350817408675
MCC:  84.69251836802661
Nota:  90.29574329787735
###################
CPU times: user 12min 11s, sys: 8.37 s, total: 12min 19s
Wall time: 1min 21s


## Árvore de decisão 

In [21]:
%%time
# Decision tree using the entropy split criterion; random_state fixed for repeatability.
DT = DecisionTreeClassifier(criterion='entropy',random_state=0)
# Evaluate it on each resampled dataset; DF_DT holds one row of metrics per technique.
DF_DT = treinar_modelo("DT", DT,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  78.26506024096386
F1-Score:  77.8813143697891
MCC:  56.597416000946666
Nota:  72.65627624537218
###################
NearMiss
Accuracy:  95.18072289156626
F1-Score:  95.0
MCC:  90.35167386883846
Nota:  93.88309919010118
###################
OneSidedSelection
Accuracy:  98.00899465942096
F1-Score:  39.140811455847256
MCC:  38.20514821367229
Nota:  53.623941446196945
###################
Random Oversampler
Accuracy:  99.35441453715879
F1-Score:  99.35613332385309
MCC:  98.71710213400074
Nota:  99.19594582971642
###################
SMOTE
Accuracy:  96.37442372585986
F1-Score:  96.4124752118262
MCC:  92.78797803375932
Nota:  95.4968380458179
###################
CPU times: user 1min 40s, sys: 147 ms, total: 1min 41s
Wall time: 1min 44s


## Random Forest

In [22]:
%%time
# Random forest of 100 trees using the Gini criterion; random_state fixed for repeatability.
RF = RandomForestClassifier(n_estimators=100,
                            random_state=0,
                            criterion='gini')
# Evaluate it on each resampled dataset; DF_RF holds one row of metrics per technique.
DF_RF= treinar_modelo("RF",RF,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  89.20481927710844
F1-Score:  88.40579710144928
MCC:  78.45716109170804
Nota:  86.11839364292877
###################
NearMiss
Accuracy:  98.16867469879517
F1-Score:  98.10756972111552
MCC:  96.34522504380325
Nota:  97.68225979620738
###################
OneSidedSelection
Accuracy:  98.7601111839845
F1-Score:  32.597623089983024
MCC:  43.66449955600958
Nota:  51.904964229990036
###################
Random Oversampler
Accuracy:  99.99920787059774
F1-Score:  99.99920486621873
MCC:  99.99841573112674
Nota:  99.99900833354049
###################
SMOTE
Accuracy:  99.80909681405554
F1-Score:  99.80833923160732
MCC:  99.61819318654345
Nota:  99.76099211595341
###################
CPU times: user 10min 37s, sys: 902 ms, total: 10min 38s
Wall time: 10min 49s


## Regressão Logística

In [23]:
%%time
# Logistic regression; max_iter raised to 500 iterations, random_state fixed.
RL = LogisticRegression(random_state=1, max_iter=500)
# Evaluate it on each resampled dataset; DF_RL holds one row of metrics per technique.
DF_RL=treinar_modelo("RL",RL,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  88.48192771084338
F1-Score:  87.78742973939703
MCC:  76.93976420161135
Nota:  85.2491378478122
###################
NearMiss
Accuracy:  96.67469879518073
F1-Score:  96.49568308786188
MCC:  93.35587524017868
Nota:  95.75548505277078
###################
OneSidedSelection
Accuracy:  98.94437677628908
F1-Score:  49.6274217585693
MCC:  55.76360579653367
Nota:  63.49070652249034
###################
Random Oversampler
Accuracy:  90.81288319259835
F1-Score:  90.56229148018554
MCC:  81.7071047966614
Nota:  88.4111427374077
###################
SMOTE
Accuracy:  91.97414489631026
F1-Score:  91.82111721020343
MCC:  83.98367245406322
Nota:  89.9000129426951
###################
CPU times: user 2min 6s, sys: 55.5 s, total: 3min 2s
Wall time: 21.1 s


## SVM

In [24]:
%%time
# Support-vector classifier with a linear kernel and C=2.0.
# The "_bow" suffix is a leftover name: the features used here are Doc2Vec vectors.
# NOTE(review): this cell's recorded wall time is over four hours; if a linear
# kernel is all that is needed, consider evaluating sklearn.svm.LinearSVC instead.
SVM_bow = SVC(kernel="linear",
              random_state=1,
              C=2.0)
# Evaluate it on each resampled dataset; DF_SVC holds one row of metrics per technique.
DF_SVC=treinar_modelo("SVC",SVM_bow,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  89.63855421686748
F1-Score:  88.96870189840944
MCC:  79.27794152532847
Nota:  86.7134748847537
###################
NearMiss
Accuracy:  97.25301204819277
F1-Score:  97.13998996487707
MCC:  94.49747616730116
Nota:  96.50761703631203
###################
OneSidedSelection
Accuracy:  99.09741091227085
F1-Score:  60.02766251728908
MCC:  64.03883729669599
Nota:  70.79789331088625
###################
Random Oversampler
Accuracy:  91.55906908952647
F1-Score:  91.29824102957748
MCC:  83.22883130137903
Nota:  89.34609561251511
###################
SMOTE
Accuracy:  92.65379192344861
F1-Score:  92.49906986525178
MCC:  85.35394082396745
Nota:  90.7514681194799
###################
CPU times: user 1h 55min 45s, sys: 14.7 s, total: 1h 55min 59s
Wall time: 4h 8min 9s


## Rede Neural

In [25]:
%%time
# Multi-layer perceptron: three hidden layers of 64 units, tanh activation, Adam solver.
# The very small tol together with max_iter=1500 lets training run for a long time
# before it is considered converged.
rede_neural = MLPClassifier(random_state=1,
                            tol=0.00000000001,
                            max_iter=1500,
                            hidden_layer_sizes=(64,64,64),
                            activation='tanh',
                            solver='adam',
                            verbose=False)
# Evaluate it on each resampled dataset; DF_NN holds one row of metrics per technique.
DF_NN=treinar_modelo("NN",rede_neural,X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res)

###################
Random Undersampler
Accuracy:  91.42168674698794
F1-Score:  91.23152709359604
MCC:  82.89669474720993
Nota:  89.1953589203475
###################
NearMiss
Accuracy:  97.63855421686746
F1-Score:  97.5413948820873
MCC:  95.26976675382699
Nota:  96.99777768371726
###################
OneSidedSelection
Accuracy:  99.27074549486242
F1-Score:  76.47355163727958
MCC:  76.10609145403356
Nota:  82.08098505586379
###################
Random Oversampler
Accuracy:  99.31163954943679
F1-Score:  99.3117758400849
MCC:  98.62643234504814
Nota:  99.1404058936637
###################
SMOTE
Accuracy:  99.64750241599468
F1-Score:  99.64702429583329
MCC:  99.29618214702758
Nota:  99.55943328867221
###################
CPU times: user 2h 42min 3s, sys: 6min 10s, total: 2h 48min 13s
Wall time: 14min 49s


In [26]:
# Stack the per-model result tables into a single frame.
# NOTE(review): concat keeps each frame's own 0-4 index, so index labels repeat in
# the combined frame; pass ignore_index=True if a unique index is wanted.
df_resultados_concatenados = pd.concat([DF_KNN, DF_DT, DF_RF, DF_RL, DF_SVC, DF_NN])
# Tag every row with the feature-extraction method used in this notebook.
df_resultados_concatenados['Metodo'] = 'DOC2VEC'
# Persist the table to the working directory; the index is not written to the CSV.
df_resultados_concatenados.to_csv("resultados_concatenados_com_doc2vec.csv", index=False)
df_resultados_concatenados

Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo,Metodo
0,85.686747,86.166744,72.302557,82.580698,Random Undersampler,KNN,DOC2VEC
1,94.313253,93.703308,89.141766,92.715409,NearMiss,KNN,DOC2VEC
2,99.033386,56.439127,60.80252,68.17854,OneSidedSelection,KNN,DOC2VEC
3,98.726256,98.737556,97.484353,98.42143,Random Oversampler,KNN,DOC2VEC
4,91.763438,92.363508,84.692518,90.295743,SMOTE,KNN,DOC2VEC
0,78.26506,77.881314,56.597416,72.656276,Random Undersampler,DT,DOC2VEC
1,95.180723,95.0,90.351674,93.883099,NearMiss,DT,DOC2VEC
2,98.008995,39.140811,38.205148,53.623941,OneSidedSelection,DT,DOC2VEC
3,99.354415,99.356133,98.717102,99.195946,Random Oversampler,DT,DOC2VEC
4,96.374424,96.412475,92.787978,95.496838,SMOTE,DT,DOC2VEC
