# Melhoria no tratamento de dados

In [1]:
'''
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.2.2
'''


'\n!pip uninstall -y scikit-learn\n!pip install scikit-learn==1.2.2\n'

Importação de bibliotecas importantes

In [2]:
import re
from textacy.preprocessing.replace import emails, urls
from textacy.preprocessing.normalize import quotation_marks
from imblearn.under_sampling import RandomUnderSampler, NearMiss, OneSidedSelection
from imblearn.over_sampling import RandomOverSampler, SMOTE
import numpy as np
from sklearn.metrics import matthews_corrcoef, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split



# Importação dos algoritmos de ML 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

Criação da função de tratamento de texto

In [3]:
def replace_quoted_string(text):
    """Replace every single- or double-quoted span with the ``_STRING_`` token.

    Empty quoted strings (``""`` and ``''``) are replaced as well. The previous
    pattern required at least one character between the quotes, so an empty
    string was skipped and its closing quote was then paired with the opening
    quote of the next string (``echo "" "a"`` became ``echo _STRING_a"``).

    Parameters
    ----------
    text : str
        A single command line.

    Returns
    -------
    str
        ``text`` with each quoted span replaced by ``_STRING_``.
    """
    # Non-greedy so that two quoted strings on one line are replaced separately.
    return re.sub(r"\".*?\"|'.*?'", "_STRING_", text)

def replace_file_path(text):
    """Mask the directory part of a path with ``_PATH_`` and keep the last component.

    Only tokens that come right after a space character are rewritten, so a
    path at the very beginning of ``text`` is left untouched.

    Parameters
    ----------
    text : str
        A single command line.

    Returns
    -------
    str
        ``text`` where every matching ``dir/.../leaf`` became ``_PATH_/leaf``.
    """
    # Group 1: the token up to and including its last "/" (must follow a space).
    # Group 2: the final component, which is re-emitted after "_PATH_/".
    directory_then_leaf = re.compile(r"((?<= )[^ ]*/)([^/ ]*)")
    return directory_then_leaf.sub(r"_PATH_/\2", text)

def replace_env_variables(text):
    """Mask shell variable names.

    ``$NAME`` is rewritten to ``$_ENV`` and every ``NAME=`` to ``_ENV_=``,
    where ``NAME`` is a letter or underscore followed by letters, digits or
    underscores. The two substitutions run in that order.

    Parameters
    ----------
    text : str
        A single command line.

    Returns
    -------
    str
        ``text`` with variable references and assignments masked.
    """
    substitutions = (
        (r"\$([A-Za-z_][A-Za-z0-9_]*)", r"$_ENV"),   # variable expansion
        (r"([A-Za-z_][A-Za-z0-9_]*)=", r"_ENV_="),   # assignment / name=value
    )
    for pattern, token in substitutions:
        text = re.sub(pattern, token, text)
    return text

def replace_ip_address(text):
    """Replace every dotted-quad number (IPv4-shaped token) with ``_IP_``.

    The pattern is four runs of digits separated by dots; it does not check
    that each run is in the 0-255 range.

    Parameters
    ----------
    text : str
        A single command line.

    Returns
    -------
    str
        ``text`` with each dotted quad replaced by ``_IP_``.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r"[0-9]+(?:\.[0-9]+){3}", "_IP_", text)

def refine_pipe(text):
    """Surround every ``|`` with one space on each side.

    E.g. ``cmd1|cmd2`` becomes ``cmd1 | cmd2``. Spaces already present around
    a pipe are kept, so ``cmd1 | cmd2`` becomes ``cmd1  |  cmd2``.
    """
    pieces = text.split("|")
    return " | ".join(pieces)

def normalize(line):
    """Normalise one command line into a canonical, token-friendly form.

    The line is lower-cased and then passed, in this exact order, through:
    environment-variable masking, quotation-mark normalisation (textacy),
    pipe spacing, quoted-string masking, IPv4 masking, file-path masking,
    e-mail masking (textacy, ``_EMAIL_``) and URL masking (textacy, ``_URL_``).

    Parameters
    ----------
    line : str
        A single command line.

    Returns
    -------
    str
        The normalised line.
    """
    line = line.lower()

    # NOTE(review): replace_file_path runs before urls(); the benign-session
    # output in this notebook shows URLs already turned into "_PATH_/<file>",
    # so the _URL_ step may rarely fire - confirm this ordering is intended.
    steps = (
        replace_env_variables,            # $String -> $_ENV, String= -> _ENV_=
        quotation_marks,                  # textacy: normalise quotation marks
        refine_pipe,                      # put spaces around "|"
        replace_quoted_string,            # quoted text -> _STRING_
        replace_ip_address,               # IPv4 address -> _IP_
        replace_file_path,                # directory part of a path -> _PATH_
        lambda s: emails(s, "_EMAIL_"),   # textacy: e-mail -> _EMAIL_
        lambda s: urls(s, "_URL_"),       # textacy: URL -> _URL_
    )
    for step in steps:
        line = step(line)

    return line


# Leitura dos dados do GTFOBins 

https://gtfobins.github.io/gtfobins.json

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pprint import pprint

In [5]:
# GTFOBins publishes its whole catalogue as one JSON document.
GTFOBINS_URL = 'https://gtfobins.github.io/gtfobins.json'
gtfo_df = pd.read_json(GTFOBINS_URL)

# Transposed view (one row per binary); show the first five rows.
gtfo_df.transpose().head(n=5)

Unnamed: 0,description,functions
7z,,{'file-read': [{'code': 'LFILE=file_to_read 7z...
aa-exec,,"{'shell': [{'code': 'aa-exec /bin/sh'}], 'suid..."
ab,,{'file-upload': [{'description': 'Upload local...
agetty,,{'suid': [{'code': './agetty -o -p -l /bin/sh ...
alpine,The file is displayed in the `alpine` curses t...,{'file-read': [{'code': 'LFILE=file_to_read al...


Cada entrada do GTFOBins traz os comandos em uma única string, com `\n` marcando as quebras de linha.

In [6]:
# Collect the 'code' snippet of every example of every function of every binary.
# Assumes each `functions` cell is a dict mapping a function name to a list of
# dict examples, as the preview of gtfo_df shows.
gtfo_list = [
    example['code']
    for functions in gtfo_df.T.functions
    for examples in functions.values()
    for example in examples
    if 'code' in example
]
gtfo_list[10:20]


['LFILE=file_to_read\nalpine -F "$LFILE"\n',
 'LFILE=file_to_read\n./alpine -F "$LFILE"\n',
 'LFILE=file_to_read\nsudo alpine -F "$LFILE"\n',
 "TF=$(mktemp)\necho '[{hosts: localhost, tasks: [shell: /bin/sh </dev/tty >/dev/tty 2>/dev/tty]}]' >$TF\nansible-playbook $TF\n",
 "TF=$(mktemp)\necho '[{hosts: localhost, tasks: [shell: /bin/sh </dev/tty >/dev/tty 2>/dev/tty]}]' >$TF\nsudo ansible-playbook $TF\n",
 'ansible-test shell',
 'sudo ansible-test shell',
 'aoss /bin/sh',
 'sudo aoss /bin/sh',
 'LFILE=file_to_read\napache2ctl -c "Include $LFILE" -k stop\n']

Tratamento dos `\n`

Ao fim, cada entrada receberá uma lista com os comandos já separados a cada quebra de linha. 

In [11]:
# Split every GTFOBins snippet on "\n", giving one list of lines per snippet.
gfto_split = [snippet.split('\n') for snippet in gtfo_list]

# Normalise each line with normalize() and drop the empty ones.
# The filter is applied while building the list: removing items from a list
# that is being iterated skips the element after each removal, so two
# consecutive empty strings were not both removed.
gtfo_norm = [
    [cmd for cmd in map(normalize, lines) if cmd != '']
    for lines in gfto_split
]

gtfo_norm[1:5]

[['_ENV_=file_to_read',
  'sudo 7z a -ttar -an -so $_ENV  |  7z e -ttar -si -so'],
 ['aa-exec _PATH_/sh'],
 ['./aa-exec _PATH_/sh -p'],
 ['sudo aa-exec _PATH_/sh']]

Verificando quantidade de comandos únicos. 

In [12]:
# Flatten the per-snippet lists into a single list of normalised lines.
lolbins_linux = [line for snippet in gtfo_norm for line in snippet]

# Deduplicate and show how many distinct lines there are.
lolbins_unique = list(set(lolbins_linux))
len(lolbins_unique)

1667

# Leitura dos dados benignos

In [13]:
# Root folder of the datasets and the sub-folder holding the bash history files.
data_path = "./Datasets/"
bash_logs_path = f"{data_path}bash_logs/"

In [14]:
# Read every bash history file; each file becomes one session (list of commands).
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the session order - and therefore the
# seeded train/test split further down - differ between machines and runs.
bash_file_names = sorted(os.listdir(bash_logs_path))
benign_logs = []

for file in bash_file_names:
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join(bash_logs_path, file), encoding="utf-8") as f:
        raw_session = f.read()

    # "&&" and ";" chain several commands on one line: split them apart so
    # each command is normalised on its own.
    commands = raw_session.replace("&&", "\n").replace(";", "\n").splitlines()
    benign_logs.append([normalize(line) for line in commands])

In [15]:
# Show the normalised commands of the first benign session.
print ("Comandos da Primeira sessão")
benign_logs[0]

Comandos da Primeira sessão


['w',
 'ls',
 'cd _PATH_/tmp',
 'wget',
 'ps ax',
 'wget _PATH_/psybnc.tar.gz',
 'tar xzvf psybnc.tar.gz',
 'cd psybnc',
 'make',
 './psybnc',
 'ls',
 'make',
 'cd ..',
 'wget _PATH_/psyl.tar.gz',
 'tar xzvf psyl.tar.gz',
 'cd .bash',
 './xinetd',
 'ls',
 'w',
 'rm -rf _PATH_/log',
 'cd _PATH_/home',
 'ls',
 'rm -rf .bash_history']

In [16]:
import itertools
# Distinct normalised commands across all benign sessions.
benign_unique = {command for session in benign_logs for command in session}
print(len(benign_unique))

54993


In [17]:
# Drop the empty command, if present (benign_unique is a set, so it can occur at most once).
benign_unique.discard("")

In [18]:
# Number of distinct benign commands after removing the empty string.
print(len(benign_unique))

54992


# Salvando os dados com JobLib

Salvando as bases de dados cruas para caso seja necessário retornar desse ponto.

In [19]:
import joblib
# Persist the normalised benign sessions and GTFOBins snippets so later work
# can resume from this point without re-reading and re-normalising the data.
joblib.dump(benign_logs, "art_benign.z")
joblib.dump(gtfo_norm, "art_gtfo.z")

['art_gtfo.z']

# Criação da base completa

In [20]:
def _flatten_stripped(sessions):
    """Return every command of every session with surrounding whitespace removed."""
    return [cmd.strip() for sess in sessions for cmd in sess]


# One flat list of single commands per source.
# NOTE(review): commands that are empty after strip() are kept in these lists -
# confirm that empty samples are wanted in the corpus.
benign_one_cmd = _flatten_stripped(benign_logs)
gtfo_one_cmd = _flatten_stripped(gtfo_norm)

unique_benign_one_cmd = set(benign_one_cmd)
unique_gtfo_one_cmd = set(gtfo_one_cmd)

# Total versus distinct command counts for each source.
print(len(benign_one_cmd), len(unique_benign_one_cmd))
print(len(gtfo_one_cmd), len(unique_gtfo_one_cmd))


210402 53091
3457 1649


Dados claramente desbalanceados 

https://medium.com/turing-talks/dados-desbalanceados-o-que-são-e-como-evitá-los-43df4f49732b

In [21]:
# Build the single-command dataset: benign commands are labelled 0 and
# GTFOBins commands are labelled 1.
one_cmd_corpus = benign_one_cmd + gtfo_one_cmd
one_cmd_labels = [0] * len(benign_one_cmd) + [1] * len(gtfo_one_cmd)

print(len(one_cmd_corpus), len(one_cmd_labels))

# Show one GTFOBins sample. Its position is derived from the list sizes rather
# than hard-coded: GTFOBins is downloaded live, so the corpus size changes over
# time and a fixed index eventually raises IndexError.
if gtfo_one_cmd:
    pprint(one_cmd_corpus[len(benign_one_cmd)])

# Share of malicious (GTFOBins) commands in the corpus, in percent.
propocao = (len(gtfo_one_cmd) / len(one_cmd_corpus)) * 100 if one_cmd_corpus else 0.0
print ("Proporção dos comandos maliciosos: ", propocao, "%")

213859 213859
'_ENV_=_STRING_'
Proporção dos comandos maliciosos:  1.6164856283813165 %


# Lidando com dados desbalanceados

Função de aplicação de diversos algoritmos de balanceamento de dados em conjuntos desbalanceados:

1. **Random UnderSampler (RUS):** Este algoritmo reduz a quantidade de exemplos da classe majoritária aleatoriamente, de modo a equilibrar a distribuição das classes no conjunto de dados. Ele remove aleatoriamente exemplos da classe majoritária até que o número de exemplos de cada classe seja mais equilibrado.

2. **NearMiss (NM):** O NearMiss é uma técnica que visa selecionar exemplos da classe majoritária com base na distância de seus vizinhos da classe minoritária. Ele procura pelos exemplos da classe majoritária que estão mais próximos dos exemplos da classe minoritária e os mantém, reduzindo assim a discrepância entre as classes.

3. **OneSidedSelection (OSS):** Este algoritmo combina a remoção de *Tomek links* com a regra do vizinho mais próximo condensado (baseada em 1-NN) para descartar exemplos da classe majoritária que são ruidosos, de fronteira ou redundantes. Diferentemente do RUS e do NearMiss, ele não iguala o número de exemplos das classes, de modo que o conjunto resultante pode continuar desbalanceado.

4. **Random OverSampler (ROS):** Ao contrário do UnderSampling, o OverSampling aumenta a quantidade de exemplos da classe minoritária. O ROS replica aleatoriamente (amostragem com reposição) exemplos já existentes da classe minoritária até equilibrar as classes; nenhum exemplo novo é sintetizado.

5. **SMOTE (Synthetic Minority Over-sampling Technique):** Assim como o ROS, o SMOTE é uma técnica de OverSampling, mas, em vez de replicar exemplos existentes, cria novos exemplos sintéticos por meio da interpolação entre exemplos da classe minoritária e seus vizinhos mais próximos da mesma classe.

In [22]:
def criar_dataset_balanceado(X, Y):
    """Resample ``(X, Y)`` with five imbalanced-learn samplers.

    Each sampler is fitted independently on the same input through
    ``fit_resample(X, Y)``. The samplers are, in order: RandomUnderSampler,
    NearMiss (version 1), OneSidedSelection, RandomOverSampler and SMOTE.
    Every sampler that takes a seed uses ``random_state=32``.

    Parameters
    ----------
    X : feature matrix accepted by the samplers' ``fit_resample``.
    Y : labels aligned with ``X``.

    Returns
    -------
    tuple
        Ten items, ``(X_res, y_res)`` for each sampler in the order above:
        ``X_rus, y_rus, X_nm, y_nm, X_oss, y_oss, X_ros, y_ros, X_smote, y_smote``.
    """
    samplers = (
        RandomUnderSampler(random_state=32),   # random under-sampling
        NearMiss(version=1),                   # NearMiss
        OneSidedSelection(random_state=32),    # one-sided selection
        RandomOverSampler(random_state=32),    # random over-sampling
        SMOTE(random_state=32),                # SMOTE
    )

    resampled = []
    for sampler in samplers:
        # fit_resample returns the pair (X_resampled, y_resampled).
        resampled.extend(sampler.fit_resample(X, Y))

    return tuple(resampled)

Essa função `fit_predict` realiza uma tarefa de aprendizado supervisionado. Ela divide os dados fornecidos em conjuntos de treino (70%) e teste (30%), treina o modelo com os dados de treino e, em seguida, faz previsões nos dados de teste. Após as previsões, calcula três métricas de desempenho comuns — acurácia, F1-Score e Coeficiente de Correlação de Matthews (MCC) —, todas multiplicadas por 100, além de uma `Nota` final dada por (Acurácia + 2·F1-Score + MCC)/4. Os quatro valores são exibidos na tela e retornados em um dicionário com as chaves `Accuracy`, `F1-Score`, `MCC` e `Nota`. Esse processo permite avaliar o desempenho do modelo sob diferentes métricas para entender sua eficácia na classificação dos dados de teste.

Exemplo do formato de saída (os valores abaixo estão em fração; a versão atual da função imprime percentuais e também a `Nota`):

```python
Random Undersampler
Accuracy:  0.842946490618485
F1-Score:  0.8308383233532933
MCC:  0.6900884943956258
```

In [23]:
def fit_predict(model, X, Y):
    """Train ``model`` on a hold-out split of ``(X, Y)`` and score it.

    The data is split with ``test_size=0.3`` and ``random_state=42``; the
    model is fitted on the training part and evaluated on the test part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X, Y : features and labels accepted by ``train_test_split``.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1-Score"`` and ``"MCC"`` (each multiplied by
        100) plus ``"Nota"``, computed as (Accuracy + 2 * F1-Score + MCC) / 4.
        The same four values are printed.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, Y, test_size=0.3, random_state=42
    )

    # Fit on the training part, predict on the held-out part.
    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)

    metricas = {
        "Accuracy": accuracy_score(y_eval, predicted) * 100,
        "F1-Score": f1_score(y_eval, predicted) * 100,
        "MCC": matthews_corrcoef(y_eval, predicted) * 100,
    }
    # Final grade: F1 counts twice as much as accuracy and MCC.
    metricas["Nota"] = (
        metricas["Accuracy"] + 2 * metricas["F1-Score"] + metricas["MCC"]
    ) / 4

    for rotulo, valor in metricas.items():
        print(rotulo + ": ", valor)

    return metricas

Por fim a automação das funções anteriores: 

In [122]:
import pandas as pd

def treinar_modelo(nome_modelo,modelo, X_rus_res, y_rus_res, X_nm_res, y_nm_res, X_oss_res, y_oss_res, X_ros_res, y_ros_res, X_smote_res, y_smote_res):
    """Evaluate one model on each of the five resampled datasets.

    For every dataset ``fit_predict`` is called with ``modelo``; the returned
    metrics are tagged with the technique name and ``nome_modelo``.

    Parameters
    ----------
    nome_modelo : label stored in the ``Modelo`` column.
    modelo : estimator passed to ``fit_predict`` (the same instance each time).
    X_*_res, y_*_res : resampled features/labels for Random Undersampler,
        NearMiss, OneSidedSelection, Random Oversampler and SMOTE.

    Returns
    -------
    pandas.DataFrame
        One row per technique with the ``fit_predict`` metrics plus the
        ``Tecnica`` and ``Modelo`` columns.
    """
    # NOTE(review): in this notebook the datasets are resampled before
    # fit_predict splits them into train/test, so resampled rows may end up on
    # both sides of the split - confirm this evaluation protocol is intended.
    conjuntos = (
        ("Random Undersampler", X_rus_res, y_rus_res),
        ("NearMiss", X_nm_res, y_nm_res),
        ("OneSidedSelection", X_oss_res, y_oss_res),
        ("Random Oversampler", X_ros_res, y_ros_res),
        ("SMOTE", X_smote_res, y_smote_res),
    )

    resultados = []
    for tecnica, X_res, y_res in conjuntos:
        print('###################')
        print(tecnica)
        linha = fit_predict(modelo, X_res, y_res)
        linha.update(Tecnica=tecnica, Modelo=nome_modelo)
        resultados.append(linha)

    # One row per technique.
    df_resultados = pd.DataFrame(resultados)

    print('###################')
    return df_resultados


# 1 - Bag Of Words - Binário

Não há necessidade de dividir o corpus do dataset antes do tratamento com BoW, pois a função `treinar_modelo()` (por meio de `fit_predict()`) realiza a divisão em treino/teste e o treino do modelo.

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vectorizer, keeping at most 256 vocabulary terms.
# NOTE(review): the section title says "Binário", but CountVectorizer is created
# without binary=True, so the features are presumably term counts - confirm
# which representation is intended.

vocab_size = 256
vectorizer = CountVectorizer(max_features=vocab_size)
vectorizer.fit(one_cmd_corpus)

In [124]:
# Encode the corpus as bag-of-words feature vectors.
# NOTE(review): despite "train" in the name, this encodes the whole
# one_cmd_corpus; the train/test split happens later inside fit_predict().

X_train_encoded = vectorizer.transform(one_cmd_corpus)

## Criação dos dados balanceados

In [125]:
%%time 
# Build the five resampled datasets from the encoded corpus (one X/y pair per technique).
(
    X_rus_res, y_rus_res,        # Random Undersampler
    X_nm_res, y_nm_res,          # NearMiss
    X_oss_res, y_oss_res,        # OneSidedSelection
    X_ros_res, y_ros_res,        # Random Oversampler
    X_smote_res, y_smote_res,    # SMOTE
) = criar_dataset_balanceado(X_train_encoded, one_cmd_labels)

CPU times: user 9min 12s, sys: 53.2 s, total: 10min 5s
Wall time: 10min 47s


## KNN

Treinamento com KNN usando janela de 01 comando e BagOfWords. 

In [126]:
# Classify
from sklearn.neighbors import KNeighborsClassifier

In [127]:
%%time
# k-nearest neighbours with k=5; n_jobs=-1 is passed through to scikit-learn.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
DF_KNN = treinar_modelo(
    "KNN", knn,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  89.5421686746988
F1-Score:  88.84318766066838
MCC:  79.09448552186852
Nota:  86.58075737947603
###################
NearMiss
Accuracy:  89.6867469879518
F1-Score:  88.62911795961742
MCC:  79.70688022491014
Nota:  86.66296578302419
###################
OneSidedSelection
Accuracy:  98.85127341874747
F1-Score:  59.88023952095808
MCC:  59.73613403832566
Nota:  69.58697162474732
###################
Random Oversampler
Accuracy:  89.90351863880484
F1-Score:  90.3763099876174
MCC:  80.26522468883223
Nota:  87.73034082571797
###################
SMOTE
Accuracy:  91.21370067014148
F1-Score:  90.68228020362561
MCC:  82.8918318037954
Nota:  88.86752322029703
###################
CPU times: user 17min 30s, sys: 3min 53s, total: 21min 23s
Wall time: 12min 30s


In [128]:
# Display the KNN results table (one row per balancing technique).
DF_KNN

Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo
0,89.542169,88.843188,79.094486,86.580757,Random Undersampler,KNN
1,89.686747,88.629118,79.70688,86.662966,NearMiss,KNN
2,98.851273,59.88024,59.736134,69.586972,OneSidedSelection,KNN
3,89.903519,90.37631,80.265225,87.730341,Random Oversampler,KNN
4,91.213701,90.68228,82.891832,88.867523,SMOTE,KNN


## Árvore de decisão 

In [129]:
%%time
# Decision tree using the entropy split criterion, with a fixed seed.
DT = DecisionTreeClassifier(criterion='entropy', random_state=0)
DF_DT = treinar_modelo(
    "DT", DT,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  90.45783132530121
F1-Score:  89.74093264248705
MCC:  80.98510122726414
Nota:  87.73119945938485
###################
NearMiss
Accuracy:  89.9277108433735
F1-Score:  89.12024986985946
MCC:  79.9567510924247
Nota:  87.03124041887929
###################
OneSidedSelection
Accuracy:  98.96349636834067
F1-Score:  61.31471785922048
MCC:  62.030525862432675
Nota:  70.90586448730357
###################
Random Oversampler
Accuracy:  91.98444257853963
F1-Score:  91.63200330783543
MCC:  84.21197772751148
Nota:  89.86510673043048
###################
SMOTE
Accuracy:  91.9384990732086
F1-Score:  91.53940159784517
MCC:  84.18921569202259
Nota:  89.80162949023038
###################
CPU times: user 1.53 s, sys: 21.8 ms, total: 1.55 s
Wall time: 1.56 s


In [130]:
# Display the decision-tree results table (one row per balancing technique).
DF_DT

Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo
0,90.457831,89.740933,80.985101,87.731199,Random Undersampler,DT
1,89.927711,89.12025,79.956751,87.03124,NearMiss,DT
2,98.963496,61.314718,62.030526,70.905864,OneSidedSelection,DT
3,91.984443,91.632003,84.211978,89.865107,Random Oversampler,DT
4,91.938499,91.539402,84.189216,89.801629,SMOTE,DT


## Random Forest

In [131]:
%%time
# Random forest: 100 trees, Gini criterion, fixed seed.
RF = RandomForestClassifier(n_estimators=100, random_state=0, criterion='gini')
DF_RF = treinar_modelo(
    "RF", RF,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  91.27710843373494
F1-Score:  90.61689994815968
MCC:  82.63737618010549
Nota:  88.78707112753995
###################
NearMiss
Accuracy:  90.02409638554218
F1-Score:  89.23556942277692
MCC:  80.14178484764636
Nota:  87.15925501968559
###################
OneSidedSelection
Accuracy:  98.97596558496213
F1-Score:  61.601402688486274
MCC:  62.40377659845261
Nota:  71.14563689009682
###################
Random Oversampler
Accuracy:  92.02088053104355
F1-Score:  91.6668734850553
MCC:  84.28981088310674
Nota:  89.91110959606522
###################
SMOTE
Accuracy:  91.97256063750574
F1-Score:  91.57255717255718
MCC:  84.26192197433515
Nota:  89.84489923923881
###################
CPU times: user 41.3 s, sys: 552 ms, total: 41.8 s
Wall time: 42.4 s


In [132]:
# Display the random-forest results table (one row per balancing technique).
DF_RF

Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo
0,91.277108,90.6169,82.637376,88.787071,Random Undersampler,RF
1,90.024096,89.235569,80.141785,87.159255,NearMiss,RF
2,98.975966,61.601403,62.403777,71.145637,OneSidedSelection,RF
3,92.020881,91.666873,84.289811,89.91111,Random Oversampler,RF
4,91.972561,91.572557,84.261922,89.844899,SMOTE,RF


## Regressão Logística

In [133]:
%%time
# Logistic regression with a fixed seed and up to 500 solver iterations.
RL = LogisticRegression(random_state=1, max_iter=500)
DF_RL = treinar_modelo(
    "RL", RL,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  90.3132530120482
F1-Score:  89.39313984168865
MCC:  80.88059280516238
Nota:  87.49503137514698
###################
NearMiss
Accuracy:  89.6867469879518
F1-Score:  88.90041493775934
MCC:  79.44178531185386
Nota:  86.7323405438311
###################
OneSidedSelection
Accuracy:  98.57383334891986
F1-Score:  32.47232472324724
MCC:  37.3961810934189
Nota:  50.22866597220831
###################
Random Oversampler
Accuracy:  89.39655582135899
F1-Score:  88.86541340875061
MCC:  79.09162545161543
Nota:  86.55475202261891
###################
SMOTE
Accuracy:  90.03659637838437
F1-Score:  90.68213470827037
MCC:  80.95199791228976
Nota:  88.08821592680371
###################
CPU times: user 5.79 s, sys: 93.6 ms, total: 5.88 s
Wall time: 6.01 s


In [134]:
# Display the logistic-regression results table (one row per balancing technique).
DF_RL

Unnamed: 0,Accuracy,F1-Score,MCC,Nota,Tecnica,Modelo
0,90.313253,89.39314,80.880593,87.495031,Random Undersampler,RL
1,89.686747,88.900415,79.441785,86.732341,NearMiss,RL
2,98.573833,32.472325,37.396181,50.228666,OneSidedSelection,RL
3,89.396556,88.865413,79.091625,86.554752,Random Oversampler,RL
4,90.036596,90.682135,80.951998,88.088216,SMOTE,RL


## SVM

In [135]:
%%time
# Support vector classifier with a linear kernel and C=2.0.
# The recorded run of this cell took about 48 minutes of wall time.
SVM_bow = SVC(kernel="linear", random_state=1, C=2.0)
DF_SVC = treinar_modelo(
    "SVC", SVM_bow,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  89.34939759036145
F1-Score:  88.3989501312336
MCC:  78.87331424732366
Nota:  86.25515302503808
###################
NearMiss
Accuracy:  89.59036144578313
F1-Score:  88.79668049792532
MCC:  79.24785457923046
Nota:  86.60789425521607
###################
OneSidedSelection
Accuracy:  98.56448143645376
F1-Score:  25.182778229082047
MCC:  33.47184538335076
Nota:  45.60047081949215
###################
Random Oversampler
Accuracy:  89.0044517672407
F1-Score:  88.44607586086349
MCC:  78.31274185729549
Nota:  86.05233633656579
###################
SMOTE
Accuracy:  89.78073858145467
F1-Score:  90.46115625485224
MCC:  80.48527194533833
Nota:  87.79708075912438
###################
CPU times: user 34min 36s, sys: 20.3 s, total: 34min 56s
Wall time: 47min 46s


## Rede Neural

In [None]:
%%time 
# Multi-layer perceptron: three hidden layers of 64 units, tanh activation, Adam solver.
# NOTE(review): the recorded output of this cell stops at OneSidedSelection and
# the cell has no execution count, so the run apparently did not finish; the
# very small tol (1e-11) with max_iter=1500 may be why - confirm.
rede_neural = MLPClassifier(
    random_state=1,
    tol=1e-11,
    max_iter=1500,
    hidden_layer_sizes=(64, 64, 64),
    activation='tanh',
    solver='adam',
    verbose=False,
)
DF_NN = treinar_modelo(
    "NN", rede_neural,
    X_rus_res, y_rus_res,
    X_nm_res, y_nm_res,
    X_oss_res, y_oss_res,
    X_ros_res, y_ros_res,
    X_smote_res, y_smote_res,
)

###################
Random Undersampler
Accuracy:  88.57831325301206
F1-Score:  89.10344827586208
MCC:  78.49790243731263
Nota:  86.32077806051221
###################
NearMiss
Accuracy:  90.8433734939759
F1-Score:  89.96832101372758
MCC:  81.95814882083982
Nota:  88.18454108556772
###################
OneSidedSelection


# Concatenando os resultados

In [None]:
# Stack the per-model result tables into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every model contributes the same labels 0-4, so label-based lookups on the
# combined frame would return several rows.
df_resultados_concatenados = pd.concat(
    [DF_KNN, DF_DT, DF_RF, DF_RL, DF_SVC, DF_NN],
    ignore_index=True,
)

# Tag every row with the feature-extraction method used in this section.
df_resultados_concatenados['Metodo'] = 'BoW'

# The index is not written, so the CSV content is the same as before.
df_resultados_concatenados.to_csv("resultados_concatenados_com_BOW.csv", index=False)


In [None]:
# Display the combined results of all models for the BoW features.
df_resultados_concatenados