### Imports e utilitários

In [90]:
import os, json, math, glob
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

In [91]:
# CÉLULA — Parser robusto p/ JSONL com dicts em string (aspas simples / None)
import json, re
from ast import literal_eval
from typing import Any, Dict, List

def _try_literal_eval(s: str) -> Any:
    """Tenta converter string 'estilo Python' (ex.: "{'k': 'v'}", 'None') em objeto Python."""
    s = s.strip()
    if s in ("None", "none", "NULL", "Null", "null"):
        return None
    # se parece com dict/list/tuple/num/str python → tenta literal_eval
    if (s.startswith("{") and s.endswith("}")) or \
       (s.startswith("[") and s.endswith("]")) or \
       (s.startswith("(") and s.endswith(")")) or \
       (s.startswith("'") and s.endswith("'")) or \
       (s.startswith('"') and s.endswith('"')):
        try:
            return literal_eval(s)
        except Exception:
            pass
    # último recurso: se parece com JSON de verdade (aspas duplas), tenta json.loads
    if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
        try:
            return json.loads(s)
        except Exception:
            pass
    return s  # mantém como string

def _normalize_pyish(obj: Any) -> Any:
    """Converte recursivamente strings 'pythonizadas' em estruturas Python."""
    if isinstance(obj, dict):
        return {k: _normalize_pyish(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_normalize_pyish(v) for v in obj]
    if isinstance(obj, str):
        val = _try_literal_eval(obj)
        # se mudou de tipo, normaliza de novo (pode ter aninhado)
        if val is not obj:
            return _normalize_pyish(val)
        return obj
    return obj

def read_jsonl_lines(path: str) -> List[Any]:
    """Lê JSONL tolerante e normaliza campos-string que são dict/list pythonizados."""
    out: List[Any] = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # linha não é JSON puro; tenta avaliar direto como python literal
                obj = _try_literal_eval(line)
            obj = _normalize_pyish(obj)
            out.append(obj)
    return out

def event_to_text(evt: Any) -> str:
    """Extrai texto útil do evento (dict esperado, mas aceita outros tipos)."""
    if isinstance(evt, dict):
        parts = []

        # winlog.task
        winlog = evt.get("winlog")
        if isinstance(winlog, dict):
            task = winlog.get("task")
            if task: parts.append(str(task))
        elif isinstance(winlog, str):
            parts.append(winlog)

        # process.command_line / process.name
        proc = evt.get("process")
        if isinstance(proc, dict):
            cmd = proc.get("command_line")
            name = proc.get("name")
            if name: parts.append(str(name))
            if cmd:  parts.append(str(cmd))
        elif isinstance(proc, str):
            parts.append(proc)

        # file.path/name/target_path
        fobj = evt.get("file")
        if isinstance(fobj, dict):
            for k in ("path", "name", "target_path"):
                v = fobj.get(k)
                if v: parts.append(str(v))
        elif isinstance(fobj, str):
            parts.append(fobj)

        if parts:
            return " ".join(parts)
        # fallback: serializa curto
        try: return json.dumps(evt, ensure_ascii=False)
        except Exception: return str(evt)

    if isinstance(evt, list):
        return "\n".join(event_to_text(x) for x in evt)
    if isinstance(evt, str):
        return evt
    try: return json.dumps(evt, ensure_ascii=False)
    except Exception: return str(evt)

def file_to_document_text(path: str) -> str:
    events = read_jsonl_lines(path)
    texts = [event_to_text(e) for e in events]
    return "\n".join(texts)


### Montagem do dataset (e fatiamento do benigno)

In [92]:
ATTACK_GLOB = "./attacks/*.jsonl"
BENIGN_FILE = "./benign/benign.jsonl"

attack_files = sorted(glob.glob(ATTACK_GLOB))
assert len(attack_files) > 0, "Nenhum arquivo encontrado em ./attacks/*.jsonl"

assert os.path.exists(BENIGN_FILE), "Arquivo benigno ./benign/benign.jsonl não encontrado"

len(attack_files), attack_files[:3]


(1164,
 ['./attacks\\20250815_164702.jsonl',
  './attacks\\20250815_164734.jsonl',
  './attacks\\20250815_164800.jsonl'])

In [93]:
attack_line_counts = [count_lines(p) for p in attack_files]
median_attack_lines = int(np.median(attack_line_counts)) if attack_line_counts else 500

benign_total_lines = count_lines(BENIGN_FILE)

# número de pedaços benignos ~ benign_total / mediana_ataque, limitado para não explodir
K = 500

print(f"Mediana de linhas (ataque): {median_attack_lines}")
print(f"Linhas do benigno: {benign_total_lines}")
print(f"Numero de fatias benignas (K): {K}")

Mediana de linhas (ataque): 31
Linhas do benigno: 10000
Numero de fatias benignas (K): 500


In [94]:
# 1) Documentos de ataque (label=1)
X_text, y, ids = [], [], []

for path in attack_files:
    doc = file_to_document_text(path)
    X_text.append(doc)
    y.append(1)
    ids.append(Path(path).name)

# 2) Documentos benignos (label=0), fatiando o arquivo grande
benign_lines_raw = read_jsonl_lines(BENIGN_FILE)
chunk_size = math.ceil(len(benign_lines_raw) / K)

for i in range(K):
    chunk = benign_lines_raw[i*chunk_size : (i+1)*chunk_size]
    if not chunk:
        continue
    # transformar o chunk em texto
    chunk_texts = [event_to_text(evt) or repr(evt) for evt in chunk]
    doc = "\n".join(chunk_texts)
    X_text.append(doc)
    y.append(0)
    ids.append(f"benign_chunk_{i:03d}")

print(f"Total documentos: {len(X_text)} | Maliciosos: {sum(y)} | Benignos: {len(y)-sum(y)}")

Total documentos: 1664 | Maliciosos: 1164 | Benignos: 500


In [95]:
df = pd.DataFrame({"id": ids, "label": y, "n_chars": [len(t) for t in X_text]})
df.sample(min(10, len(df)))

Unnamed: 0,id,label,n_chars
845,20250818_023855.jsonl,1,6018
205,20250815_195101.jsonl,1,7083
137,20250815_191937.jsonl,1,3653
1222,benign_chunk_058,0,1749
492,20250815_222414.jsonl,1,1653
960,20250826_175043.jsonl,1,0
1551,benign_chunk_387,0,1819
508,20250815_222936.jsonl,1,804
789,20250816_011100.jsonl,1,5972
1398,benign_chunk_234,0,2041


### Split e avaliação auxiliar

In [None]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
    print(f"\n[{name}]")
    print(f"Accuracy: {acc:.3f} | Precision: {pr:.3f} | Recall: {rc:.3f} | F1: {f1:.3f}")
    print(classification_report(y_test, pred, digits=3))
    return {"model": name, "accuracy": acc, "precision": pr, "recall": rc, "f1": f1}

# Split estratificado
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# Vetorizador compartilhado
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=100_000,   # limitar um pouco
    ngram_range=(1,2),      # unigrams e bigrams
    token_pattern=r"(?u)\b\w+\b",
)

### Treinos

In [97]:
# Logistic Regression (regressor "softmax" no multi-classe; aqui binário)
logreg = make_pipeline(
    vectorizer,
    LogisticRegression(max_iter=500, n_jobs=None)  # simples
)
m1 = evaluate_model("LogisticRegression", logreg, X_train_text, X_test_text, y_train, y_test)


[LogisticRegression]
Accuracy: 0.991 | Precision: 0.987 | Recall: 1.000 | F1: 0.994
              precision    recall  f1-score   support

           0      1.000     0.970     0.985       100
           1      0.987     1.000     0.994       233

    accuracy                          0.991       333
   macro avg      0.994     0.985     0.989       333
weighted avg      0.991     0.991     0.991       333



In [98]:
# Linear SVM
linsvm = make_pipeline(
    vectorizer,
    LinearSVC()
)
m2 = evaluate_model("LinearSVC", linsvm, X_train_text, X_test_text, y_train, y_test)


[LinearSVC]
Accuracy: 0.997 | Precision: 0.996 | Recall: 1.000 | F1: 0.998
              precision    recall  f1-score   support

           0      1.000     0.990     0.995       100
           1      0.996     1.000     0.998       233

    accuracy                          0.997       333
   macro avg      0.998     0.995     0.996       333
weighted avg      0.997     0.997     0.997       333



In [99]:
# Decision Tree (tende a underfitting com TF-IDF esparso, mas incluímos)
dtree = make_pipeline(
    vectorizer,
    DecisionTreeClassifier(random_state=42, max_depth=30)
)
m3 = evaluate_model("DecisionTree", dtree, X_train_text, X_test_text, y_train, y_test)


[DecisionTree]
Accuracy: 1.000 | Precision: 1.000 | Recall: 1.000 | F1: 1.000
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       100
           1      1.000     1.000     1.000       233

    accuracy                          1.000       333
   macro avg      1.000     1.000     1.000       333
weighted avg      1.000     1.000     1.000       333



In [100]:
#  Random Forest (com TF-IDF esparso nem sempre é ideal, mas funciona)
rf = make_pipeline(
    vectorizer,
    RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
        max_depth=None
    )
)
m4 = evaluate_model("RandomForest", rf, X_train_text, X_test_text, y_train, y_test)


[RandomForest]
Accuracy: 1.000 | Precision: 1.000 | Recall: 1.000 | F1: 1.000
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       100
           1      1.000     1.000     1.000       233

    accuracy                          1.000       333
   macro avg      1.000     1.000     1.000       333
weighted avg      1.000     1.000     1.000       333



In [101]:
# Tabela comparativa de métricas
score_table = pd.DataFrame([m1, m2, m3, m4]).sort_values("f1", ascending=False)
score_table

Unnamed: 0,model,accuracy,precision,recall,f1
3,RandomForest,1.0,1.0,1.0,1.0
2,DecisionTree,1.0,1.0,1.0,1.0
1,LinearSVC,0.996997,0.995726,1.0,0.997859
0,LogisticRegression,0.990991,0.987288,1.0,0.993603


### Salvar o melhor modelo

In [102]:
# Salvar todos os modelos e também destacar o melhor
import os, joblib

# dicionário com todos os pipelines treinados
all_models = {
    "LogisticRegression": logreg,
    "LinearSVC": linsvm,
    "DecisionTree": dtree,
    "RandomForest": rf,
}

os.makedirs("./models", exist_ok=True)

# salva cada um
for name, pipe in all_models.items():
    path = f"./models/model_{name}.joblib"
    joblib.dump(pipe, path)
    print("Salvo:", path)

# opcional: salva também o melhor (com base no score_table)
best_name = score_table.iloc[0]["model"]
best_pipe = all_models[best_name]
best_path = f"./models/best_model_{best_name}.joblib"
joblib.dump(best_pipe, best_path)
print("Salvo melhor modelo em:", best_path)

# opcional: salvar a tabela de métricas para referência
score_table.to_csv("./models/metrics_summary.csv", index=False)
print("Salvo resumo de métricas em: ./models/metrics_summary.csv")

Salvo: ./models/model_LogisticRegression.joblib
Salvo: ./models/model_LinearSVC.joblib
Salvo: ./models/model_DecisionTree.joblib
Salvo: ./models/model_RandomForest.joblib
Salvo melhor modelo em: ./models/best_model_RandomForest.joblib
Salvo resumo de métricas em: ./models/metrics_summary.csv


### Inferência em novos arquivos/diretórios

In [None]:
# Testar TODOS os modelos salvos em ./models contra conjuntos de arquivos
import os, glob, joblib
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

EVAL_SETS = [
    ("../data/attack/*.jsonl", 1, "attack"),
    ("../data/safe/*.jsonl",   0, "safe"),
]

def eval_df_metrics(df: pd.DataFrame):
    """Computa métricas binárias em um DataFrame de previsões (somente linhas OK)."""
    ok = df[df["status"] == "ok"].copy()
    if "true" not in ok or ok["true"].isna().all():
        return {"n_ok": len(ok), "accuracy": None, "precision": None, "recall": None, "f1": None}
    acc = accuracy_score(ok["true"], ok["pred"])
    pr, rc, f1, _ = precision_recall_fscore_support(ok["true"], ok["pred"], average="binary", zero_division=0)
    return {"n_ok": len(ok), "accuracy": acc, "precision": pr, "recall": rc, "f1": f1}

def evaluate_all_models(models_dir: str = "./models", eval_sets = EVAL_SETS):
    model_paths = sorted(glob.glob(os.path.join(models_dir, "*.joblib")))
    assert model_paths, f"Nenhum modelo encontrado em {models_dir}"

    per_model_rows = []   # métricas por modelo x conjunto
    overall_rows   = []   # concat de previsões para calcular 'geral' por modelo

    for mp in model_paths:
        name = Path(mp).stem  # ex.: model_LinearSVC ou best_model_LinearSVC
        print(f"\n===== Avaliando {name} =====")
        clf = joblib.load(mp)

        model_all_preds = []  # guardar dfs de cada conjunto para métricas gerais

        for pattern, true_label, set_name in eval_sets:
            print(f"\n-- Conjunto: {set_name} | pattern={pattern} | true={true_label}")
            df_pred = batch_predict_dir(clf, pattern, true_label=true_label)
            if df_pred is None:
                continue

            # salva com identificação do conjunto / modelo
            df_pred = df_pred.copy()
            df_pred["set"] = set_name
            df_pred["model"] = name
            model_all_preds.append(df_pred)

            m = eval_df_metrics(df_pred)
            per_model_rows.append({
                "model": name, "set": set_name, **m
            })

        # métricas gerais (micro) juntando todos os conjuntos com rótulo conhecido
        if model_all_preds:
            cat = pd.concat(model_all_preds, ignore_index=True)
            cat_known = cat[(cat["status"] == "ok") & (~cat["true"].isna())]
            if len(cat_known) > 0:
                acc = accuracy_score(cat_known["true"], cat_known["pred"])
                pr, rc, f1, _ = precision_recall_fscore_support(cat_known["true"], cat_known["pred"],
                                                                average="binary", zero_division=0)
                per_model_rows.append({
                    "model": name, "set": "ALL", "n_ok": len(cat_known),
                    "accuracy": acc, "precision": pr, "recall": rc, "f1": f1
                })
                overall_rows.append(cat)

    metrics_df = pd.DataFrame(per_model_rows).sort_values(["set", "f1"], ascending=[True, False])
    print("\n=== RESUMO DE MÉTRICAS ===")
    display(metrics_df)

    # opcional: salvar CSV
    os.makedirs("./models/eval_reports", exist_ok=True)
    out_csv = "./models/eval_reports/all_models_metrics.csv"
    metrics_df.to_csv(out_csv, index=False)
    print("Relatório salvo em:", out_csv)

    # opcional: retornar também as previsões concatenadas (se quiser inspecionar)
    preds_concat = pd.concat(overall_rows, ignore_index=True) if overall_rows else None
    return metrics_df, preds_concat

# Executar
all_metrics, all_preds = evaluate_all_models("./models", EVAL_SETS)



===== Avaliando best_model_RandomForest =====

-- Conjunto: attack | pattern=../data/attack/*.jsonl | true=1
20250411_175749.jsonl                    => pred=1
20250411_175806.jsonl                    => pred=1
20250411_175826.jsonl                    => pred=1
20250411_175855.jsonl                    => pred=1
20250411_175959.jsonl                    => pred=1
20250411_180055.jsonl                    => pred=1
20250411_180121.jsonl                    => pred=1
20250411_180202.jsonl                    => pred=1
20250411_180213.jsonl                    => pred=1
20250411_180224.jsonl                    => pred=1
20250411_180237.jsonl                    => pred=1
20250411_180254.jsonl                    => pred=1
20250411_180316.jsonl                    => pred=1
20250411_180502.jsonl                    => pred=1
20250411_180536.jsonl                    => pred=1
20250411_180602.jsonl                    => pred=1
20250411_180623.jsonl                    => pred=1
20250411_180641.jsonl  

Unnamed: 0,model,set,n_ok,accuracy,precision,recall,f1
5,model_DecisionTree,ALL,60,0.716667,0.722222,0.95122,0.821053
11,model_LogisticRegression,ALL,60,0.683333,0.696429,0.95122,0.804124
8,model_LinearSVC,ALL,60,0.666667,0.684211,0.95122,0.795918
2,best_model_RandomForest,ALL,60,0.65,0.672414,0.95122,0.787879
14,model_RandomForest,ALL,60,0.65,0.672414,0.95122,0.787879
0,best_model_RandomForest,attack,41,0.95122,1.0,0.95122,0.975
3,model_DecisionTree,attack,41,0.95122,1.0,0.95122,0.975
6,model_LinearSVC,attack,41,0.95122,1.0,0.95122,0.975
9,model_LogisticRegression,attack,41,0.95122,1.0,0.95122,0.975
12,model_RandomForest,attack,41,0.95122,1.0,0.95122,0.975


Relatório salvo em: ./models/eval_reports/all_models_metrics.csv
