## Busca dos logs no Elastic

In [59]:
from elasticsearch import Elasticsearch
import pandas as pd
from datetime import timedelta

import os

# Elasticsearch client used by the query loop further down in this cell.
# Connection settings are read from the environment so the credentials do not
# have to live in the notebook; the fallbacks reproduce the previous hard-coded
# values, so the cell behaves exactly as before when the variables are unset.
# NOTE(review): remove the password fallback (and rotate it) before sharing.
es = Elasticsearch(
    os.getenv("ELASTIC_URL", "http://192.168.56.102:9200"),
    basic_auth=(
        os.getenv("ELASTIC_USER", "elastic"),
        os.getenv("ELASTIC_PASSWORD", "123456"),
    ),
)

# Column names and timestamp layout shared by both label files.
start_col = 'Execution Time (UTC)'
end_col = 'Execution Time End (UTC)'
utc_format = "%Y-%m-%dT%H:%M:%SZ"

# Malicious-execution labels: the IP address column is discarded right after loading.
atomicData = pd.read_csv('data/labels/logs_malicious.csv')
atomicData = atomicData.drop(columns='IP Address')
safeData = pd.read_csv('data/labels/logs_safe.csv')

# Parse the start time and derive the end of the window as start + 5 seconds.
atomicData[start_col] = pd.to_datetime(atomicData[start_col], format=utc_format)
atomicData[end_col] = atomicData[start_col] + timedelta(seconds=5)

# The safe labels provide both ends of the window; both are parsed the same way.
for time_col in (start_col, end_col):
    safeData[time_col] = pd.to_datetime(safeData[time_col], format=utc_format)

# Stack both label sets and drop the columns the query loop does not use.
unused_cols = ['Execution Time (Local)', 'Username', 'GUID', 'ExitCode']
geralData = pd.concat([atomicData, safeData])
geralData = geralData.drop(columns=unused_cols)
display(geralData)

import os

# One JSONL file per labelled execution window is written into this directory.
# It is created up front so the first `to_json` call cannot fail on a fresh checkout.
events_dir = "data/events"
os.makedirs(events_dir, exist_ok=True)

for _, row in geralData.iterrows():
    window_start = row['Execution Time (UTC)']
    window_end = row['Execution Time End (UTC)']

    query = {
        # At most 100 hits are returned per window; anything beyond that is not fetched.
        "size": 100,
        # Only these fields are requested from each matching document.
        "_source": [
            "process.name",
            "process.command_line",
            "process.parent.name",
            "process.parent.command_line",
            "file.path",
            "winlog.task"
        ],
        "query": {
            "bool": {
                "must": [
                    {
                        # Events whose timestamp falls inside the labelled window (inclusive).
                        "range": {
                            "@timestamp": {
                                "gte": window_start,
                                "lte": window_end
                            }
                        }
                    },
                    {
                        # ...and that carry at least one of the parent-process / file-path fields.
                        "bool": {
                            "should": [
                                {"exists": {"field": "process.parent.name"}},
                                {"exists": {"field": "process.parent.command_line"}},
                                {"exists": {"field": "file.path"}}
                            ],
                            "minimum_should_match": 1
                        }
                    }
                ]
            }
        }
    }

    response = es.search(index="winlogbeat-*", body=query)
    events = [hit["_source"] for hit in response["hits"]["hits"]]

    # Save the events of this window to data/events/{timestamp}.jsonl.
    # The file name is built outside the f-string: reusing the same quote type
    # inside an f-string is a SyntaxError on Python versions before 3.12.
    pd_events = pd.DataFrame(events)
    stamp = window_start.strftime("%Y%m%d_%H%M%S")
    pd_events.to_json(f'{events_dir}/{stamp}.jsonl', orient="records", lines=True, force_ascii=False)

Unnamed: 0,Execution Time (UTC),Technique,Test Number,Test Name,Hostname,ProcessId,Execution Time End (UTC)
0,2025-03-26 17:12:22,T1078.001,1.0,Enable Guest account with RDP capability and a...,WIN-NTAE0RPROG5,6264.0,2025-03-26 17:12:27
1,2025-03-26 17:12:27,T1078.001,2.0,Activate Guest Account,WIN-NTAE0RPROG5,5576.0,2025-03-26 17:12:32
2,2025-03-26 17:12:33,T1078.003,1.0,Create local account with admin privileges,WIN-NTAE0RPROG5,2760.0,2025-03-26 17:12:38
3,2025-03-26 17:12:39,T1485,1.0,Windows - Overwrite file with SysInternals SDe...,WIN-NTAE0RPROG5,4704.0,2025-03-26 17:12:44
4,2025-03-26 17:12:45,T1486,5.0,PureLocker Ransom Note,WIN-NTAE0RPROG5,6808.0,2025-03-26 17:12:50
5,2025-03-26 17:12:51,T1204.002,1.0,OSTap Style Macro Execution,WIN-NTAE0RPROG5,2824.0,2025-03-26 17:12:56
6,2025-03-26 17:13:02,T1059,1.0,AutoIt Script Execution,WIN-NTAE0RPROG5,6652.0,2025-03-26 17:13:07
7,2025-03-26 17:13:08,T1566.001,1.0,Download Macro-Enabled Phishing Attachment,WIN-NTAE0RPROG5,6860.0,2025-03-26 17:13:13
8,2025-03-26 17:13:16,T1566.002,1.0,Paste and run technique,WIN-NTAE0RPROG5,2556.0,2025-03-26 17:13:21
0,2025-03-28 19:41:27,,,Github (Git clone repositorio legitimo),,,2025-03-28 19:41:29


## Ofuscação dos logs

In [69]:
import pandas as pd
import glob
import hashlib
import re

# Account names to be masked.
users = [
    'admin',
    'root',
    'guest',
    'art-test',
    'test',
    'default',
    'administrator',
    'superuser',
    'sysadmin',
]

# Tool and project names to be masked.
tools = [
    'redcanaryco',
    'atomic-red-team',
    'AtomicRedTeam',
    'atomics',
    'atomic',
    'mimikatz',
    'nmap',
    'cobaltstrike',
    'metasploit',
]

# Passwords to be masked. The two entries that start or end with punctuation
# are given as compiled patterns instead of plain strings.
passwords = [
    re.compile(r'Password123\!'),
    '123456',
    'admin123',
    'qwerty',
    'letmein',
    'passw0rd',
    '12345678',
    'toor',
    re.compile(r'-4RTisCool!-321'),
]

# Technique-identifier patterns; the form with a ".NNN" suffix is listed first.
mitre = [
    re.compile(r"T\d{4}\.\d{3}"),
    re.compile(r"T\d{4}"),
]

# Full term list, in the order the terms are applied.
ofuscation = [*users, *tools, *passwords, *mitre]

def hash_md5(text):
    """Return a short, deterministic placeholder for ``text``.

    The value is converted with ``str()``, encoded as UTF-8 and hashed with
    MD5; the first 8 hex digits of the digest are wrapped as ``[HASH_xxxxxxxx]``.

    NOTE(review): a truncated MD5 of a dictionary term is easy to reverse by
    hashing candidate terms - confirm this is acceptable for the obfuscation goal.
    """
    digest = hashlib.md5(str(text).encode('utf-8')).hexdigest()
    return '[HASH_' + digest[:8] + ']'

def apply_obfuscation(text, terms):
    """Replace every occurrence of the given terms in ``text`` with a hash placeholder.

    Args:
        text: Value to obfuscate; it is converted with ``str()`` first, so
            non-string values are processed through their string form.
        terms: Iterable of plain strings and/or compiled ``re.Pattern`` objects,
            applied one after another in the given order.

    Returns:
        The resulting string, with each match replaced by ``hash_md5(match)``.
    """
    def _to_placeholder(match):
        return hash_md5(match.group())

    obfuscated = str(text)
    for term in terms:
        if not isinstance(term, re.Pattern):
            # Plain strings are escaped and wrapped in word boundaries so that
            # only complete terms are replaced.
            whole_word = r'\b' + re.escape(term) + r'\b'
            obfuscated = re.sub(whole_word, _to_placeholder, obfuscated)
            continue

        # Compiled patterns are applied exactly as given.
        obfuscated = term.sub(_to_placeholder, obfuscated)
        # After each regex term, every doubled backslash is collapsed to a single one.
        # NOTE(review): this runs once per regex term and only on this branch, so
        # longer backslash runs shrink repeatedly - confirm that is intended.
        obfuscated = obfuscated.replace('\\\\', '\\')
    return obfuscated

import os

# Obfuscated copies keep the original file name and go into this directory.
# It is created up front so the first `to_json` call cannot fail on a fresh checkout.
processed_dir = "data/events_processed"
os.makedirs(processed_dir, exist_ok=True)

jsonl_files = glob.glob("data/events/*.jsonl")
for file in jsonl_files:
    df = pd.read_json(file, lines=True)

    # Every cell of every column goes through the full term list.
    for column in df.columns:
        df[column] = df[column].apply(lambda x: apply_obfuscation(x, ofuscation))

    # Build the target from the base name instead of a blanket
    # str.replace("events", ...), which would also rewrite any other
    # occurrence of "events" in the path or file name.
    new_file = os.path.join(processed_dir, os.path.basename(file))
    df.to_json(new_file, orient="records", lines=True, force_ascii=False)

## Escolha de Logs relevantes

In [90]:
import ollama
import os
import glob
import pandas as pd

JSONL_DIR = "data/events_processed"
MODEL = "llama3.2:3b"
MODEL_PATH = "llama3.2_3b"

jsonl_files = glob.glob(f"{JSONL_DIR}/*.jsonl")

# The output folder is the same for every input file, so it is created once.
dir_path = f"data/events_filtered/{MODEL_PATH}"
os.makedirs(dir_path, exist_ok=True)

for file in jsonl_files:
    alerts = pd.read_json(file, orient='records', lines=True)
    results = []
    inferences = []  # raw model replies; collected but not read again in this cell
    for _, row in alerts.iterrows():
        alert = row.to_dict()
        response = ollama.chat(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a cybersecurity expert. Please review each log individually and return ONLY the classification as 'IMPORTANT' or 'NOT IMPORTANT'. "
                        "Data is obfuscated."
                    )
                },
                {
                    "role": "user",
                    "content": f"Here are the logs:\n{alert}"
                }
            ],
            # Ollama's cap on generated tokens is `num_predict`; `max_tokens` is
            # not an Ollama option, so the previous 60-token limit was never applied.
            options={"num_predict": 60}
        )

        analysis = response['message']['content'].strip()
        inferences.append(analysis)
        # Keep every log that is not explicitly classified as NOT IMPORTANT.
        # The comparison is case-insensitive so a reply such as "Not important"
        # is no longer kept as if it were important.
        if "NOT IMPORTANT" not in analysis.upper():
            results.append(alert)

    # Save the kept logs under the original file name.
    filename = os.path.basename(file)
    output_path = os.path.join(dir_path, filename)

    pd.DataFrame(results).to_json(output_path, orient="records", lines=True, force_ascii=False)
    print(f"Inferência salva em {output_path}")


Inferência salva em data/events_filtered/llama3.2_3b\20250326_171222.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171227.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171233.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171239.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171245.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171251.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171302.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171308.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250326_171316.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250328_194127.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250328_200430.jsonl
Inferência salva em data/events_filtered/llama3.2_3b\20250328_202048.jsonl


## Inferência

In [89]:
import openai
import time
import glob
import os
from dotenv import load_dotenv

CSV_DIR = "data/events_filtered/llama3.2_3b"
load_dotenv()

# The API key is read from the OPENAI_KEY environment variable after load_dotenv().
OPENAI_API_KEY = os.getenv('OPENAI_KEY')

client = openai.OpenAI(api_key=OPENAI_API_KEY)
csv_files = glob.glob(f"{CSV_DIR}/*.jsonl")

# The two system prompts are identical for every request, so they are built once.
role_prompt = (
    "Você é um especialista em cibersegurança com vasta experiência em analisar logs de sistema para "
    "identificar comportamentos maliciosos. Sua tarefa é analisar o conjunto completo de eventos de log apresentado "
    "e classificar o alerta como 'MALICIOUS' ou 'SAFE'. Essa classificação única ajudará a priorizar os alertas e "
    "orientar as ações de resposta a incidentes."
)
format_prompt = (
    "A resposta deve ser estritamente 'MALICIOUS' ou 'SAFE', sem explicações adicionais. Considere todo o conjunto "
    "de logs em conjunto, e não individualmente."
)

# Five independent passes over the same files; each pass is saved as inf{i}.csv.
for i in range(5):
    results = []
    for file in csv_files:
        alert = pd.read_json(file, orient='records', lines=True)

        # The whole file is sent as one JSON-lines payload in a single user message.
        alert_jsonl = alert.to_json(orient='records', lines=True)
        messages = [
            dict(role="system", content=role_prompt),
            dict(role="system", content=format_prompt),
            dict(
                role="user",
                content=f"Aqui estão os logs (todos os dados sensíveis foram ofuscados):\n{alert_jsonl}.",
            ),
        ]
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=100,
        )

        analysis = response.choices[0].message.content
        results.append({"file": file, "analysis": analysis})
        # Short pause between consecutive requests.
        time.sleep(0.5)

    results_df = pd.DataFrame(results)

    dir_path = "data/inferences/gpt-4o-mini"
    os.makedirs(dir_path, exist_ok=True)
    results_df.to_csv(f"{dir_path}/inf{i}.csv", index=False)

    print(f"Inferencia {i} salva.")

Inferencia 0 salva.
Inferencia 1 salva.
Inferencia 2 salva.
Inferencia 3 salva.
Inferencia 4 salva.
