## Setup imports

In [None]:

import os
import json
import requests
import sys
import duckdb
import pandas as pd
from IPython.display import display

In [2]:
# Make the project's ../src directory importable so its helper module can be loaded.
# Guarded so that re-running this cell does not keep appending duplicate entries.
# The path is relative, so it resolves against the kernel's working directory.
if "../src" not in sys.path:
    sys.path.append("../src")

#### Import the security rules and the LLM context helper

In [7]:
from security_rules_build import run_all_detections, build_llm_context, combined_rules

In [5]:
# Open the DuckDB database file.
con = duckdb.connect("../data/logs.duckdb")

# Smoke test: count the rows in `logs` to confirm the connection is usable.
(row_count,) = con.execute("SELECT COUNT(*) FROM logs").fetchone()
print("Connected, total rows:", row_count)

Connected, total rows: 1920624


In [8]:
# Run the combined security rules once and keep the returned mapping of
# rule name -> rule result; the keys are the names of the individual rules.
results = combined_rules(con)
# Show which rule names are available.
results.keys()

dict_keys(['failed_logins', 'failed_by_ip', 'brute_force', 'priv_esc', 'account_lockout', 'log_cleared', 'susp_linux', 'susp_win', 'rdp_logins', 'ssh_root_logins', 'failed_count_windows'])

In [9]:
# Bind each rule's result to its own variable by looking it up in `results`.
# Unpacking the dict itself would bind the key strings (e.g. "failed_logins")
# instead of the rule results, and calling combined_rules(con) a second time
# would re-run every detection query for no benefit.
(
    failed_df,
    failed_by_ip_df,
    brute_force_df,
    priv_df,
    lockout_df,
    log_clear_df,
    susp_linux_df,
    susp_win_df,
    rdp_df,
    ssh_root_df,
    failed_count_df
) = (
    results[rule_key]
    for rule_key in (
        "failed_logins",
        "failed_by_ip",
        "brute_force",
        "priv_esc",
        "account_lockout",
        "log_cleared",
        "susp_linux",
        "susp_win",
        "rdp_logins",
        "ssh_root_logins",
        "failed_count_windows",
    )
)

In [17]:
# Render every rule result as plain text so the raw detections can be pasted into an LLM prompt.
sections = []

for rule_name, df in results.items():
    if df.empty:
        body = "No records detected.\n"
    else:
        # Cap each rule at five rows to keep the prompt's token count down.
        body = df.head(5).to_string(index=False) + "\n\n"
    sections.append(f"\n=== {rule_name.upper()} ===\n" + body)

raw_rules_text = "".join(sections)

In [None]:
# Build the LLM context text from all rule results, with up to 5 example rows
# requested per rule. This call replaces an earlier hand-assembled summary
# string that had been left in this cell as dead code.
context_summary = build_llm_context(results, examples=5)

# Preview only the first 500 characters of the context.
print(context_summary[:500])

=== Security Detection Summary ===

 * FAILED_LOGINS (Windows: 63, Ubuntu: 16597)
Total: 16660 events
  AGENT_NAME                                                                                      Content          src_ip        dst_ip           entity_id    City Country                date day hour minute second           timestamp
UBUNTU_AGENT sshd[2756135]: Failed password for invalid user site from 185.213.165.107 port 48410 ssh2... 185.213.165.107 172.21.35.175 Nisir@172.21.35.175  Tehran


### Set up the OpenRouter LLM client

In [None]:
# Read the OpenRouter API key from the environment; never store it in the notebook.
# Export OPENROUTER_API_KEY before starting Jupyter (or load it from an untracked file).
# NOTE(review): a literal key was previously committed in this cell. Treat that
# key as compromised and revoke/rotate it.
API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it in the environment before running this notebook."
    )

# HTTP headers sent with every OpenRouter request.
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "HTTP-Referer": "https://openrouter.ai/",
    "X-Title": "Mini-SOC Threat Analyzer",
    "Content-Type": "application/json"
}

### Using full context summary 

In [12]:
# User prompt for the first request: analysis instructions followed by the
# structured summary text held in `context_summary`.
prompt = f"""
You are a SOC (Security Operations Center) analyst.
Analyze the following SIEM detections.

For each rule, describe:
1. What the detected activity means,
2. The potential threat or attack pattern it indicates,
3. Recommended actions or mitigations.

Here are the detections:
{context_summary}
"""

In [None]:
# Chat-completion payload: a system role message plus the prompt built above.
data = {
    "model": "gpt-4o-mini",  # or "mistralai/mistral-7b-instruct"
    "messages": [
        {"role": "system", "content": "You are a senior SOC analyst."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.5,
    # Replies that reach this limit are cut off (finish_reason == "length").
    "max_tokens": 1000
}

# Bound the wait so a stalled connection cannot hang the kernel, and fail loudly
# on an HTTP error instead of parsing an error body as if it were a completion.
http_response = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers=headers,
    json=data,
    timeout=120,
)
http_response.raise_for_status()
response = http_response.json()

# Later cells index response["choices"], so stop here if the API returned
# a body without any completion.
if "choices" not in response:
    raise RuntimeError(f"OpenRouter returned no completion: {response.get('error', response)}")

print("AI Analysis Complete !:\n")

AI Analysis Complete !:

{'id': 'gen-1760289648-GyMOEcElAY7rzWoKknG1', 'provider': 'OpenAI', 'model': 'openai/gpt-4o-mini', 'object': 'chat.completion', 'created': 1760289648, 'choices': [{'logprobs': None, 'finish_reason': 'length', 'native_finish_reason': 'length', 'index': 0, 'message': {'role': 'assistant', 'content': '### Detection Analysis\n\n#### 1. **FAILED_LOGINS (Windows: 63, Ubuntu: 16597)**\n   - **Detected Activity**: A high number of failed login attempts for both Windows and Ubuntu systems.\n   - **Potential Threat**: This indicates potential brute-force attacks where an attacker is trying to guess user credentials for unauthorized access. The invalid user attempts suggest that the attacker is either targeting specific accounts or is using automated tools to try common usernames and passwords.\n   - **Recommended Actions**:\n     - Implement account lockout policies to temporarily disable accounts after a certain number of failed attempts.\n     - Use multi-factor authen

In [15]:
# Print the assistant's reply text from the first choice.
# The text can end mid-sentence when generation stops at the max_tokens limit.
print(response["choices"][0]["message"]["content"])

### Detection Analysis

#### 1. **FAILED_LOGINS (Windows: 63, Ubuntu: 16597)**
   - **Detected Activity**: A high number of failed login attempts for both Windows and Ubuntu systems.
   - **Potential Threat**: This indicates potential brute-force attacks where an attacker is trying to guess user credentials for unauthorized access. The invalid user attempts suggest that the attacker is either targeting specific accounts or is using automated tools to try common usernames and passwords.
   - **Recommended Actions**:
     - Implement account lockout policies to temporarily disable accounts after a certain number of failed attempts.
     - Use multi-factor authentication (MFA) to enhance account security.
     - Monitor and analyze the source IPs for potential malicious activity and consider blocking them.
     - Ensure strong password policies are in place.

#### 2. **FAILED_BY_IP**
   - **Detected Activity**: Multiple failed login attempts from specific IP addresses.
   - **Potential Th

### Using the raw rule output as LLM input

In [None]:
# User prompt for the second request: same analyst framing, but fed with the
# per-rule text in `raw_rules_text` instead of the structured summary.
# Note: this rebinds `prompt`, replacing the prompt from the earlier cell.
prompt = f"""
You are a SOC (Security Operations Center) analyst.
Analyze the following SIEM detections directly.

For each section, describe:
1. What type of event or attack this may represent,
2. What patterns or correlations you notice,
3. What an SOC analyst should do next.

Here are the detection results:
{raw_rules_text}
"""

# Chat-completion payload: a system role message plus the raw-rules prompt.
data = {
    "model": "gpt-4o-mini",
    "messages": [
        {"role": "system", "content": "You are a senior SOC analyst."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.5,
    # Replies that reach this limit are cut off (finish_reason == "length").
    "max_tokens": 1000
}

# Bound the wait so a stalled connection cannot hang the kernel, and fail loudly
# on an HTTP error instead of parsing an error body as if it were a completion.
http_response = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers=headers,
    json=data,
    timeout=120,
)
http_response.raise_for_status()
response = http_response.json()

# Stop before indexing into the reply if the API returned no completion.
if "choices" not in response:
    raise RuntimeError(f"OpenRouter returned no completion: {response.get('error', response)}")

print(" AI Analysis Complete!\n")
print(response["choices"][0]["message"]["content"])

ðŸ§  AI Analysis Complete!

### Analysis of SIEM Detections

#### 1. **FAILED_LOGINS**
   - **Event Type/Attack Representation**: These entries indicate multiple failed login attempts, which could signify a brute force attack or an unauthorized access attempt. The attempts are made against both the `root` user and an invalid user `site`.
   - **Patterns/Correlations**: 
     - The failed login attempts are concentrated from specific IP addresses (185.213.165.107, 218.92.0.177, 36.110.15.2).
     - Several attempts are made from IPs originating in China and Iran, which may suggest targeted attacks from these regions.
     - The use of the `root` account is particularly concerning as it is a high-privilege account.
   - **Next Steps for SOC Analyst**: 
     - Investigate the source IP addresses to determine if they are associated with known malicious activity.
     - Implement blocking rules for these IPs if they are deemed malicious.
     - Review logs for successful logins from these I

### Combining the structured summary with the raw detections

In [None]:
# Rebuild the structured summary with fewer examples (2 instead of 5), since
# the raw detections are included in the same prompt as well.
context_summary = build_llm_context(results, examples=2)

# Prompt combining both views: the structured summary and the raw rule text.
# NOTE(review): this cell only builds `prompt`; no request using it is sent in
# the cells visible here — confirm whether a request cell is missing.
prompt = f"""
You are a SOC analyst reviewing structured detections.

First, review the summarized results below:
{context_summary}

Then, review the raw detections:
{raw_rules_text}

Correlate findings, describe key threats, and recommend next actions.
"""

In [14]:
# Inspect the type of `response`; record_usage only logs dict responses.
type(response)

dict

In [18]:
from datetime import datetime

# In-memory list of token-usage records; record_usage appends one dict per logged response.
usage_log = []


def record_usage(resp):
    """Append the token usage reported by an API response to ``usage_log``.

    Args:
        resp: Parsed JSON response as a dict. Its optional "usage" mapping
            supplies the prompt/completion/total token counts, and its
            "model" value is stored as-is.

    A missing or null "usage" entry is recorded as zero tokens. A non-dict
    argument is not recorded; a message is printed instead.
    """
    if not isinstance(resp, dict):
        print("Value is not a response dictionary")
        return

    # `or {}` also covers an explicit `"usage": null`, which the default
    # argument of .get() alone would let through as None.
    usage = resp.get("usage") or {}
    usage_log.append({
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model": resp.get("model"),
        "prompt_tokens": usage.get("prompt_tokens", 0),
        "completion_tokens": usage.get("completion_tokens", 0),
        "total_tokens": usage.get("total_tokens", 0)
    })

def show_usage_log():
    """Render every usage record collected so far as a table, or report that there are none."""
    if not usage_log:
        print("No usage recorded yet.")
        return

    display(pd.DataFrame(usage_log))

    


# Log the token usage of the current `response`, then display the usage table.
record_usage(response)
show_usage_log()


Unnamed: 0,timestamp,model,prompt_tokens,completion_tokens,total_tokens
0,2025-10-12 12:41:48,openai/gpt-4o-mini,2794,1000,3794
