# TTP Target Extraction from FireEye APT39 PDF ➜ Per-Sentence TTP Classification

This notebook loads the **cleaned text** from the uploaded FireEye APT39 report, extracts single-sentence CTI statements, and classifies each sentence to a MITRE ATT&CK Technique (TTP) using your **local SFT model** (with TF‑IDF shortlisting and fallback).

In [1]:

# === 0) Setup & Config ===
# !pip install -q transformers accelerate torch --index-url https://download.pytorch.org/whl/cpu
# !pip install -q scikit-learn pandas numpy tqdm rapidfuzz

import os, re, json, logging
from typing import List, Dict, Any, Optional
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

try:
    import requests
    HAS_REQUESTS = True
except Exception:
    HAS_REQUESTS = False

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    HAS_TRANSFORMERS = True
except Exception:
    HAS_TRANSFORMERS = False



#DEFAULT_MITRE_JSON = "./data/input/MITRE-ATTACK_dataset_test.json"
#CLEAN_TEXT_PATH = "./apt39_report_clean.txt"
CLEAN_TEXT_PATH = "./operation_colbalt_kitty.txt"

MODEL_PATH = "./qwen3-4b-sft-merged-final--with-reasoning"
DEVICE = "auto"
MAX_NEW_TOKENS = 2056
TEMPERATURE = 0.0

USE_LLM_FOR_EXTRACTION = True
USE_RAG_FOR_EXTRACTION = False

RAG_API_URL = os.environ.get("DASHSCOPE_API_URL")
RAG_API_KEY = os.environ.get("DASHSCOPE_API_KEY")

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("ttp_pipeline")


In [4]:

CTI_HINT_WORDS = [
    "adversaries","attackers","threat actors","operators","may","can","often","attempt","abuse","leverage","used to",
    "persistence","privilege","credential","lateral movement","c2","exfiltrate","execute","injection","phishing",
    "masquerading","tunnel","obfuscation","process","dll","registry","wmi","service","scheduled task","cloud","api",
    "bypass","discovery","enumerate","ssh","rdp","web shell","mimikatz","psexec","winrar","7-zip"
]

def split_into_sentences(text: str) -> List[str]:
    text = re.sub(r"(e\.g|i\.e|etc|vs|Mr|Ms|Dr|Sr|Jr)\.", lambda m: m.group(0).replace(".","<DOT>"), text, flags=re.I)
    parts = re.split(r"(?<=[\.!?])\s+(?=[A-Z0-9])", text)
    parts = [p.replace("<DOT>", ".").strip() for p in parts if p and len(p.strip()) > 2]
    return parts

def is_cti_like(s: str) -> bool:
    sl = s.lower()
    return any(w in sl for w in CTI_HINT_WORDS) and len(s.split()) >= 6

def heuristic_extract_cti(text: str) -> List[str]:
    sents = split_into_sentences(text)
    return [s for s in sents if is_cti_like(s)]


In [5]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict, Any, Optional

model: Optional[AutoModelForCausalLM] = None
tokenizer: Optional[AutoTokenizer] = None

def load_model_and_tokenizer(model_path: str = MODEL_PATH):
    global model, tokenizer
    if not HAS_TRANSFORMERS:
        logger.error("Transformers library not found. Cannot load local model.")
        return

    try:
        logger.info(f"Loading tokenizer from: {model_path}")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True
        )

        logger.info(f"Loading CausalLM model from: {model_path}")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype="auto", # Or torch.bfloat16
            device_map=DEVICE,
            trust_remote_code=True
        )

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 确定设备
        try:
            model_device = next(model.parameters()).device
            logger.info(f"Model loaded successfully. Device: {model_device}")
        except Exception:
             logger.info(f"Model loaded successfully (device_map='{DEVICE}').")

    except Exception as e:
        logger.error(f"Failed to load model or tokenizer: {e}")
        model = None
        tokenizer = None

def _gen_raw(prompt_messages: List[Dict[str, str]], temperature: float = 0.0, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    if model is None or tokenizer is None:
        logger.warning("Model or tokenizer not loaded. Call load_model_and_tokenizer() first.")
        return "{}"


    if temperature == 0.0:
        temperature = 1e-9

    try:

        input_ids = tokenizer.apply_chat_template(
            prompt_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt"
        ).to(model.device)

        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )


        response_ids = outputs[0][input_ids.shape[-1]:]
        response_text = tokenizer.decode(response_ids, skip_special_tokens=True)

        return response_text

    except Exception as e:
        logger.error(f"Error during model.generate: {e}")
        return "{}"

def _best_json(text: str) -> Optional[Dict[str, Any]]:
    try:
        return json.loads(text)
    except Exception:
        pass
    m = re.search(r"\{[\s\S]*\}", text)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            pass
    cleaned = re.sub(r"'", '\"', text)
    cleaned = re.sub(r",\s*([}\]])", r"\1", cleaned)
    try:
        return json.loads(cleaned)
    except Exception:
        return None


In [6]:

EXTRACT_SYSTEM = """You are a cyber threat intel analyst. Extract a set of *single-sentence* CTI statements from the input.
Each sentence must be self-contained and describe one concrete attacker action or capability from a defender's perspective.
Return strict JSON: {"statements": ["...", "...", "..."]}"""
EXTRACT_USER_TMPL = """Long text:
```
{TEXT}
```
Respond with only JSON as specified.
"""

def llm_extract_cti(text: str) -> List[str]:

    messages = [
        {"role": "system", "content": EXTRACT_SYSTEM},
        {"role": "user", "content": EXTRACT_USER_TMPL.format(TEXT=text)}
    ]

    # 调用 _gen_raw (来自 Cell 4)，传递消息列表
    out = _gen_raw(messages, temperature=0.0, max_new_tokens=MAX_NEW_TOKENS)

    js = _best_json(out) or {}


    stmts = js.get("statements") or []
    clean = []
    for s in stmts:
        s = re.sub(r"\s+", " ", str(s)).strip()
        if len(s) >= 6 and s.endswith(('.', '!', '?')):
            clean.append(s)
        elif len(s) >= 6:
            clean.append(s + ".")
    return clean

def maybe_rag_extract(text: str, current: List[str]) -> List[str]:
    if not USE_RAG_FOR_EXTRACTION or not RAG_API_URL:
        return current
    if not HAS_REQUESTS:
        logger.warning("requests not installed; skipping RAG call.")
        return current
    try:
        payload = {"text": text}
        headers = {"Authorization": f"Bearer {RAG_API_KEY}"} if RAG_API_KEY else {}
        resp = requests.post(RAG_API_URL, json=payload, headers=headers, timeout=20)
        resp.raise_for_status()
        data = resp.json()
        rag_stmts = data.get("statements") or data.get("cti_statements") or []
        merged = list(dict.fromkeys([*current, *rag_stmts]))
        return merged
    except Exception as e:
        logger.warning(f"RAG call failed: {e}")
        return current

def extract_cti_statements(long_text: str) -> List[str]:
    heuristics = heuristic_extract_cti(long_text)
    if USE_LLM_FOR_EXTRACTION:
        llm = llm_extract_cti(long_text)
        base = llm if len(llm) >= max(3, len(heuristics)//2) else list(dict.fromkeys([*llm, *heuristics]))
    else:
        base = heuristics
    final = maybe_rag_extract(long_text, base)
    return final


In [7]:


SFT_SYSTEM_PROMPT = "You are a helpful assistant specialized in cybersecurity and the MITRE ATT&CK framework."
SFT_INSTRUCTION = """Analyze the CTI sentence. Identify the single best-matching MITRE ATT&CK technique.
Respond with the technique ID and its full name in the format:
[Final Answer]: <TTP_ID> : <TTP_Name>"""

def create_sft_input_text(sentence: str) -> str:
    return f"""CTI Sentence:
"{sentence}"
"""

def classify_sentence_ttp(sentence: str, temperature: float = 0.0) -> Dict[str, Any]:


    input_text = create_sft_input_text(sentence)
    messages = [
        {"role": "system", "content": SFT_SYSTEM_PROMPT},
        {"role": "user", "content": f"{SFT_INSTRUCTION}\n\n{input_text}"}
    ]

    raw = _gen_raw(messages, temperature=temperature, max_new_tokens=2056)

    tid = None
    tname = None # <--- 新增
    confidence = 0.0
    justification = "N/A (Parsing failed)"

    try:
        #寻找 [Final Answer]: 标签
        parts = re.split(r'\[Final Answer\]:', raw, flags=re.IGNORECASE)

        if len(parts) > 1:
            # 4b. 在标签 *之后* 的文本 (parts[1]) 中查找
            answer_part = parts[1].strip()

            # 匹配 Txxxx | Technique Name
            # (T\d{4}(\.\d{3})?)   -> 匹配 TTP ID (Group 1)
            # \s*\|\s* -> 匹配 | 分隔符
            # (.*)                 -> 匹配 TTP Name (Group 3)
            match = re.search(r'\b(T\d{4}(\.\d{3})?)\b\s*:\s*(.*)', answer_part)

            if match:
                tid = match.group(1).strip()
                tname = match.group(3).strip() # <--- 获取 Name

                # 如果名称后面还有多余的行（如模型的思考过程），清理它
                if '\n' in tname:
                    tname = tname.split('\n')[0].strip()

                confidence = 0.90 # 置信度高（因为模型遵循了格式）
                justification = f"Parsed from [Final Answer]: {answer_part}"
                logger.info(f"Parsed TTP: '{tid} : {tname}' from [Final Answer].")

            else:
                logger.warning(f"Found [Final Answer] tag, but could not parse 'TID | Name' format from: {answer_part}")
                justification = "N/A (Found [Final Answer], but format mismatch)"

        else:
            # 4c. 如果 [Final Answer] 标签未找到 (这是您当前遇到的错误)
            logger.warning(f"Could not parse [Final Answer] tag. Output was: {raw[:200]}...")
            justification = "N/A (No [Final Answer] tag found)"

    except Exception as e:
        logger.warning(f"Error parsing raw output: {e}")
        tid = None
        tname = None



    return {
        "techniques": tid,
        "sub_techniques": tname,
        "confidence": confidence,
        "justification": justification,
        "raw_output": raw
    }

def classify_all(statements: List[str]) -> pd.DataFrame:
    rows = []
    for s in tqdm(statements, desc="Classifying sentences"):
        pred = classify_sentence_ttp(s)

        rows.append({"sentence": s,
                     "techniques": pred.get("techniques"),
                     "sub_techniques": pred.get("sub_techniques"),
                     "confidence": pred.get("confidence", None),
                    })
    return pd.DataFrame(rows)


In [8]:

def load_statements(path: str):
    p = Path(path)
    suf = p.suffix.lower()
    if suf == ".jsonl":
        with p.open("r", encoding="utf-8") as f:
            return [json.loads(line)["sentence"].strip()
                    for line in f if line.strip()]
    elif suf == ".json":
        data = json.loads(p.read_text(encoding="utf-8"))
        if isinstance(data, dict) and "statements" in data:
            return [s.strip() for s in data["statements"] if str(s).strip()]
        elif isinstance(data, list):
            return [d["sentence"].strip() for d in data if "sentence" in d and str(d["sentence"]).strip()]
        else:
            raise ValueError("Unsupported JSON schema")
    else:
        return [ln.strip() for ln in p.read_text(encoding="utf-8").splitlines() if ln.strip()]



from pathlib import Path
CLEAN_PATH = Path(CLEAN_TEXT_PATH)
assert CLEAN_PATH.exists(), f"Clean text not found at {CLEAN_PATH}; run the preprocessing step first."

LONG_TEXT = CLEAN_PATH.read_text(encoding="utf-8")

try:
    if USE_LLM_FOR_EXTRACTION or True:
        load_model_and_tokenizer()
except Exception as e:
    logger.warning(f"Model load failed (heuristics + TF-IDF fallback still works): {e}")

#cti_statements = extract_cti_statements(LONG_TEXT)
cti_statements = load_statements("operation_colbalt_kitty_ttp.jsonl")
print(f"Extracted {len(cti_statements)} CTI statements.")
for i, s in enumerate(cti_statements[:20], 1):
    print(f"{i:02d}. {s}")

# (修改) 不再需要 shortlist_k
df_result = classify_all(cti_statements)
df_result


2025-11-08 18:44:57,631 | INFO | Loading tokenizer from: ./qwen3-4b-sft-merged-final--with-reasoning
2025-11-08 18:44:57,848 | INFO | Loading CausalLM model from: ./qwen3-4b-sft-merged-final--with-reasoning
`torch_dtype` is deprecated! Use `dtype` instead!
2025-11-08 18:44:57,987 | INFO | We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2025-11-08 18:45:01,024 | INFO | Model loaded successfully. Device: cuda:0


Extracted 115 CTI statements.
01. Dubbed Operation Cobalt Kitty, the APT targeted a global corporation based in Asia with the goal of stealing proprietary business information. The threat actor targeted the company’s top-level management by using spear-phishing attacks as the initial penetration vector, ultimately compromising the computers of vice presidents, senior directors and other key personnel in the operational departments. During Operation Cobalt Kitty, the attackers compromised more than 40 PCs and servers, including the domain controller, file servers, Web application server and database server.
02. The attackers arsenal consisted of modified publicly-available tools as well as six undocumented custom-built tools, which Cybereason considers the threat actor’s signature tools. Among these tools are two backdoors that exploited DLL sideloading attack in Microsoft, Google and Kaspersky applications. In addition, they developed a novel and stealthy backdoor that targets Microsof

Classifying sentences:   0%|          | 0/115 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-11-08 18:45:09,199 | INFO | Parsed TTP: 'T1566 : Phishing' from [Final Answer].
2025-11-08 18:45:18,542 | INFO | Parsed TTP: 'T1574 : Hijack Execution Flow' from [Final Answer].
2025-11-08 18:45:27,061 | INFO | Parsed TTP: 'T1588 : Obtain Capabilities' from [Final Answer].
2025-11-08 18:45:37,521 | INFO | Parsed TTP: 'T1105 : Ingress Tool Transfer' from [Final Answer].
2025-11-08 18:45:44,932 | INFO | Parsed TTP: 'T1055 : Process Injection' from [Final Answer].
2025-11-08 18:45:54,138 | INFO | Parsed TTP: 'T1059 : Command and Scripting Interpreter' from [Final Answer].
2025-11-08 18:46:04,236 | INFO | Parsed TTP: 'T1566 : Phishing' from [Final Answer].
2025-11-08 18:46:12,620 | INFO | Parsed TTP: 'T1053 : Scheduled Task/Job' from [Final Answer].
2025-11-08

Unnamed: 0,sentence,techniques,sub_techniques,confidence
0,"Dubbed Operation Cobalt Kitty, the APT targete...",T1566,Phishing,0.9
1,The attackers arsenal consisted of modified pu...,T1574,Hijack Execution Flow,0.9
2,Cybereason also attributes the recently report...,T1588,Obtain Capabilities,0.9
3,"Finally, this report offers a rare glimpse int...",T1105,Ingress Tool Transfer,0.9
4,Phase one: Fileless operation (PowerShell and ...,T1055,Process Injection,0.9
...,...,...,...,...
110,Customized Windows Credentials Dumper - A Powe...,T1059,Command and Scripting Interpreter,0.9
111,The attackers specifically used it to obtain O...,T1003,OS Credential Dumping,0.9
112,An analysis of this arsenal shows that the att...,T1071,Application Layer Protocol,0.9
113,The attackers’ preference to use a fileless in...,T1546,Event Triggered Execution,0.9


In [9]:

from pathlib import Path
import json


def _get_show_df():
    try:
        from IPython.display import display  # type: ignore
        def _show_df(name, df):
            print(f"[Table] {name} (shape={df.shape})")
            display(df)
        return _show_df
    except Exception:
        def _show_df(name, df):
            print(f"[Table] {name} (shape={df.shape})")
            print(df.head(20).to_string(index=False))
        return _show_df

show_df = _get_show_df()



csv_path   = "./ttp_results_from_pdf1.csv"
jsonl_path = "./ttp_results_from_pdf1.jsonl"

df_result.to_csv(csv_path, index=False, encoding="utf-8")
with open(jsonl_path, "w", encoding="utf-8") as f:
    for _, row in df_result.iterrows():
        f.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")

#Final CTI sentences actually used for classification
sent_jsonl = "./final_cti_sentences.jsonl"

sent_list = df_result["sentence"].tolist()

with open(sent_jsonl, "w", encoding="utf-8") as f:
    for s in sent_list:
        f.write(json.dumps({"sentence": s}, ensure_ascii=False) + "\n")


show_df("Per-sentence TTP Results (from PDF)", df_result)
print(f"Saved CSV   : {csv_path}")
print(f"Saved JSONL : {jsonl_path}")
print(f"Saved final sentences (jsonl): {sent_jsonl}")


[Table] Per-sentence TTP Results (from PDF) (shape=(115, 4))


Unnamed: 0,sentence,techniques,sub_techniques,confidence
0,"Dubbed Operation Cobalt Kitty, the APT targete...",T1566,Phishing,0.9
1,The attackers arsenal consisted of modified pu...,T1574,Hijack Execution Flow,0.9
2,Cybereason also attributes the recently report...,T1588,Obtain Capabilities,0.9
3,"Finally, this report offers a rare glimpse int...",T1105,Ingress Tool Transfer,0.9
4,Phase one: Fileless operation (PowerShell and ...,T1055,Process Injection,0.9
...,...,...,...,...
110,Customized Windows Credentials Dumper - A Powe...,T1059,Command and Scripting Interpreter,0.9
111,The attackers specifically used it to obtain O...,T1003,OS Credential Dumping,0.9
112,An analysis of this arsenal shows that the att...,T1071,Application Layer Protocol,0.9
113,The attackers’ preference to use a fileless in...,T1546,Event Triggered Execution,0.9


Saved CSV   : ./ttp_results_from_pdf1.csv
Saved JSONL : ./ttp_results_from_pdf1.jsonl
Saved final sentences (jsonl): ./final_cti_sentences.jsonl
