# TTP Target Extraction from FireEye APT39 PDF ➜ Per-Sentence TTP Classification

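This notebook loads the **cleaned text** of the uploaded FireEye APT39 report, extracts single-sentence CTI statements from it, and maps each sentence to a MITRE ATT&CK technique ID (TTP) using your **local SFT model**. TF‑IDF similarity shortlists candidate techniques for each sentence; if the model's answer cannot be parsed or is not a known technique ID, the top TF‑IDF candidate is used as the fallback.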

In [1]:

# === 0) Setup & Config ===
# !pip install -q transformers accelerate torch --index-url https://download.pytorch.org/whl/cpu
# !pip install -q scikit-learn pandas numpy tqdm rapidfuzz

import os, re, json, logging
from typing import List, Dict, Any, Optional
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

try:
    import requests
    HAS_REQUESTS = True
except Exception:
    HAS_REQUESTS = False

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    HAS_TRANSFORMERS = True
except Exception:
    HAS_TRANSFORMERS = False

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    HAS_SKLEARN = True
except Exception:
    HAS_SKLEARN = False

# --- Input locations ---
# MITRE ATT&CK label file; the label-loading cell parses it as a JSON array or as JSON Lines.
DEFAULT_MITRE_JSON = "./data/input/MITRE-ATTACK_dataset_test.json"
# Cleaned report text consumed by the end-to-end cell.
CLEAN_TEXT_PATH = "./apt39_report_clean.txt"

# --- Local model settings ---
# Default checkpoint directory for load_local_model().
MODEL_PATH = "./qwen3-4b-sft-merged-final--with-reasoning"
# Forwarded as `device_map` when the model and the pipeline are created.
DEVICE = "auto"
# Default generation budget configured on the text-generation pipeline.
MAX_NEW_TOKENS = 256
# NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed.
TEMPERATURE = 0.0

# --- Extraction switches ---
# When True, extract_cti_statements() asks the local model for statements in addition to heuristics.
USE_LLM_FOR_EXTRACTION = True
# When True (and RAG_API_URL is set), maybe_rag_extract() merges statements from the remote endpoint.
USE_RAG_FOR_EXTRACTION = False

# Remote endpoint and credential are read from the environment (None when unset); never hardcode them.
RAG_API_URL = os.environ.get("DASHSCOPE_API_URL")
RAG_API_KEY = os.environ.get("DASHSCOPE_API_KEY")

# Single shared logger for every cell of the pipeline.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("ttp_pipeline")


In [2]:

# === 1) Load MITRE ATT&CK labels ===
def _first_value(record, *keys):
    """Return the first truthy value found under `keys` in `record`, or "" when none is set."""
    for key in keys:
        value = record.get(key)
        if value:
            return value
    return ""


# The label file is either one JSON array or JSON Lines (one object per non-empty line).
with open(DEFAULT_MITRE_JSON, "r", encoding="utf-8") as f:
    raw = f.read().strip()

if raw.startswith("["):
    mitre = json.loads(raw)
else:
    mitre = [json.loads(line) for line in raw.splitlines() if line.strip()]

# Lookup tables keyed by technique ID, plus two parallel lists that feed the TF-IDF index.
id2name: Dict[str, str] = {}
id2desc: Dict[str, str] = {}
label_texts: List[str] = []
label_ids: List[str] = []

for item in mitre:
    tid = _first_value(item, "ID", "id")
    name = _first_value(item, "name", "technique")
    desc = _first_value(item, "description", "desc")
    # Records without both an ID and a name cannot be used as labels.
    if tid and name:
        id2name[tid] = name
        id2desc[tid] = desc
        label_ids.append(tid)
        label_texts.append(f"{tid} | {name}\n{desc}")

print(f"Loaded {len(label_ids)} ATT&CK techniques/sub-techniques.")
# Preview of the first ten labels (cell output).
pd.DataFrame({"technique_id": label_ids[:10], "name": [id2name[x] for x in label_ids[:10]]})


Loaded 508 ATT&CK techniques/sub-techniques.


Unnamed: 0,technique_id,name
0,T1578.005,Modify Cloud Compute Infrastructure: Modify Cl...
1,T1071.003,Application Layer Protocol: Mail Protocols
2,T1055.003,Process Injection: Thread Execution Hijacking
3,T1550,Use Alternate Authentication Material
4,T1584.003,Compromise Infrastructure: Virtual Private Server
5,T1027.009,Obfuscated Files or Information: Embedded Payl...
6,T1648,Serverless Execution
7,T1505.004,Server Software Component: IIS Components
8,T1222.002,File and Directory Permissions Modification: L...
9,T1584.006,Compromise Infrastructure: Web Services


In [3]:

# === 2) Build TF-IDF index for shortlisting ===
if not HAS_SKLEARN:
    raise ImportError("scikit-learn is required for TF-IDF shortlisting.")

# Unigram + bigram TF-IDF fitted on the label texts; `label_vecs` holds one row per
# entry of `label_texts`, in the same order.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
label_vecs = vectorizer.fit_transform(label_texts)

def shortlist_ttp_candidates(query: str, top_k: int = 20) -> List[str]:
    """Return the IDs of the `top_k` labels most similar to `query`.

    The query is projected with the fitted module-level `vectorizer` and scored
    against `label_vecs` by cosine similarity; IDs come back in descending
    similarity order.
    """
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, label_vecs).ravel()
    # Negating the scores makes the ascending argsort yield best-first positions.
    ranked = np.argsort(-scores)
    return [label_ids[pos] for pos in ranked[:top_k]]


In [4]:

# === 3) Sentence splitting & CTI candidate heuristics ===
# Lower-case substrings whose presence marks a sentence as CTI-like (see is_cti_like).
CTI_HINT_WORDS = [
    # actor nouns and hedging / usage verbs
    "adversaries", "attackers", "threat actors", "operators",
    "may", "can", "often", "attempt", "abuse", "leverage", "used to",
    # tactic-level vocabulary
    "persistence", "privilege", "credential", "lateral movement", "c2",
    "exfiltrate", "execute", "injection", "phishing",
    # techniques and platform artefacts
    "masquerading", "tunnel", "obfuscation", "process", "dll", "registry",
    "wmi", "service", "scheduled task", "cloud", "api",
    "bypass", "discovery", "enumerate",
    # protocols and named tools
    "ssh", "rdp", "web shell", "mimikatz", "psexec", "winrar", "7-zip",
]

def split_into_sentences(text: str) -> List[str]:
    """Split free text into sentence-like chunks.

    A break is placed after '.', '!' or '?' when it is followed by whitespace and
    then an upper-case letter or a digit. Periods that belong to a small set of
    abbreviations (e.g., i.e., etc., vs., Mr., Ms., Dr., Sr., Jr.) are shielded
    first so they do not cause a break.

    Args:
        text: Raw text; ``None`` or an empty string yields an empty list.

    Returns:
        Stripped chunks longer than two characters, in document order.
    """
    if not text:
        return []
    # The leading \b matters: without it, and with re.I, every word ending in
    # "ms." or "vs." ("systems.", "firms.") was taken for "Ms."/"vs." and the
    # sentence break after it was silently lost.
    shielded = re.sub(
        r"\b(?:e\.g|i\.e|etc|vs|Mr|Ms|Dr|Sr|Jr)\.",
        lambda m: m.group(0).replace(".", "<DOT>"),
        text,
        flags=re.I,
    )
    chunks = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", shielded)
    # Restore the shielded periods and drop empty or near-empty fragments.
    return [c.replace("<DOT>", ".").strip() for c in chunks if c and len(c.strip()) > 2]

def is_cti_like(s: str) -> bool:
    """Tell whether a sentence looks like a CTI statement.

    True when the lower-cased sentence contains at least one entry of
    CTI_HINT_WORDS as a substring and has six or more whitespace-separated tokens.
    """
    lowered = s.lower()
    if not any(hint in lowered for hint in CTI_HINT_WORDS):
        return False
    return len(s.split()) >= 6

def heuristic_extract_cti(text: str) -> List[str]:
    """Split `text` into sentences and keep those that is_cti_like() accepts, in order."""
    return list(filter(is_cti_like, split_into_sentences(text)))


In [5]:

# === 4) Local SFT model loader & helpers ===
# Lazily populated by load_local_model(); None means "not loaded yet".
tokenizer = lm = gen = None

def load_local_model(model_path: str = MODEL_PATH, device: str = DEVICE):
    """Load the local checkpoint and publish it through the module-level globals.

    Sets `tokenizer`, `lm` and `gen` (a text-generation pipeline whose default
    budget is MAX_NEW_TOKENS) and returns `gen`.

    Args:
        model_path: Directory or identifier handed to `from_pretrained`.
        device: Value forwarded as `device_map` to both the model and the pipeline.

    Raises:
        ImportError: If `transformers` could not be imported.
    """
    global tokenizer, lm, gen
    if not HAS_TRANSFORMERS:
        raise ImportError("transformers is required for local SFT inference.")

    logger.info(f"Loading local model from: {model_path}")
    # NOTE: trust_remote_code=True lets the checkpoint ship and run its own Python code;
    # only point this at checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=True,
        trust_remote_code=True,
    )
    lm = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
    )
    gen = pipeline(
        "text-generation",
        model=lm,
        tokenizer=tokenizer,
        device_map=device,
        max_new_tokens=MAX_NEW_TOKENS,
    )
    return gen

def _gen_raw(prompt: str, temperature: float = 0.0, max_new_tokens: int = 256) -> str:
    """Run the generation pipeline on `prompt` and return only the newly generated text.

    The model is loaded on first use. Sampling is enabled only for a positive
    `temperature`. If the pipeline output starts with the prompt, that prefix is
    removed; the result is whitespace-stripped either way.
    """
    if gen is None:
        load_local_model()

    sampling = temperature > 0
    results = gen(prompt, do_sample=sampling, temperature=temperature, max_new_tokens=max_new_tokens)
    generated = results[0]["generated_text"]
    completion = generated[len(prompt):] if generated.startswith(prompt) else generated
    return completion.strip()

def _best_json(text: str) -> Optional[Dict[str, Any]]:
    try:
        return json.loads(text)
    except Exception:
        pass
    m = re.search(r"\{[\s\S]*\}", text)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            pass
    cleaned = re.sub(r"'", '\"', text)
    cleaned = re.sub(r",\s*([}\]])", r"\1", cleaned)
    try:
        return json.loads(cleaned)
    except Exception:
        return None


In [6]:

# === 5) LLM-based CTI extraction ===
# System instruction for CTI extraction; llm_extract_cti() embeds it between
# <<SYSTEM>> ... <</SYSTEM>> markers and expects a JSON object with a "statements" list back.
EXTRACT_SYSTEM = """You are a cyber threat intel analyst. Extract a set of *single-sentence* CTI statements from the input.
Each sentence must be self-contained and describe one concrete attacker action or capability from a defender's perspective.
Return strict JSON: {"statements": ["...", "...", "..."]}"""
# User-turn template; {TEXT} is the only str.format placeholder and receives the source text.
EXTRACT_USER_TMPL = """Long text:
```
{TEXT}
```
Respond with only JSON as specified.
"""

def llm_extract_cti(text: str) -> List[str]:
    """Ask the local model for single-sentence CTI statements found in `text`.

    The reply is parsed with _best_json() and its "statements" entry is normalised:
    whitespace is collapsed, entries shorter than six characters are dropped, and
    a trailing period is added when the sentence has no terminal punctuation.

    Malformed replies degrade to an empty list instead of raising: a reply that
    is not a JSON object, a missing or non-list "statements" value, and non-string
    items are all ignored. A bare-string "statements" value counts as one statement.

    Args:
        text: Source text to mine for statements.

    Returns:
        Cleaned statements in the order the model produced them (possibly empty).
    """
    prompt = f"<<SYSTEM>>\n{EXTRACT_SYSTEM}\n<</SYSTEM>>\n\n{EXTRACT_USER_TMPL.format(TEXT=text)}"
    parsed = _best_json(_gen_raw(prompt))

    # The parser may hand back None (or, in older versions, non-dict JSON).
    stmts = parsed.get("statements") if isinstance(parsed, dict) else None
    if isinstance(stmts, str):
        # Iterating a bare string would walk it character by character.
        stmts = [stmts]
    elif not isinstance(stmts, (list, tuple)):
        stmts = []

    clean: List[str] = []
    for raw_stmt in stmts:
        if not isinstance(raw_stmt, str):
            continue  # nested objects/numbers are not sentences
        sentence = re.sub(r"\s+", " ", raw_stmt).strip()
        if len(sentence) < 6:
            continue
        if not sentence.endswith((".", "!", "?")):
            sentence += "."
        clean.append(sentence)
    return clean

def maybe_rag_extract(text: str, current: List[str]) -> List[str]:
    """Optionally merge statements returned by the remote RAG endpoint into `current`.

    `current` is returned unchanged when RAG is disabled, no endpoint URL is
    configured, `requests` is unavailable, or the call fails for any reason
    (failures are logged as warnings). On success the result is `current`
    followed by the remote statements, de-duplicated with first-seen order kept.
    """
    rag_enabled = USE_RAG_FOR_EXTRACTION and RAG_API_URL
    if not rag_enabled:
        return current
    if not HAS_REQUESTS:
        logger.warning("requests not installed; skipping RAG call.")
        return current

    try:
        # The bearer token is attached only when a key is configured.
        auth_headers = {"Authorization": f"Bearer {RAG_API_KEY}"} if RAG_API_KEY else {}
        resp = requests.post(RAG_API_URL, json={"text": text}, headers=auth_headers, timeout=20)
        resp.raise_for_status()
        data = resp.json()
        rag_stmts = data.get("statements") or data.get("cti_statements") or []
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys([*current, *rag_stmts]))
    except Exception as e:
        logger.warning(f"RAG call failed: {e}")
        return current

def extract_cti_statements(long_text: str) -> List[str]:
    """Produce the final list of CTI statements for `long_text`.

    Heuristic sentence filtering always runs. When USE_LLM_FOR_EXTRACTION is set,
    the model's statements are used on their own if there are at least
    ``max(3, len(heuristics) // 2)`` of them; otherwise they are merged with the
    heuristic sentences (model output first, duplicates removed). The result is
    finally passed through maybe_rag_extract().

    If the LLM step raises (e.g. the model could not be loaded), the failure is
    logged and extraction continues with the heuristic sentences, so the
    pipeline still produces output without a working model.
    """
    heuristics = heuristic_extract_cti(long_text)
    base = heuristics
    if USE_LLM_FOR_EXTRACTION:
        try:
            llm = llm_extract_cti(long_text)
        except Exception as e:
            # Deliberate best-effort: degrade to heuristics rather than abort the run.
            logger.warning(f"LLM extraction failed; using heuristic sentences only: {e}")
            llm = []
        if len(llm) >= max(3, len(heuristics) // 2):
            base = llm
        else:
            base = list(dict.fromkeys([*llm, *heuristics]))
    return maybe_rag_extract(long_text, base)


In [7]:

# === 6) Per-sentence TTP classification ===
# System instruction for classification; classify_sentence_ttp() embeds it between
# <<SYSTEM>> ... <</SYSTEM>> markers and reads "technique_id", "technique_name" and
# "confidence" from the JSON reply.
CLASSIFY_SYSTEM = """You are a cyber threat intel analyst. Your task is to map a single CTI sentence to the *best* MITRE ATT&CK technique ID from a shortlist.
Return strict JSON: {"technique_id": "TXXXX(.YYY)?", "technique_name": "...", "confidence": 0-1}"""
# User-turn template; str.format placeholders: {SENTENCE} (the CTI sentence) and
# {SHORTLIST} (one "- <id> :: <name>" line per candidate).
CLASSIFY_USER_TMPL = """CTI sentence:
"{SENTENCE}"

Shortlist (ID :: Name):
{SHORTLIST}

Respond with only JSON as specified.
"""

def classify_sentence_ttp(sentence: str, shortlist_ids: List[str], temperature: float = 0.0) -> Dict[str, Any]:
    """Map one CTI sentence to a technique ID, preferring the model's pick.

    The model is shown the sentence and the shortlist and asked for JSON. Its
    answer is accepted when "technique_id" (after trimming, and upper-casing if
    needed) is a key of `id2name`. Otherwise — including when generation or
    parsing raises — the first shortlist entry is used as the fallback.

    Args:
        sentence: The CTI sentence to classify.
        shortlist_ids: Candidate technique IDs, best first.
        temperature: Sampling temperature forwarded to the generator.

    Returns:
        A dict with at least:
          * "technique_id": accepted or fallback ID (None if the shortlist is empty),
          * "technique_name": canonical name from `id2name` (None if no ID),
          * "confidence": float in [0, 1] reported by the model, None when it is
            missing or invalid, or 0.35 for a fallback prediction.
        Any other keys returned by the model are preserved.
    """
    fallback_confidence = 0.35  # fixed score marking "top TF-IDF candidate, not the model's pick"

    lines = [f"- {tid} :: {id2name.get(tid, '')}" for tid in shortlist_ids]
    shortlist_text = "\n".join(lines)
    prompt = f"<<SYSTEM>>\n{CLASSIFY_SYSTEM}\n<</SYSTEM>>\n\n" + CLASSIFY_USER_TMPL.format(SENTENCE=sentence, SHORTLIST=shortlist_text)

    try:
        parsed = _best_json(_gen_raw(prompt, temperature=temperature, max_new_tokens=192))
    except Exception as e:
        # Deliberate best-effort: a broken/missing model must not abort the batch.
        logger.warning(f"LLM classification failed; using top TF-IDF candidate: {e}")
        parsed = None
    js: Dict[str, Any] = dict(parsed) if isinstance(parsed, dict) else {}

    tid = js.get("technique_id")
    if isinstance(tid, str):
        tid = tid.strip()
        # Accept "t1566.001" for "T1566.001" without assuming the key casing of id2name.
        if tid not in id2name and tid.upper() in id2name:
            tid = tid.upper()

    if not isinstance(tid, str) or tid not in id2name:
        tid = shortlist_ids[0] if shortlist_ids else None
        js["technique_id"] = tid
        js["technique_name"] = id2name.get(tid) if tid else None
        # Any confidence the model reported referred to an ID we rejected, so it is not reused.
        js["confidence"] = fallback_confidence
        return js

    js["technique_id"] = tid
    # Always report the canonical name, not whatever free text the model produced.
    js["technique_name"] = id2name[tid]
    try:
        confidence = float(js.get("confidence"))
    except (TypeError, ValueError):
        confidence = None
    if confidence is not None and not (0.0 <= confidence <= 1.0):  # also rejects NaN
        confidence = None
    js["confidence"] = confidence
    return js

def classify_all(statements: List[str], shortlist_k: int = 20) -> pd.DataFrame:
    """Classify every statement and collect the predictions in a DataFrame.

    Args:
        statements: CTI sentences to classify.
        shortlist_k: Number of TF-IDF candidates shown to the model per sentence.

    Returns:
        One row per statement with the columns "sentence", "pred_ttp_id",
        "pred_ttp_name", "confidence" and "shortlist". The columns are present
        even when `statements` is empty, so downstream cells can index them safely.
    """
    columns = ["sentence", "pred_ttp_id", "pred_ttp_name", "confidence", "shortlist"]
    rows = []
    for s in tqdm(statements, desc="Classifying sentences"):
        cands = shortlist_ttp_candidates(s, top_k=shortlist_k)
        pred = classify_sentence_ttp(s, cands)
        rows.append({
            "sentence": s,
            "pred_ttp_id": pred.get("technique_id"),
            "pred_ttp_name": pred.get("technique_name"),
            "confidence": pred.get("confidence", None),
            "shortlist": cands,
        })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)


In [8]:

# === 7) End-to-end (reads from cleaned file) ===
from pathlib import Path

CLEAN_PATH = Path(CLEAN_TEXT_PATH)
assert CLEAN_PATH.exists(), f"Clean text not found at {CLEAN_PATH}; run the preprocessing step first."
LONG_TEXT = CLEAN_PATH.read_text(encoding="utf-8")

# The model is loaded regardless of USE_LLM_FOR_EXTRACTION because the
# classification step calls it as well; a load failure is logged, not fatal here.
try:
    load_local_model()
except Exception as e:
    logger.warning(f"Model load failed (heuristics + TF-IDF fallback still works): {e}")

cti_statements = extract_cti_statements(LONG_TEXT)
print(f"Extracted {len(cti_statements)} CTI statements.")
# Preview at most the first 20 statements.
for i, s in enumerate(cti_statements[:20], start=1):
    print(f"{i:02d}. {s}")

df_result = classify_all(cti_statements, shortlist_k=20)
df_result


2025-11-08 00:57:40,225 | INFO | Loading local model from: ./qwen3-4b-sft-merged-final--with-reasoning
2025-11-08 00:57:40,682 | INFO | We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Extracted 12 CTI statements.
01. APT39 primarily leverages the SEAWEED and CACHEMONEY backdoors along with a specific variant of the POWBAT backdoor.
02. Government entities targeting suggests a potential secondary intent to collect geopolitical data that may benefit nation-state decision making.
03. Initial	Compromise For initial compromise, FireEye Intelligence has observed APT39 leverage spear phishing emails with malicious attachments and/or hyperlinks typically resulting in a POWBAT infection.
04. APT39 frequently registers and leverages domains that masquerade as legitimate web services and organizations that are relevant to the intended target.
05. Furthermore, this group has routinely identified and exploited vulnerable web servers of targeted organizations to install web shells, such as ANTAK and ASPXSPY, and used stolen legitimate credentials to compromise externally facing Outlook Web Access (OWA) resources.
06. Establish	Foothold,	Escalate	Privileges,	and	Internal	Reconnais

Classifying sentences:   0%|          | 0/12 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,sentence,pred_ttp_id,pred_ttp_name,confidence,shortlist
0,APT39 primarily leverages the SEAWEED and CACH...,T1195.003,Supply Chain Compromise: Compromise Hardware S...,0.35,"[T1195.003, T1566.003, T1566.001, T1566.004, T..."
1,Government entities targeting suggests a poten...,T1565,Data Manipulation,0.35,"[T1565, T1565.001, T1565.002, T1588.001, T1584..."
2,"Initial\tCompromise For initial compromise, Fi...",T1566,Phishing,0.35,"[T1566, T1584, T1584.004, T1566.002, T1087, T1..."
3,APT39 frequently registers and leverages domai...,T1584.006,Compromise Infrastructure: Web Services,0.35,"[T1584.006, T1036.004, T1505.003, T1102, T1608..."
4,"Furthermore, this group has routinely identifi...",T1505.003,Server Software Component: Web Shell,0.35,"[T1505.003, T1584.006, T1606.001, T1606, T1556..."
5,"Establish\tFoothold,\tEscalate\tPrivileges,\ta...",T1588.001,Obtain Capabilities: Malware,0.35,"[T1588.001, T1219, T1587.001, T1595.003, T1037..."
6,"During privilege escalation, freely available ...",T1555.004,Credentials from Password Stores: Windows Cred...,0.35,"[T1555.004, T1562.001, T1078.003, T1003.001, T..."
7,Internal reconnaissance has been performed usi...,T1560.003,Archive Collected Data: Archive via Custom Method,0.35,"[T1560.003, T1571, T1205.001, T1560, T1003, T1..."
8,"Lateral\tMovement,\tMaintain\tPresence,\tand\t...",T1021.001,Remote Services: Remote Desktop Protocol,0.35,"[T1021.001, T1210, T1021.004, T1563.001, T1570..."
9,"Custom tools such as REDTRIP, PINKTRIP, and BL...",T1136.001,Create Account: Local Account,0.35,"[T1136.001, T1090, T1003, T1095, T1560.003, T1..."


In [10]:

from pathlib import Path
import json

# Try to show DataFrame in current environment; fall back gracefully.
def _get_show_df():
    try:
        from IPython.display import display  # type: ignore
        def _show_df(name, df):
            print(f"[Table] {name} (shape={df.shape})")
            display(df)
        return _show_df
    except Exception:
        def _show_df(name, df):
            print(f"[Table] {name} (shape={df.shape})")
            print(df.head(20).to_string(index=False))
        return _show_df

show_df = _get_show_df()



# 1) Results table: sentence ➜ TTP
csv_path   = "./ttp_results_from_pdf.csv"
jsonl_path = "./ttp_results_from_pdf.jsonl"


def _json_safe(value):
    """Convert a DataFrame cell into a strictly JSON-serialisable Python value.

    numpy scalars/arrays become native Python values, sequences are converted
    element-wise, and non-finite floats (NaN, +/-inf) become None. The last step
    matters because json.dumps would otherwise emit the bare token NaN, which
    is not valid JSON.
    """
    if hasattr(value, "tolist"):  # numpy scalar or array -> native Python
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return [_json_safe(v) for v in value]
    if isinstance(value, float) and not (float("-inf") < value < float("inf")):
        return None
    return value


df_result.to_csv(csv_path, index=False, encoding="utf-8")
with open(jsonl_path, "w", encoding="utf-8") as f:
    for record in df_result.to_dict(orient="records"):
        safe_record = {key: _json_safe(val) for key, val in record.items()}
        f.write(json.dumps(safe_record, ensure_ascii=False) + "\n")

# 2) Final CTI sentences actually used for classification
sent_jsonl = "./final_cti_sentences.jsonl"

sent_list = df_result["sentence"].tolist()

with open(sent_jsonl, "w", encoding="utf-8") as f:
    for s in sent_list:
        f.write(json.dumps({"sentence": s}, ensure_ascii=False) + "\n")

# Show & paths
show_df("Per-sentence TTP Results (from PDF)", df_result)
print(f"Saved CSV   : {csv_path}")
print(f"Saved JSONL : {jsonl_path}")
print(f"Saved final sentences (jsonl): {sent_jsonl}")


[Table] Per-sentence TTP Results (from PDF) (shape=(12, 5))


Unnamed: 0,sentence,pred_ttp_id,pred_ttp_name,confidence,shortlist
0,APT39 primarily leverages the SEAWEED and CACH...,T1195.003,Supply Chain Compromise: Compromise Hardware S...,0.35,"[T1195.003, T1566.003, T1566.001, T1566.004, T..."
1,Government entities targeting suggests a poten...,T1565,Data Manipulation,0.35,"[T1565, T1565.001, T1565.002, T1588.001, T1584..."
2,"Initial\tCompromise For initial compromise, Fi...",T1566,Phishing,0.35,"[T1566, T1584, T1584.004, T1566.002, T1087, T1..."
3,APT39 frequently registers and leverages domai...,T1584.006,Compromise Infrastructure: Web Services,0.35,"[T1584.006, T1036.004, T1505.003, T1102, T1608..."
4,"Furthermore, this group has routinely identifi...",T1505.003,Server Software Component: Web Shell,0.35,"[T1505.003, T1584.006, T1606.001, T1606, T1556..."
5,"Establish\tFoothold,\tEscalate\tPrivileges,\ta...",T1588.001,Obtain Capabilities: Malware,0.35,"[T1588.001, T1219, T1587.001, T1595.003, T1037..."
6,"During privilege escalation, freely available ...",T1555.004,Credentials from Password Stores: Windows Cred...,0.35,"[T1555.004, T1562.001, T1078.003, T1003.001, T..."
7,Internal reconnaissance has been performed usi...,T1560.003,Archive Collected Data: Archive via Custom Method,0.35,"[T1560.003, T1571, T1205.001, T1560, T1003, T1..."
8,"Lateral\tMovement,\tMaintain\tPresence,\tand\t...",T1021.001,Remote Services: Remote Desktop Protocol,0.35,"[T1021.001, T1210, T1021.004, T1563.001, T1570..."
9,"Custom tools such as REDTRIP, PINKTRIP, and BL...",T1136.001,Create Account: Local Account,0.35,"[T1136.001, T1090, T1003, T1095, T1560.003, T1..."


Saved CSV   : ./ttp_results_from_pdf.csv
Saved JSONL : ./ttp_results_from_pdf.jsonl
Saved final sentences (jsonl): ./final_cti_sentences.jsonl
