In [ ]:
# Parse Zeek dns.log and compute suspicion metrics
import math
import json
import re
from collections import defaultdict, Counter
from pathlib import Path


def canon(name: str) -> str:
    """Return *name* stripped, lower-cased and ending in a single root dot.

    Falsy input (``''`` or ``None``) is returned unchanged so callers can
    test the result for truthiness.
    """
    if not name:
        return name
    s = name.strip().lower()
    return s if s.endswith('.') else s + '.'


def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of the characters of *s* in bits per char.

    The empty string has entropy ``0.0``.
    """
    if not s:
        return 0.0
    n = len(s)
    return -sum((c / n) * math.log2(c / n) for c in Counter(s).values())


def parse_zeek_ascii(path: Path):
    """Yield one ``dict`` (field name -> raw string value) per data row.

    The separator defaults to a tab and is overridden by a ``#separator``
    header line, whose value may be hex-escaped (for example ``\\x09``).
    Column names come from the ``#fields`` header. Rows seen before any
    ``#fields`` header, and rows whose column count does not match it, are
    skipped. Values are not converted: markers such as ``-`` are passed
    through as-is.
    """
    sep = '\t'
    fields = []
    # errors='replace' keeps a single undecodable byte from aborting the parse.
    with path.open(encoding='utf-8', errors='replace') as f:
        for raw in f:
            line = raw.rstrip('\n')
            if not line:
                continue
            if line.startswith('#separator '):
                token = line.split(' ', 1)[1].strip()
                if token.startswith('\\x'):
                    token = bytes(token, 'utf-8').decode('unicode_escape')
                # An empty separator would make str.split() raise; keep the
                # previous one instead.
                if token:
                    sep = token
                continue
            if line.startswith('#fields'):
                rest = line[len('#fields'):]
                # Zeek delimits every header except '#separator' with the log
                # separator itself ('#fields<TAB>ts<TAB>uid...'); a space after
                # the keyword is still accepted for hand-edited files.
                for delim in (sep, ' '):
                    if rest.startswith(delim):
                        fields = rest[len(delim):].split(sep)
                        break
                continue
            if line.startswith('#'):
                continue
            if not fields:
                continue
            parts = line.split(sep)
            if len(parts) != len(fields):
                continue
            yield dict(zip(fields, parts))


dns_path = Path('../../logs/zeek/dns.log').resolve()
rows = list(parse_zeek_ascii(dns_path))
len(rows)

In [ ]:
# Aggregate per FQDN and compute heuristics
# per[fqdn] holds query counters ('total', 'txt', 'nx') plus metrics derived
# from the name itself ('len', 'labels', 'max_label', 'entropy').
per = defaultdict(lambda: {
    'total': 0, 'txt': 0, 'nx': 0, 'len': 0, 'labels': 0, 'max_label': 0, 'entropy': 0.0
})

for r in rows:
    raw_query = r.get('query', '')
    # '-' is Zeek's default unset-field marker, not a queried name.
    # NOTE(review): a log header may declare a different '#unset_field'.
    if raw_query == '-':
        continue
    qname = canon(raw_query)
    if not qname:
        continue
    qtype = r.get('qtype_name') or r.get('qtype')
    rcode = (r.get('rcode_name') or r.get('rcode') or '').upper()

    stats = per[qname]
    if not stats['total']:
        # The name-derived metrics depend only on qname (the dict key), so
        # compute them once on first sight rather than on every row.
        labels = qname.rstrip('.').split('.')
        stats['len'] = len(qname)
        stats['labels'] = len(labels)
        stats['max_label'] = max(len(label) for label in labels)
        # canon() already lower-cased the name; entropy is taken over the
        # letters and digits only.
        stats['entropy'] = shannon_entropy(re.sub(r'[^a-z0-9]', '', qname))

    stats['total'] += 1
    # Accept both the symbolic and the numeric form of the type / rcode.
    if str(qtype).upper() == 'TXT' or str(qtype) == '16':
        stats['txt'] += 1
    if rcode == 'NXDOMAIN' or rcode == '3':
        stats['nx'] += 1

# A name is a suspect when any single heuristic crosses its threshold.
suspects = []
for name, m in per.items():
    txt_ratio = (m['txt'] / m['total']) if m['total'] else 0
    nx_ratio = (m['nx'] / m['total']) if m['total'] else 0
    if (
        m['max_label'] >= 40
        or m['labels'] >= 8
        or txt_ratio >= 0.20
        or nx_ratio >= 0.50
        or m['entropy'] >= 3.2
    ):
        suspects.append((name, m, txt_ratio, nx_ratio))

sorted(s[0] for s in suspects)