In [None]:
%load_ext lab_black

---

In [2]:
import sys

sys.path.append("../")

In [3]:
from readability import Document
from IPython.core.display import display, HTML

In [9]:
# Read the raw HTML of the report under analysis into `data`.
# Alternative inputs used while experimenting (swap the path to switch):
#   "./reports/546fc7bf11d4083bc021c37f/operation_doubletap.html"
#   "../data/hafnium-targeting-exchange-servers.html"
with open(
    "../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html",
    mode="r",
    encoding="utf-8",
) as f:
    data = f.read()

---

In [11]:
from lxml import etree
from collections import namedtuple
from functional import seq

In [12]:
class Analyzer(object):
    """Pick out the elements of an HTML article that carry readable text.

    The HTML source is wrapped in a readability ``Document`` (built with
    ``xpath=True``); ``article_contents`` parses that document's summary and
    keeps only the elements that pass both ``has_printable_*`` checks.
    """

    ArticleElement = namedtuple("ArticleElement", ["xpath", "tag"])
    # NOTE(review): this binds the ``namedtuple`` factory itself, not a record
    # type - it looks like an unfinished placeholder. Left as is so existing
    # references to ``Analyzer.ArticleContent`` keep working.
    ArticleContent = namedtuple

    def __init__(self, document):
        """Wrap ``document`` (HTML source text) in a readability ``Document``."""
        self.document = Document(document, xpath=True)

    @staticmethod
    def has_printable_content(element):
        """Return True when ``element.text`` or ``element.tail`` holds visible text.

        A value counts as blank when it is ``None``, the empty string, or
        consists only of whitespace (newline, tab, carriage return, space...).
        """

        def is_blank(content):
            # ``"".isspace()`` is False, so emptiness must be tested explicitly;
            # otherwise an element with empty text would count as printable.
            return not content or content.isspace()

        return not (is_blank(element.text) and is_blank(element.tail))

    @staticmethod
    def has_printable_tag(element):
        """Return True unless the element is a ``script`` (JavaScript) or ``style`` (CSS) tag."""
        return element.tag not in ("script", "style")

    @property
    def article_contents(self):
        """Return a ``seq`` of the summary's elements that have printable text and tag."""
        # parse the readability summary and collect every element in it
        summary_html = self.document.summary()
        root = etree.HTML(summary_html)
        elements = root.xpath("//*")
        # keep only the printable elements
        printable_elements = (
            seq(elements)
            .filter(self.has_printable_content)
            .filter(self.has_printable_tag)
        )
        # TODO: map each element to a content record, e.g.
        #   (element.get("x"), element.tag,
        #    (("text", element.text), ("tail", element.tail)))
        return printable_elements

In [34]:
a = Analyzer(data)

0,1
<Element a at 0x7ffadcf82608>,
,
,
,
,
,
<Element code at 0x7ffadcf82a88>,<Element code at 0x7ffadcf82d08>
,
,
,


---

# Yara

In [8]:
from pyparsing import (
    Combine,
    Literal,
    Keyword,
    Word,
    alphanums,
    empty,
    printables,
    QuotedString,
)

In [9]:
yara_sample = """rule Agent_BTZ_Proxy_DLL_2 {
   meta:
      description = "Detects Agent-BTZ Proxy DLL - activeds.dll"
      author = "Florian Roth"
      reference = "http://www.intezer.com/new-variants-of-agent-btz-comrat-found/"
      date = "2017-08-07"
      hash1 = "73db4295c5b29958c5d93c20be9482c1efffc89fc4e5c8ba59ac9425a4657a88"
      hash2 = "380b0353ba8cd33da8c5e5b95e3e032e83193019e73c71875b58ec1ed389bdac"
      hash3 = "f27e9bba6a2635731845b4334b807c0e4f57d3b790cecdc77d8fef50629f51a2"
   strings:
      $s1 = { 38 21 38 2C 38 37 38 42 38 4D 38 58 38 63 38 6E
               38 79 38 84 38 8F 38 9A 38 A5 38 B0 38 BB 38 C6
               38 D1 38 DC 38 E7 38 F2 38 FD 38 08 39 13 39 1E
               39 29 39 34 39 3F 39 4A 39 55 39 60 39 6B 39 76
               39 81 39 8C 39 97 39 A2 39 AD 39 B8 39 C3 39 CE
               39 D9 39 E4 39 EF 39 FA 39 05 3A 10 3A 1B 3A 26
               3A 31 3A 3C 3A 47 3A 52 3A 5D 3A 68 3A 73 3A 7E
               3A 89 3A 94 3A 9F 3A AA 3A B5 3A C0 3A CB 3A D6
               3A E1 3A EC 3A F7 3A }
      $s2 = "activeds.dll" ascii fullword
   condition:
      uint16(0) == 0x5a4d and filesize < 200KB and all of them and pe.imphash() == "09b7c73fbe5529e6de7137e3e8268b7b"
}"""

In [10]:
from pyparsing import nestedExpr, originalTextFor

In [11]:
# A YARA rule is matched as one contiguous token:
#   "rule" <one space> <identifier> <one space> "{ ...balanced braces... }"
# Combine requires the pieces to be adjacent, hence the explicit space matchers.
expr = Combine(
    Literal("rule")
    + Word(" ", exact=1)
    + Word(alphanums + "_")
    + Word(" ", exact=1)
    # originalTextFor keeps the rule body exactly as written in the input
    + originalTextFor(nestedExpr("{", "}"))
)

# label every match so results can be looked up under "YARA"
parser = expr("YARA")

In [12]:
seq(parser.scanString(yara_sample))

0,1,2
"['rule Agent_BTZ_Proxy_DLL_2 {\n meta:\n description = ""Detects Agent-BTZ Proxy DLL - activeds.dll""\n author = ""Florian Roth""\n reference = ""http://www.intezer.com/new-variants-of-agent-btz-comrat-found/""\n date = ""2017-08-07""\n hash1 = ""73db4295c5b29958c5d93c20be9482c1efffc89fc4e5c8ba59ac9425a4657a88""\n hash2 = ""380b0353ba8cd33da8c5e5b95e3e032e83193019e73c71875b58ec1ed389bdac""\n hash3 = ""f27e9bba6a2635731845b4334b807c0e4f57d3b790cecdc77d8fef50629f51a2""\n strings:\n $s1 = { 38 21 38 2C 38 37 38 42 38 4D 38 58 38 63 38 6E\n 38 79 38 84 38 8F 38 9A 38 A5 38 B0 38 BB 38 C6\n 38 D1 38 DC 38 E7 38 F2 38 FD 38 08 39 13 39 1E\n 39 29 39 34 39 3F 39 4A 39 55 39 60 39 6B 39 76\n 39 81 39 8C 39 97 39 A2 39 AD 39 B8 39 C3 39 CE\n 39 D9 39 E4 39 EF 39 FA 39 05 3A 10 3A 1B 3A 26\n 3A 31 3A 3C 3A 47 3A 52 3A 5D 3A 68 3A 73 3A 7E\n 3A 89 3A 94 3A 9F 3A AA 3A B5 3A C0 3A CB 3A D6\n 3A E1 3A EC 3A F7 3A }\n $s2 = ""activeds.dll"" ascii fullword\n condition:\n uint16(0) == 0x5a4d and filesize < 200KB and all of them and pe.imphash() == ""09b7c73fbe5529e6de7137e3e8268b7b""\n}']",0,1213


---

In [13]:
# from box import Box

In [23]:
# feed = seq.json("../data/hellokitty.feeds.json").to_dict()
feed = seq.json("../data/hafnium.feeds.json").to_dict()
# feed = seq.json("../data/sunspot.feeds.json").to_dict()

feed

{'id': '603eb1abdd4812819c64e197',
 'name': 'HAFNIUM targeting Exchange Servers with 0-day exploits',
 'description': '"Microsoft has detected multiple 0-day exploits being used to attack on-premises versions of Microsoft Exchange Server in limited and targeted attacks. In the attacks observed, the threat actor used these vulnerabilities to access on-premises Exchange servers which enabled access to email accounts, and allowed installation of additional malware to facilitate long-term access to victim environments. Microsoft Threat Intelligence Center (MSTIC) attributes this campaign with high confidence to HAFNIUM, a group assessed to be state-sponsored and operating out of China, based on observed victimology, tactics and procedures."',
 'author_name': 'AlienVault',
 'modified': '2021-03-09T21:04:17.962000',
 'created': '2021-03-02T21:44:11.611000',
 'revision': 16,
 'tlp': 'white',
 'public': 1,
 'adversary': 'HAFNIUM',
 'indicators': [{'id': 2881717437,
   'indicator': '097549cf7d0

---

```python
{
    "string": 'fadd8d7c13a18c251ded1f645ffea18a37f1c2de', 
    "masked": 'FileHash-SHA1',
    "label": {
        "label": "indicator", 
        "group": 2, 
        "role": None
    }
}
```

In [None]:
# Indicator processing: turn the feed's indicator dicts into Indicator records.
# `itemgetter` is imported here because the notebook's own
# `from operator import itemgetter` cell comes after this one; without the
# import, running the notebook top to bottom on a fresh kernel raises NameError.
from operator import itemgetter

Indicator = namedtuple("Indicator", ["index", "indicator", "type", "role"])

indicators = (
    seq(feed.get("indicators") or [])  # tolerate a feed without indicators
    .filter(lambda indicator: indicator.get("type") != "YARA")  # ignore yara
    .enumerate()
    .starmap(
        lambda index, indicator: (
            index,
            *itemgetter("indicator", "type", "role")(indicator),  # flatten
        )
    )
    .starmap(
        # `kind` instead of `type` so the builtin is not shadowed
        lambda index, indicator, kind, role: (
            index,
            indicator,
            kind,
            role if role != "" else None,  # replace "" with None
        )
    )
    .map(lambda fields: Indicator(*fields))
    .to_list()
)

In [19]:
from operator import itemgetter

In [24]:
class Label(object):
    """Derive masking labels for the indicators of a threat feed."""

    def __init__(self, feed):
        """Keep a reference to ``feed`` (the loaded feed object)."""
        self.feed = feed

    @staticmethod
    def get_label(indicator):
        """Return ``(target, replacement, label)`` for one feed indicator.

        ``indicator`` must provide the keys ``"indicator"``, ``"type"`` and
        ``"role"`` (``KeyError`` otherwise).

        * ``target`` is the indicator value itself.
        * ``replacement`` is the type wrapped in angle brackets, e.g. ``<CVE>``.
        * ``label`` is ``"Indicator"`` when there is no role, otherwise
          ``"Indicator-<Role>"`` with the role title-cased.
        """
        _indicator, _type, _role = itemgetter("indicator", "type", "role")(indicator)
        # TODO: ignore yara
        target = _indicator
        replacement = "<{}>".format(_type)
        # Both None and "" mean "no role" (the notebook's indicator pipeline
        # normalises "" to None); an empty role must not yield "Indicator-".
        label = "Indicator-{}".format(_role.title()) if _role else "Indicator"
        return target, replacement, label

    # tags or keyword is replacement

In [25]:
l = Label(feed)

- CVE
- FileHash-SHA256
- IPv4
- YARA


```bash
cat *.json | jq -c '.objects[] | select(.id | contains("indicator"))'
```

feed.indicators[].type | stix.objects[].type | stix.objects[].pattern_type | stix.objects[].labels
-----------------------|---------------------|-----------------------------|-----------------------
CVE                    | "vulnerability"     |                             |
URL                    |                     |                             | ["malicious-activity"]
domain                 | "indicator"         |                             | ["malicious-activity"]
FileHash-MD5           | "indicator"         | "stix"                      | ["malicious-activity"]
FileHash-SHA1          | "indicator"         |                             | ["malicious-activity"]
FileHash-SHA256        | "indicator"         |                             | ["malicious-activity"]
IPv4                   |                     |                             |
YARA                   | "indicator"         |                             | []
AttackTechnique*       | "attack-pattern"    |                             |
?                      | "malware"           |                             | ["adware"]
adversary*             | "threat-actor"      |                             | ["activist"]


**stix.objects[].type**
- identity
- indicator
    - **.pattern (.pattern_type)**
        - domain-name:value (stix)
        - email-message:from_ref.value (stix)
        - file:hashes.'SHA-1' (stix)
        - file:hashes.'SHA-256' (stix)
        - file:hashes.MD5 (stix)
        - ipv4-addr:value (stix)
        - ipv6-addr:value (stix)
        - mutex:value (stix)
        - url:value (stix)
        - rule / import (yara)
    
- report
- threat-actor
    - {name: .name, description: .description, aliases: .aliases}
- vulnerability
    - **.name**


  "activist"
  "backdoor"
  "command_and_control"
  "file_scanning"
  "hacking_tool"
  "malware_hosting"
  "memory_scanning"
  "phishing"
  "ransomware"
  "rat"
  "scanning_host"
  "threat-report"
  "trojan"
  "unknown"
  "worm"

In [27]:
seq(feed.get("indicators")).map(lambda indicator: Label.get_label(indicator)).show(30)

----------------------------------------------------------------  -----------------  -----------------------
097549cf7d0f76f0d99edf8b2d91c60977fd6a96e4b8c3c94b0b1733dc026d3e  <FileHash-SHA256>  Indicator
1631a90eb5395c4e19c7dbcbf611bbe6444ff312eb7937e286e4637cb9e72944  <FileHash-SHA256>  Indicator
2b6f1ebb2208e93ade4a6424555d6a8341fd6d9f60c25e44afe11008f5c1aad1  <FileHash-SHA256>  Indicator
4edc7770464a14f54d17f36dc9d0fe854f68b346b27b35a6f5839adf1f13f8ea  <FileHash-SHA256>  Indicator
511df0e2df9bfa5521b588cc4bb5f8c5a321801b803394ebc493db1ef3c78fa1  <FileHash-SHA256>  Indicator
65149e036fff06026d80ac9ad4d156332822dc93142cf1a122b1841ec8de34b5  <FileHash-SHA256>  Indicator
811157f9c7003ba8d17b45eb3cf09bef2cecd2701cedb675274949296a6a183d  <FileHash-SHA256>  Indicator
b75f163ca9b9240bf4b37ad92bc7556b40a17e27c2b8ed5c8991385fe07d17d0  <FileHash-SHA256>  Indicator
CVE-2021-26858                                                    <CVE>              Indicator
CVE-2021-26855                      

In [73]:
attribs = Label.get_label(feed.get("indicators")[1])
attribs

('501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe',
 '<FileHash-SHA256>',
 'Indicator')

In [35]:
text = """IOCs

SHA1
fadd8d7c13a18c251ded1f645ffea18a37f1c2de

SHA256
501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe

MITRE ATT&CK
Data from Local System – T1005
Modify Registry – T1112
Query Registry – T1012
System Information Discovery – T1082
Data Encrypted for Impact – T1486
File Deletion – T1070.004
Command and Scripting Interpreter: Windows Command Shell – T1059.003
Windows Management Instrumentation – T1047

501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe
"""

In [None]:
# `Group` is only imported in a later cell; import it here so this cell also
# runs when the notebook is executed top to bottom on a fresh kernel.
from pyparsing import Group

# Match one known indicator literally and report it as {type: tokens}.
expr = Keyword(indicators[2].indicator)
parser = (
    Group(expr)
    # setResultsName (not setName, which only affects error messages) is what
    # attaches a key to the tokens, so asDict() below has something to return
    .setResultsName(indicators[2].type)
    .setParseAction(lambda string, loc, tokens: tokens.asDict())
)

---

In [38]:
from pyparsing import Group, OneOrMore, MatchFirst

In [79]:
# `nums` is only imported near the end of the notebook; import it here so the
# demo runs on a fresh kernel instead of relying on leftover kernel state.
from pyparsing import Word, nums

# scanString yields (tokens, start, end); keep just the named tokens as dicts
seq(Word(nums).setResultsName("integer").scanString("123 abc 321")).starmap(
    lambda tokens, start, end: tokens.asDict()
).to_list()

[{'integer': '123'}, {'integer': '321'}]

In [59]:
# One literal keyword matcher per known indicator (records 1 and 2) ...
expr1, expr2 = (Keyword(indicators[position].indicator) for position in (1, 2))

# ... each grouped and labelled with the indicator's type
parser1 = Group(expr1).setResultsName(indicators[1].type)
parser2 = Group(expr2).setResultsName(indicators[2].type)

In [60]:
# MatchFirst tries its alternatives in order and returns the first that matches.
p = MatchFirst([parser1, parser2])
# `parser1 | parser2` builds the same MatchFirst; wrapping it in OneOrMore lets
# a run of consecutive indicators be consumed as a single match.
_p = OneOrMore(parser1 | parser2)

In [61]:
seq(p.scanString(text)).to_list()  # .map(lambda result: (result.get_name(), result[0]))

[(([(['fadd8d7c13a18c251ded1f645ffea18a37f1c2de'], {})], {'FileHash-SHA1': [(['fadd8d7c13a18c251ded1f645ffea18a37f1c2de'], {})]}),
  11,
  51),
 (([(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})], {'FileHash-SHA256': [(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})]}),
  60,
  124),
 (([(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})], {'FileHash-SHA256': [(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})]}),
  427,
  491)]

In [62]:
seq(
    _p.scanString(text)
).to_list()  # .map(lambda result: (result.get_name(), result[0]))

[(([(['fadd8d7c13a18c251ded1f645ffea18a37f1c2de'], {})], {'FileHash-SHA1': [(['fadd8d7c13a18c251ded1f645ffea18a37f1c2de'], {})]}),
  11,
  51),
 (([(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})], {'FileHash-SHA256': [(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})]}),
  60,
  124),
 (([(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})], {'FileHash-SHA256': [(['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})]}),
  427,
  491)]

In [673]:
seq(_p.scanString(text)).starmap(
    lambda tokens, start, end: (tokens.getName(), tokens[0])
).to_list()

[('FileHash-SHA1', (['fadd8d7c13a18c251ded1f645ffea18a37f1c2de'], {})),
 ('FileHash-SHA256',
  (['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {})),
 ('FileHash-SHA256',
  (['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {}))]

---

In [56]:
from pyparsing import CaselessKeyword

In [180]:
# Example: scan `text` for one feed indicator and list each hit with its span.
from pprint import pprint

pprint(feed.get("indicators")[1])

seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    .scanString(text)
).starmap(
    # scanString yields (tokens, start, end) triples
    lambda tokens, start, end: (tokens.asDict(), (start, end))
).to_list()

{'content': '',
 'created': '2021-03-09T15:29:16',
 'description': 'SHA256 of fadd8d7c13a18c251ded1f645ffea18a37f1c2de',
 'expiration': None,
 'id': 2892282606,
 'indicator': '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe',
 'is_active': 1,
 'role': None,
 'title': 'Ransom:Win32/Death.DB!MTB',
 'type': 'FileHash-SHA256'}


[({'FileHash-SHA256': '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'},
  (60, 124))]

In [203]:
# Example: map each match span (start, end) to the name of the matched result.
from pprint import pprint

pprint(feed.get("indicators")[1])

seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    .scanString(text)
).starmap(
    # scanString yields (tokens, start, end) triples
    lambda tokens, start, end: ((start, end), tokens.getName())
).to_dict()

{'content': '',
 'created': '2021-03-09T15:29:16',
 'description': 'SHA256 of fadd8d7c13a18c251ded1f645ffea18a37f1c2de',
 'expiration': None,
 'id': 2892282606,
 'indicator': '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe',
 'is_active': 1,
 'role': None,
 'title': 'Ransom:Win32/Death.DB!MTB',
 'type': 'FileHash-SHA256'}


{(60, 124): 'FileHash-SHA256'}

In [243]:
# Map each match span (start, end) to the full ParseResults of that match.
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    # identity parse action: (input string, location, tokens) -> tokens
    .addParseAction(lambda string, loc, tokens: tokens)
    .scanString(text)
).starmap(
    # scanString yields (tokens, start, end) triples
    lambda results, start, end: ((start, end), results)
).to_dict()

{(60,
  124): (['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {'FileHash-SHA256': ['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe']})}

In [256]:
# sample
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator")).setResultsName(
        feed.get("indicators")[1].get("type")
    )
    # .addParseAction(lambda string, loc, tokens: tokens)
    .scanString(text)
).starmap(lambda results, start, end: (results.getName(), (start, end))).to_dict()

{'FileHash-SHA256': (60, 124)}

In [395]:
# parser
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    .addParseAction(
        lambda string, loc, tokens: (
            tokens.getName(),
            tokens.get(tokens.getName()),
            feed.get("indicators")[1].get("role"),
            # tokens.asDict(),
        )
    )
    .scanString(text)
)  # .to_list()

0,1,2
"[('FileHash-SHA256', '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe', None)]",60,124
"[('FileHash-SHA256', '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe', None)]",427,491


In [410]:
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    .addParseAction(
        lambda string, loc, tokens: (
            tokens.getName(),
            feed.get("indicators")[1].get("role"),
            # tokens.asDict(),
        )
    )
    .scanString(text)
).to_list()

[(([('FileHash-SHA256', None)], {'FileHash-SHA256': ['FileHash-SHA256']}),
  60,
  124),
 (([('FileHash-SHA256', None)], {'FileHash-SHA256': ['FileHash-SHA256']}),
  427,
  491)]

In [591]:
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    .setResultsName(feed.get("indicators")[1].get("type"))
    .setParseAction(
        lambda string, loc, tokens: (
            # tokens
            tokens.getName(),
            # feed.get("indicators")[1].get("role"),
            # tokens.asDict(),
        )
    )
    .scanString(text)
)

0,1,2
"[('FileHash-SHA256',)]",60,124
"[('FileHash-SHA256',)]",427,491


In [435]:
seq(
    CaselessKeyword(feed.get("indicators")[1].get("indicator"))
    # .setResultsName(feed.get("indicators")[1].get("type"))
    .setParseAction(
        lambda string, loc, tokens: (
            feed.get("indicators")[1].get("type"),
            # tokens.getName(),
            feed.get("indicators")[1].get("role"),
            # tokens.asDict(),
        )
    ).scanString(text)
)  # .to_list()

0,1,2
"[('FileHash-SHA256', None)]",60,124
"[('FileHash-SHA256', None)]",427,491


In [595]:
text

'IOCs\n\nSHA1\nfadd8d7c13a18c251ded1f645ffea18a37f1c2de\n\nSHA256\n501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe\n\nMITRE ATT&CK\nData from Local System – T1005\nModify Registry – T1112\nQuery Registry – T1012\nSystem Information Discovery – T1082\nData Encrypted for Impact – T1486\nFile Deletion – T1070.004\nCommand and Scripting Interpreter: Windows Command Shell – T1059.003\nWindows Management Instrumentation – T1047\n\n501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe\n'

In [594]:
text[:60] + "<MASK>" + text[124:]

'IOCs\n\nSHA1\nfadd8d7c13a18c251ded1f645ffea18a37f1c2de\n\nSHA256\n<MASK>\n\nMITRE ATT&CK\nData from Local System – T1005\nModify Registry – T1112\nQuery Registry – T1012\nSystem Information Discovery – T1082\nData Encrypted for Impact – T1486\nFile Deletion – T1070.004\nCommand and Scripting Interpreter: Windows Command Shell – T1059.003\nWindows Management Instrumentation – T1047\n\n501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe\n'

---

In [579]:
# Indicator records: one namedtuple per non-YARA feed indicator.
# The pipeline is left lazy here (no .to_list()).
Indicator = namedtuple("Indicator", ["index", "indicator", "type", "role"])

indicators = (
    seq(feed.get("indicators"))
    .filter(lambda raw: not raw.get("type") == "YARA")  # skip YARA rules
    .enumerate()
    .starmap(
        # (position, dict) -> (position, indicator, type, role)
        lambda position, raw: (position,)
        + itemgetter("indicator", "type", "role")(raw)
    )
    .starmap(
        # an empty-string role is normalised to None
        lambda position, value, kind, role: Indicator(
            position, value, kind, None if role == "" else role
        )
    )
)

In [596]:
%%time
indicators

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


index,indicator,type,role
0,136bd70f7aa98f52861879d7dca03cf2,FileHash-MD5,
1,501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe,FileHash-SHA256,
2,fadd8d7c13a18c251ded1f645ffea18a37f1c2de,FileHash-SHA1,
3,6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad.onion,domain,


In [635]:
from more_itertools import flatten, collapse

In [636]:
def generate_parser(indicator):
    """Build a pyparsing parser that finds one known indicator in text.

    The parser is a ``Keyword`` for ``indicator.indicator``; its parse action
    replaces every match with the pair ``(indicator.indicator, indicator.type)``.

    Earlier experiments, kept for reference:
      * ``Keyword(indicator.indicator).setResultsName(indicator.type)``
      * a parse action returning the whole ``indicator`` record
        (or ``{"group": indicator.index, "role": indicator.role}``)
    """

    def to_value_and_type(tokens):
        # the matched tokens are ignored; the record already holds the value
        return (indicator.indicator, indicator.type)

    keyword_parser = Keyword(indicator.indicator)
    keyword_parser.setParseAction(to_value_and_type)
    return keyword_parser

In [637]:
%%time
seq(generate_parser(indicators.to_list()[1]).scanString(text))

CPU times: user 273 µs, sys: 114 µs, total: 387 µs
Wall time: 314 µs


0,1,2
"[('501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe', 'FileHash-SHA256')]",60,124
"[('501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe', 'FileHash-SHA256')]",427,491


In [460]:
# Build one parser per indicator, scan `text` with each of them, and chain all
# (tokens, start, end) hits into a single sequence.
indicators.map(generate_parser).flat_map(
    lambda indicator_parser: indicator_parser.scanString(text)
)

0,1,2
"[{'group': 1, 'role': None}]",60,124
"[{'group': 1, 'role': None}]",427,491
"[{'group': 2, 'role': None}]",11,51


In [529]:
# No earlier visible cell imports `json`; import it here so the query runs on a
# fresh kernel.
import json

# Find the YARA indicators, across every pulse in the dump, whose rule text
# mentions the given SHA-256.
seq.open("../pulse_20210310.json.gz").map(lambda line: json.loads(line.decode())).map(
    lambda pulse: pulse.get("indicators") or []  # a pulse may have no indicators
).flatten().filter(lambda indicator: indicator.get("type") == "YARA").filter(
    lambda indicator: "406b680edc9a1bb0e2c7c451c56904857848b5f15570401450b73b232ff38928"
    in (indicator.get("content") or "")  # content may be missing / None
)

0,1,2,3,4,5,6,7,8,9
2892396777,8a7252d3fd1b2d8e2b39dc1cedbe1e9a42cc0e07,YARA,2021-03-09T18:29:01,"rule WEBSHELL_ASPX_reGeorgTunnel { meta: author = ""threatintel@volexity.com"" date = ""2021-03-01"" description = ""variation on reGeorgtunnel"" hash = ""406b680edc9a1bb0e2c7c451c56904857848b5f15570401450b73b232ff38928"" reference = ""https://github.com/sensepost/reGeorg/blob/master/tunnel.aspx"" strings: $s1 = ""System.Net.Sockets"" $s2 = ""System.Text.Encoding.Default.GetString(Convert.FromBase64String(StrTr(Request.Headers.Get"" $t1 = "".Split('|')"" $t2 = ""Request.Headers.Get"" $t3 = "".Substring("" $t4 = ""new Socket("" $t5 = ""IPAddress ip;"" condition: all of ($s*) or all of ($t*) }",WEBSHELL_ASPX_reGeorgTunnel,variation on reGeorgtunnel,,1,file_scanning


---

In [62]:
text = """IOCs

SHA1
fadd8d7c13a18c251ded1f645ffea18a37f1c2de

SHA256
501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe

MITRE ATT&CK
Data from Local System – T1005
Modify Registry – T1112
Query Registry – T1012
System Information Discovery – T1082
Data Encrypted for Impact – T1486
File Deletion – T1070.004
Command and Scripting Interpreter: Windows Command Shell – T1059.003
Windows Management Instrumentation – T1047
"""

In [18]:
from magpie.src.entity import (
    attack_technique,
    bitcoin_address,
    cve,
    defender_threat,
    domain,
    email,
    filehash_md5,
    filehash_sha1,
    filehash_sha256,
    filepath,
    hostname,
    ipv4,
    ipv6,
    keyword,
    sslcert_fingerprint,
    uri,
    url,
)

In [19]:
# Combined entity parser. `|` builds a pyparsing MatchFirst, so at every
# position the alternatives are tried top to bottom and the first match wins.
# NOTE(review): the order therefore looks deliberate (e.g. the hash-like
# parsers are listed from longest to shortest value, per the `len` notes) -
# confirm against the magpie entity definitions before reordering.
parser = (
    defender_threat.parser
    | uri.parser
    | url.parser
    | email.parser
    | hostname.parser
    | domain.parser
    | sslcert_fingerprint.parser
    | ipv6.parser
    | ipv4.parser
    | cve.parser
    | attack_technique.parser
    | filepath.parser
    | filehash_sha256.parser  # len = 64
    | filehash_sha1.parser  # len = 40
    | bitcoin_address.parser  # len = 34
    | filehash_md5.parser  # len = 32
    # | keyword.make_parser(self.extracted_keywords)  # generate keywords parser
)

In [23]:
seq(parser.scanString(text)).starmap(
    lambda token, start, end: (
        token.asDict(),
        list(token.items()),
        (start, end),
    )
)

0,1,2
{'FileHash-SHA1': 'fadd8d7c13a18c251ded1f645ffea18a37f1c2de'},"[('FileHash-SHA1', 'fadd8d7c13a18c251ded1f645ffea18a37f1c2de')]","(11, 51)"
{'FileHash-SHA256': '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'},"[('FileHash-SHA256', '501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe')]","(60, 124)"
{'AttackTechnique': 'T1005'},"[('AttackTechnique', 'T1005')]","(164, 169)"
{'AttackTechnique': 'T1112'},"[('AttackTechnique', 'T1112')]","(188, 193)"
{'AttackTechnique': 'T1012'},"[('AttackTechnique', 'T1012')]","(211, 216)"
{'AttackTechnique': 'T1082'},"[('AttackTechnique', 'T1082')]","(248, 253)"
{'AttackTechnique': 'T1486'},"[('AttackTechnique', 'T1486')]","(282, 287)"
{'AttackTechnique': 'T1070.004'},"[('AttackTechnique', 'T1070.004')]","(304, 313)"
{'AttackTechnique': 'T1059.003'},"[('AttackTechnique', 'T1059.003')]","(373, 382)"
{'AttackTechnique': 'T1047'},"[('AttackTechnique', 'T1047')]","(420, 425)"


In [150]:
seq(parser.scanString(text)).starmap(
    lambda token, start, end: (
        token,
        (start, end)
        # token.getName(),
        # token.get(token.getName()),
    )
).starmap(lambda token, span: (dir(token), span))

0,1
"['FileHash-SHA1', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(11, 51)"
"['FileHash-SHA256', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(60, 124)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(164, 169)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(188, 193)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(211, 216)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(248, 253)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(282, 287)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(304, 313)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(373, 382)"
"['AttackTechnique', '_ParseResults__lookup', '__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_asStringList', '_iteritems', '_iterkeys', '_itervalues', 'append', 'asDict', 'asList', 'asXML', 'clear', 'copy', 'dump', 'extend', 'from_dict', 'get', 'getName', 'haskeys', 'insert', 'items', 'keys', 'pop', 'pprint', 'values']","(420, 425)"


In [127]:
from pyparsing import Combine, Optional, Literal, Word, nums

# Grammar for MITRE ATT&CK technique IDs as they appear in the article text,
# e.g. "T1005" or, with a sub-technique suffix, "T1070.004".
technique_id = Literal("T1") + Word(nums, exact=3)
sub_technique_suffix = Optional(".0" + Word(nums, exact=2))

# Combine merges the matched pieces into one token named "AttackTechnique".
expr = Combine(technique_id + sub_technique_suffix).setResultsName("AttackTechnique")

In [129]:
# Scan `text` for technique IDs and list every (tokens, start, end) match.
# `addParseAction` modifies the expression it is called on, so the action is
# attached to a copy: otherwise each re-run of this cell would stack one more
# action onto the shared `expr` that other cells rely on.
# NOTE: the action returns `s`, the whole input string, so every matched token
# is replaced by the full text (visible in the output below).
seq(expr.copy().addParseAction(lambda s, l, t: s).scanString(text)).to_list()

[((['IOCs\n\nSHA1\nfadd8d7c13a18c251ded1f645ffea18a37f1c2de\n\nSHA256\n501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe\n\nMITRE ATT&CK\nData from Local System – T1005\nModify Registry – T1112\nQuery Registry – T1012\nSystem Information Discovery – T1082\nData Encrypted for Impact – T1486\nFile Deletion – T1070.004\nCommand and Scripting Interpreter: Windows Command Shell – T1059.003\nWindows Management Instrumentation – T1047\n'], {'AttackTechnique': ['IOCs\n\nSHA1\nfadd8d7c13a18c251ded1f645ffea18a37f1c2de\n\nSHA256\n501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe\n\nMITRE ATT&CK\nData from Local System – T1005\nModify Registry – T1112\nQuery Registry – T1012\nSystem Information Discovery – T1082\nData Encrypted for Impact – T1486\nFile Deletion – T1070.004\nCommand and Scripting Interpreter: Windows Command Shell – T1059.003\nWindows Management Instrumentation – T1047\n']}),
  164,
  169),
 ((['IOCs\n\nSHA1\nfadd8d7c13a18c251ded1f645ffea18a37f1c2de\n\

In [105]:
from pyparsing import Keyword, CaselessKeyword

In [106]:
# Case-insensitive keyword parser for one known indicator from the feed;
# matches are labelled with that indicator's type.
target_indicator = feed_box.indicators[1]
parser = CaselessKeyword(target_indicator.indicator).setResultsName(
    target_indicator.type
)

In [107]:
# Materialise the scan so the (tokens, start, end) matches are displayed.
list(
    parser.scanString(
        "FileHash-SHA256: 501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe"
    )
)

[((['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {'FileHash-SHA256': ['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe']}),
  17,
  81)]

In [80]:
from magpie.src.entity import filehash_sha256

In [83]:
# scanString is lazy: on its own this only displays a generator object
# (see the output below); wrap it in list() to see the matches.
filehash_sha256.expr.scanString(
    "FileHash-SHA256: 501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe"
)

<generator object ParserElement.scanString at 0x1141c1bf8>

In [84]:
# Scan with the bare `expr`: the hash is found, but the match carries no
# results name (the name mapping in the output below is empty).
list(
    filehash_sha256.expr.scanString(
        "FileHash-SHA256: 501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe"
    )
)

[((['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {}),
  17,
  81)]

In [86]:
# Same scan with `parser`: the match is now labelled 'FileHash-SHA256'
# (see the output below).
list(
    filehash_sha256.parser.scanString(
        "FileHash-SHA256: 501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe"
    )
)

[((['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe'], {'FileHash-SHA256': ['501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe']}),
  17,
  81)]

---

In [66]:
# Field names of the feed record, last one first.
seq(reversed(feed_box))

['more_indicators', 'extract_source', 'industries', 'references', 'attack_ids', 'malware_families', 'targeted_countries', 'tags', 'indicators', 'adversary', 'public', 'tlp', 'revision', 'created', 'modified', 'author_name', 'description', 'name', 'id']

In [69]:
# Display the whole feed record (the repr is very long).
feed_box

<Box: {'id': '6047944bd5bd6b6f323e59fa', 'name': 'HelloKitty Ransomware Lacks Stealth, But Still Strikes Home', 'description': 'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, Sentinel Labs analyzes a recent HelloKitty sample and outlines the basic behaviors and traits associated with this family of ransomware.', 'author_name': 'AlienVault', 'modified': '2021-03-09T15:29:14.999000', 'created': '2021-03-09T15:29:14.999000', 'revision': 1, 'tlp': 'white', 'public': 1, 'adversary': '', 'indicators': [{'id': 2892282605, 'indicator': '136bd70f7aa98f52861879d7dca03cf2', 'type': 'FileHash-MD5', 'created': '2021-03-09T15:29:16', 'content': '', 'title': 'Ransom:Win32/Death.DB!MTB', 'description': 'MD5 of fadd8d7c13a18c251ded1f645ffea18a37f1c2de', 'expiration': None, 'is_active': 1, 'role': None}, {

In [58]:
# Tabulate the feed record's (field, value) pairs; 20 is the row limit passed to show().
seq(feed_box.items()).show(20)

------------------  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

---

In [34]:
# Pair each element's text and its tail with the element itself, then flatten
# so every string gets its own row.
# NOTE(review): `a` is presumably an Analyzer instance created in a cell that is
# not part of this section -- confirm it exists on a fresh kernel.
a.article_contents.map(
    lambda element: [(element.text, element), (element.tail, element)]
).flatten()

0,1
Game studio CD Projekt Red recently,<Element p at 0x10ef91f88>
,<Element p at 0x10ef91f88>
disclosed,<Element a at 0x10ef91608>
"that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.",<Element a at 0x10ef91608>
"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.",<Element p at 0x10ef91808>
,<Element p at 0x10ef91808>
Execution and Behavior,<Element h2 at 0x10ef91948>
,<Element h2 at 0x10ef91948>
"The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.",<Element p at 0x10ef918c8>
,<Element p at 0x10ef918c8>


In [6]:
class ArticleAnalyzer(object):
    """Locate the printable parts of an article extracted from an HTML page.

    The page is wrapped in a readability ``Document`` created with
    ``xpath=True``. Every element of the cleaned summary that carries visible
    text is reported as an ``ArticlePart`` holding the value of the element's
    ``x`` attribute (an xpath string, see the outputs below) and its tag name.
    """

    ArticlePart = namedtuple("ArticlePart", ["xpath", "tag"])

    def __init__(self, document):
        """Wrap ``document`` (the raw HTML text) in a readability ``Document``."""
        self.document = Document(document, xpath=True)

    @staticmethod
    def has_printable_content(element):
        """Return True when ``element.text`` or ``element.tail`` holds visible text.

        ``None``, the empty string and whitespace-only strings (newlines, tabs,
        spaces) all count as "nothing to print".
        """

        def is_blank(content):
            # "".isspace() is False, so the empty string needs its own check;
            # ``not content`` covers both None and "".
            return not content or content.isspace()

        return not (is_blank(element.text) and is_blank(element.tail))

    @staticmethod
    def has_printable_tag(element):
        """Return False for script (JavaScript) and style (CSS) elements."""
        return element.tag not in ("script", "style")

    @property
    def article_parts(self):
        """List an ``ArticlePart`` for every printable element of the cleaned article."""
        cleaned_text = self.document.summary()
        all_elements = self.get_elements(cleaned_text, "//*")
        parts = (
            seq(all_elements)
            .filter(self.has_printable_content)
            .filter(self.has_printable_tag)
            .map(lambda element: self.ArticlePart(element.get("x"), element.tag))
        )
        return parts.to_list()

    @staticmethod
    def get_elements(text, xpath):
        """Parse ``text`` as HTML and return the result of evaluating ``xpath`` on it."""
        root = etree.HTML(text)
        if root is None:
            # NOTE(review): guards against the parser handing back no root for
            # input without any markup -- confirm for the installed lxml.
            return []
        return root.xpath(xpath)

    @property
    def utils(self):
        """Cheat sheet of the wrapped ``Document`` API; does nothing and returns None."""
        # self.document.title(): original title
        # self.document.short_title(): shorter title
        # self.document.input: original input
        # self.document.content(): document body
        # self.document.summary(): cleaned content
        return None

In [7]:
# Analyse the raw HTML that was read into `data` at the top of the notebook.
aa = ArticleAnalyzer(data)

In [8]:
# Peek at the first three printable parts (xpath + tag).
aa.article_parts[:3]

[ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]', tag='p'),
 ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a', tag='a'),
 ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]', tag='p')]

In [232]:
class ContentExtactor(ArticleAnalyzer):
    """Resolve each printable article part back to its element in the raw page."""

    # (prop, text) pair, where prop names the element property the text came from.
    Content = namedtuple("Content", ["prop", "text"])

    def __init__(self, document):
        super().__init__(document)

    @property
    def part_contents(self):
        """Return a sequence of raw-document elements, in article-part order.

        Each part's ``xpath`` is evaluated against the original, uncleaned input
        (``self.document.input``), and the per-part results are flattened into a
        single sequence.
        """
        # Parse the raw page once and reuse the tree for every part; parsing it
        # again for each xpath repeats the same work once per article part.
        raw_root = etree.HTML(self.document.input)
        elements = (
            seq(self.article_parts)
            .map(lambda part: raw_root.xpath(part.xpath))
            .flatten()
        )
        return elements

In [233]:
# Build the extractor over the same raw HTML.
ce = ContentExtactor(data)

In [235]:
# Materialise every resolved element (long output; slice it when only a preview is needed).
ce.part_contents.to_list()

[<Element p at 0x111705f48>,
 <Element a at 0x111705648>,
 <Element p at 0x111803ac8>,
 <Element h2 at 0x111803188>,
 <Element p at 0x111803308>,
 <Element p at 0x1116e7f48>,
 <Element p at 0x1116e7488>,
 <Element code at 0x1116e2f08>,
 <Element code at 0x1116e2108>,
 <Element p at 0x1116e2288>,
 <Element p at 0x111754dc8>,
 <Element pre at 0x1117d3308>,
 <Element p at 0x1117d3708>,
 <Element code at 0x1117d3788>,
 <Element code at 0x1117d38c8>,
 <Element p at 0x1117d3088>,
 <Element p at 0x11180bb88>,
 <Element pre at 0x11180bd48>,
 <Element h2 at 0x11180b9c8>,
 <Element p at 0x11180b7c8>,
 <Element p at 0x11180bd08>,
 <Element code at 0x11180bdc8>,
 <Element p at 0x11180bac8>,
 <Element p at 0x11180bf48>,
 <Element code at 0x11180bf08>,
 <Element h2 at 0x11180bf88>,
 <Element p at 0x11180f3c8>,
 <Element p at 0x11180f2c8>,
 <Element p at 0x11180f048>,
 <Element h2 at 0x11180f088>,
 <Element strong at 0x11180f0c8>,
 <Element br at 0x11180f108>,
 <Element strong at 0x11180f188>,
 <Elem

In [231]:
seq(ce.part_contents).flatten().map(lambda 

prop,text
text,Game studio CD Projekt Red recently
tail,
text,disclosed
tail,"that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”."
text,"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware."
tail,
text,Execution and Behavior
tail,
text,"The “HelloKitty” name is based on internal mutex names, which are apparent upon execution."
tail,


In [None]:
seq(ce.article_parts).map(lambda part: product([part], 

In [228]:
# Pair every article part with each of its contents, one row per pair.
# NOTE(review): the output below shows Content(prop, text) tuples, i.e. it was
# produced when `part_contents` yielded lists of Content; the class as defined
# in this notebook returns elements instead -- confirm before re-running.
# NOTE(review): `product` is presumably itertools.product; its import is not
# visible in this section.
seq(ce.article_parts).zip(ce.part_contents).starmap(
    lambda part, contents: product([part], contents)
).flatten()

0,1
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]', tag='p')","Content(prop='text', text='Game studio CD Projekt Red recently ')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]', tag='p')","Content(prop='tail', text='\n')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a', tag='a')","Content(prop='text', text='disclosed')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a', tag='a')","Content(prop='tail', text=' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]', tag='p')","Content(prop='text', text='HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]', tag='p')","Content(prop='tail', text='\n')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1]', tag='h2')","Content(prop='text', text='Execution and Behavior')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1]', tag='h2')","Content(prop='tail', text='\n')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4]', tag='p')","Content(prop='text', text='The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.')"
"ArticlePart(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4]', tag='p')","Content(prop='tail', text='\n')"


---

In [22]:
class DocumentAnalyzer(object):
    """Extract the printable text of an article together with where it came from.

    The page is wrapped in a readability ``Document`` created with
    ``xpath=True``. ``contents`` lists, for every printable element of the
    cleaned summary, one ``Content`` row per non-None ``text`` / ``tail`` string.
    """

    Content = namedtuple("Content", ["xpath", "tag", "prop", "text"])

    def __init__(self, document):
        """Wrap ``document`` (the raw HTML text) in a readability ``Document``."""
        self.document = Document(document, xpath=True)

    @staticmethod
    def has_printable_content(element):
        """Return True when ``element.text`` or ``element.tail`` holds visible text.

        ``None``, the empty string and whitespace-only strings (newlines, tabs,
        spaces) all count as "nothing to print".
        """

        def is_blank(content):
            # "".isspace() is False, so the empty string needs its own check;
            # ``not content`` covers both None and "".
            return not content or content.isspace()

        return not (is_blank(element.text) and is_blank(element.tail))

    @staticmethod
    def has_printable_tag(element):
        """Return False for script (JavaScript) and style (CSS) elements."""
        return element.tag not in ("script", "style")

    def get_printable_elements(self):
        """Return the elements of the cleaned article that carry visible text."""
        cleaned_document = self.document.summary()
        root = etree.HTML(cleaned_document)
        elements = root.xpath("//*")
        printable_elements = (
            seq(elements)
            .filter(self.has_printable_content)
            .filter(self.has_printable_tag)
        )
        return printable_elements.to_list()

    @staticmethod
    def query_elements(xpath):
        """Placeholder for an xpath query helper; not implemented, returns None.

        Declared as a staticmethod because the signature takes no ``self``:
        without the decorator, calling it on an instance raised TypeError.
        """
        return None

    @property
    def utils(self):
        """Cheat sheet of the wrapped ``Document`` API; does nothing and returns None."""
        # self.document.title(): original title
        # self.document.short_title(): shorter title
        # self.document.input: original input
        # self.document.content(): document body
        # self.document.summary(): cleaned content
        return None

    @property
    def contents(self):
        """List ``Content(xpath, tag, prop, text)`` rows for the printable elements.

        Each element contributes up to two rows, one for its ``text`` and one
        for its ``tail`` (``prop`` says which). Rows whose string is ``None``
        are dropped; whitespace-only strings are kept.
        """
        printable_elements = self.get_printable_elements()
        contents = (
            seq(printable_elements)
            .map(
                lambda element: [
                    (element, "text", element.text),
                    (element, "tail", element.tail),
                ]
            )
            .flatten()
            .starmap(
                lambda element, prop, text: self.Content(
                    # the "x" attribute holds the xpath string shown in the outputs
                    element.attrib.get("x"),
                    element.tag,
                    prop,
                    text,
                )
            )
            .filter(lambda content: content.text is not None)
        )

        return contents.to_list()

In [23]:
# Analyse the raw HTML that was read into `data` at the top of the notebook.
da = DocumentAnalyzer(data)

In [87]:
# Show the content rows as a table (columns: xpath, tag, prop, text).
seq(da.contents)

xpath,tag,prop,text
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1],p,text,Game studio CD Projekt Red recently
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1],p,tail,
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a,a,text,disclosed
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a,a,tail,"that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”."
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3],p,text,"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware."
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3],p,tail,
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1],h2,text,Execution and Behavior
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1],h2,tail,
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4],p,text,"The “HelloKitty” name is based on internal mutex names, which are apparent upon execution."
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4],p,tail,


---

In [503]:
from collections import namedtuple, deque
from more_itertools import pairwise, collapse

from magpie.src.entity import (
    attack_technique,
    bitcoin_address,
    cve,
    defender_threat,
    domain,
    email,
    filehash_md5,
    filehash_sha1,
    filehash_sha256,
    filepath,
    hostname,
    ipv4,
    ipv6,
    keyword,
    sslcert_fingerprint,
    uri,
    url,
)

In [504]:
# Combined entity grammar: one alternative per entity module, joined with `|`.
# NOTE(review): `|` presumably builds a first-match alternation, so the order
# below would matter (the hash parsers are listed longest-first, per the
# length comments) -- confirm before reordering.
_parser = (
    defender_threat.parser
    | uri.parser
    | url.parser
    | email.parser
    | hostname.parser
    | domain.parser
    | sslcert_fingerprint.parser
    | ipv6.parser
    | ipv4.parser
    | cve.parser
    | attack_technique.parser
    | filepath.parser
    | filehash_sha256.parser  # len = 64
    | filehash_sha1.parser  # len = 40
    | bitcoin_address.parser  # len = 34
    | filehash_md5.parser  # len = 32
    # | keyword.make_parser(self.extracted_keywords)  # generate keywords parser
)

In [505]:
# Scratch check: slice characters 109..115 out of a pasted text sample to see
# which substring that span covers (result shown below).
'\ndsa*\nNtrtsca\nds_moni\nNotifie\nTmListe\niVPAgen\nCNTAoSM\nIBM*\nbes10*\nblack*\nrobo*\ncopy*\nstore.e\nsql*\nvee*\nwrsa*\nwrsa.ex\npostg*\nsage*\nMSSQLServerADHelper100\nMSSQL$ISARS\nMSSQL$MSFW\nSQLAgent$ISARS\nSQLAgent$MSFW\nSQLBrowser\nReportServer$ISARS\nSQLWriter\nWinDefend\nmr2kserv\nMSExchangeADTopology\nMSExchangeFBA\nMSExchangeIS\nMSExchangeSA\nShadowProtectSvc\nSPAdminV4\nSPTimerV4\nSPTraceV4\nSPUserCodeV4\nSPWriterV4\nSPSearch4\nIISADMIN\nfirebirdguardiandefaultinstance\nibmiasrw\nQBCFMonitorService\nQBVSS\nQBPOSDBServiceV12\n"IBM Domino Server(CProgramFilesIBMDominodata)"\n"IBM Domino Diagnostics(CProgramFilesIBMDomino)"\n"Simply Accounting Database Connection Manager"\nQuickBooksDB1\nQuickBooksDB2\nQuickBooksDB3\nQuickBooksDB4\nQuickBooksDB5\nQuickBooksDB6\nQuickBooksDB7\nQuickBooksDB8\nQuickBooksDB9\nQuickBooksDB10\nQuickBooksDB11\nQuickBooksDB12\nQuickBooksDB13\nQuickBooksDB14\nQuickBooksDB15\nQuickBooksDB16\nQuickBooksDB17\nQuickBooksDB18\nQuickBooksDB19\nQuickBooksDB20\nQuickBooksDB21\nQuickBooksDB22\nQuickBooksDB23\nQuickBooksDB24\nQuickBooksDB25\n'[
    109:116
]

'wrsa.ex'

In [506]:
%%time
# For every content row keep its tag and text, plus every entity match found
# in that text (an empty list when nothing matches).
seq(da.contents).starmap(
    lambda _xpath, tag, _prop, text: (
        tag,
        text,
        list(_parser.scanString(text)),
    )
)

CPU times: user 56.9 ms, sys: 2.32 ms, total: 59.2 ms
Wall time: 80.3 ms


0,1,2
p,Game studio CD Projekt Red recently,[]
a,disclosed,[]
a,"that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.",[]
p,"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.",[]
h2,Execution and Behavior,[]
p,"The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.",[]
p,"While still somewhat unclear, current intelligence indicates that the primary delivery method of HelloKitty binaries is via phish email or via secondary infection in conjunction with other malware.",[]
p,"Once launched, HelloKitty will attempt to disable and terminate a number of processes and services so as to reduce interference with the encryption process. This includes processes and services associated with IIS, MSSQL, Quickbooks, Sharepoint, and more. These actions are carried out via",[]
code,taskkill.exe,"[((['taskkill.exe'], {'FilePath': ['taskkill.exe']}), 0, 12)]"
code,and,[]


In [508]:
%%time
# Same scan as above, keeping only the rows where at least one entity matched.
seq(da.contents).starmap(
    lambda _xpath, tag, _prop, text: (
        tag,
        text,
        list(_parser.scanString(text)),
    )
).filter(lambda row: row[2])

CPU times: user 60.9 ms, sys: 2.54 ms, total: 63.4 ms
Wall time: 129 ms


0,1,2
code,taskkill.exe,"[((['taskkill.exe'], {'FilePath': ['taskkill.exe']}), 0, 12)]"
code,net.exe,"[((['net.exe'], {'FilePath': ['net.exe']}), 0, 7)]"
code,"taskkill.exe /f /PID ""8512""","[((['taskkill.exe'], {'FilePath': ['taskkill.exe']}), 0, 12)]"
code,"taskkill.exe /f /PID ""8656""","[((['taskkill.exe'], {'FilePath': ['taskkill.exe']}), 0, 12)]"
code,6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad.onion,"[((['6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad.onion'], {'domain': ['6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad.onion']}), 0, 62)]"
a,T1005,"[((['T1005'], {'AttackTechnique': ['T1005']}), 0, 5)]"
a,T1112,"[((['T1112'], {'AttackTechnique': ['T1112']}), 0, 5)]"
a,T1012,"[((['T1012'], {'AttackTechnique': ['T1012']}), 0, 5)]"
a,T1082,"[((['T1082'], {'AttackTechnique': ['T1082']}), 0, 5)]"
a,T1486,"[((['T1486'], {'AttackTechnique': ['T1486']}), 0, 5)]"


In [478]:
class EntityParser(object):
    """Split a piece of text into consecutive entity and non-entity segments.

    ``iterscan`` runs the combined entity grammar over the text and yields
    ``Text(entity, string, span)`` tuples that together cover the whole text:
    matched entities carry their results name in ``entity``, the stretches in
    between carry ``None``.
    """

    Text = namedtuple("Text", ["entity", "string", "span"])

    def __init__(self):
        # self.extracted_keywords = extracted_keywords
        self.parser = self.construct_parser()

    def construct_parser(self):
        """Build the combined grammar: one alternative per entity module.

        NOTE(review): `|` presumably builds a first-match alternation, so the
        order would matter (the hash parsers are listed longest-first, per the
        length comments) -- confirm before reordering.
        """
        parser = (
            defender_threat.parser
            | uri.parser
            | url.parser
            | email.parser
            | hostname.parser
            | domain.parser
            | sslcert_fingerprint.parser
            | ipv6.parser
            | ipv4.parser
            | cve.parser
            | attack_technique.parser
            | filepath.parser
            | filehash_sha256.parser  # len = 64
            | filehash_sha1.parser  # len = 40
            | bitcoin_address.parser  # len = 34
            | filehash_md5.parser  # len = 32
            # | keyword.make_parser(self.extracted_keywords)  # generate keywords parser
        )
        return parser

    def iterscan(self, content):
        """Yield ``Text`` segments covering the whole text, in order.

        Args:
            content: either an object with a ``text`` attribute (e.g. a
                ``Content`` row) or the text itself as a plain string.

        Yields:
            ``Text(entity, string, span)`` where ``span`` is ``(start, end)`` in
            the scanned text. For a match, ``entity`` is the token's results
            name and ``string`` the value stored under that name; for the text
            between matches, ``entity`` is ``None`` and ``string`` is the slice.
            An empty text yields nothing.
        """
        # Accept both call styles used in this notebook: a Content row or a str.
        text = getattr(content, "text", content)

        # scanString yields (tokens, start, end); index the tokens by their span.
        matches = {
            (start, end): token for token, start, end in self.parser.scanString(text)
        }

        # Every match boundary, plus the two ends of the text, cuts the text
        # into consecutive segments.
        boundaries = {0, len(text)}
        for start, end in matches:
            boundaries.update((start, end))
        ordered = sorted(boundaries)

        for span in zip(ordered, ordered[1:]):
            token = matches.get(span)
            if token is not None:
                entity = token.getName()
                yield self.Text(entity, token.get(entity), span)
            else:
                start, end = span
                yield self.Text(None, text[start:end], span)

In [479]:
# NOTE: rebinds `parser`, which an earlier cell used for a pyparsing expression.
parser = EntityParser()

In [480]:
%%time
# One row per (content, segment) pair: every content row is combined with each
# segment that iterscan yields for it.
# NOTE(review): `product` is presumably itertools.product; its import is not
# visible in this section.
seq(da.contents).map(
    lambda content: product([content], parser.iterscan(content))
).flatten()

CPU times: user 58.8 ms, sys: 4.49 ms, total: 63.3 ms
Wall time: 106 ms


0,1
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]', tag='p', prop='text', text='Game studio CD Projekt Red recently ')","Text(entity=None, string='Game studio CD Projekt Red recently ', span=(0, 36))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]', tag='p', prop='tail', text='\n')","Text(entity=None, string='\n', span=(0, 1))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a', tag='a', prop='text', text='disclosed')","Text(entity=None, string='disclosed', span=(0, 9))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a', tag='a', prop='tail', text=' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.')","Text(entity=None, string=' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.', span=(0, 197))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]', tag='p', prop='text', text='HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.')","Text(entity=None, string='HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.', span=(0, 371))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]', tag='p', prop='tail', text='\n')","Text(entity=None, string='\n', span=(0, 1))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1]', tag='h2', prop='text', text='Execution and Behavior')","Text(entity=None, string='Execution and Behavior', span=(0, 22))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1]', tag='h2', prop='tail', text='\n')","Text(entity=None, string='\n', span=(0, 1))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4]', tag='p', prop='text', text='The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.')","Text(entity=None, string='The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.', span=(0, 90))"
"Content(xpath='/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4]', tag='p', prop='tail', text='\n')","Text(entity=None, string='\n', span=(0, 1))"


In [472]:
%%time
# One flat tuple per (content, segment) pair.
# `iterscan` is given the Content row itself: the EntityParser defined in this
# notebook reads `content.text`, so passing the bare string failed with
# AttributeError.
# NOTE(review): `product` is presumably itertools.product; its import is not
# visible in this section.
seq(da.contents).map(
    lambda content: product([content], parser.iterscan(content))
).flatten().starmap(
    lambda content, text: (
        content.xpath[-10:],  # last 10 characters of the xpath, to keep rows short
        content.tag,
        content.prop,
        text.entity,
        text.string,
        text.span,
    )
).to_list()

CPU times: user 2.7 s, sys: 16.6 ms, total: 2.72 s
Wall time: 3 s


[('ction/p[1]',
  'p',
  'text',
  None,
  'Game studio CD Projekt Red recently ',
  (0, 36)),
 ('ction/p[1]', 'p', 'tail', None, '\n', (0, 1)),
 ('ion/p[1]/a', 'a', 'text', None, 'disclosed', (0, 9)),
 ('ion/p[1]/a',
  'a',
  'tail',
  None,
  ' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.',
  (0, 197)),
 ('ction/p[3]',
  'p',
  'text',
  None,
  'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.',
  (0, 371)),
 ('ction/p[3]', 'p', 'tail', None, '\n', (0, 1)),
 ('tion/h2[1]', 'h2', 'text', None, 'Execution and Behavior'

In [449]:
# Append the list of scanned segments to every content row
# (columns: xpath, tag, prop, text, segments).
# `iterscan` is given the Content row itself: the EntityParser defined in this
# notebook reads `content.text`, so passing the bare string failed with
# AttributeError.
seq(da.contents).map(
    lambda content: (*content, list(parser.iterscan(content)))
)

0,1,2,3,4
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1],p,text,Game studio CD Projekt Red recently,"[Text(entity=None, string='Game studio CD Projekt Red recently ', span=(0, 36))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1],p,tail,,"[Text(entity=None, string='\n', span=(0, 1))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a,a,text,disclosed,"[Text(entity=None, string='disclosed', span=(0, 9))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a,a,tail,"that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.","[Text(entity=None, string=' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.', span=(0, 197))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3],p,text,"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.","[Text(entity=None, string='HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.', span=(0, 371))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3],p,tail,,"[Text(entity=None, string='\n', span=(0, 1))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1],h2,text,Execution and Behavior,"[Text(entity=None, string='Execution and Behavior', span=(0, 22))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/h2[1],h2,tail,,"[Text(entity=None, string='\n', span=(0, 1))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4],p,text,"The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.","[Text(entity=None, string='The “HelloKitty” name is based on internal mutex names, which are apparent upon execution.', span=(0, 90))]"
/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[4],p,tail,,"[Text(entity=None, string='\n', span=(0, 1))]"


In [424]:
# Look one element up in the raw (uncleaned) page by its xpath and show its
# text and tail.
[
    (element.text, element.tail)
    for element in etree.HTML(da.document.input).xpath(
        "/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[12]/code[2]"
    )
]

[('taskkill.exe /f /PID "8656"', None)]

In [416]:
# Dump every content row (long output; slice it when only a preview is needed).
da.contents

[('/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]',
  'p',
  'text',
  'Game studio CD Projekt Red recently '),
 ('/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]',
  'p',
  'tail',
  '\n'),
 ('/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a',
  'a',
  'text',
  'disclosed'),
 ('/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[1]/a',
  'a',
  'tail',
  ' that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.'),
 ('/html/body/div[1]/div/div[2]/div/div/div/div[1]/article/div/section/section/p[3]',
  'p',
  'text',
  'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable

---

In [25]:
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [26]:
# Name of the pretrained checkpoint whose tokenizer is loaded below. The
# tokenized output later in the notebook is all lower-case ('hello', 'ransom'),
# consistent with the "uncased" checkpoint name.
model_checkpoint = "distilbert-base-uncased"
# Only the tokenizer is loaded: the warning printed by the import cell reports
# that no PyTorch/TensorFlow/Flax backend was found, so models are unavailable.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [27]:
# Split the article title into subword tokens. In the output, a leading "##"
# marks a piece that continues the previous token ('hello', '##kit', '##ty').
tokenizer.tokenize("HelloKitty Ransomware Lacks Stealth, But Still Strikes Home")

['hello',
 '##kit',
 '##ty',
 'ransom',
 '##ware',
 'lacks',
 'stealth',
 ',',
 'but',
 'still',
 'strikes',
 'home']

In [30]:
# Count the subword tokens only (12 here). `tokenize` adds no special tokens,
# which is why calling the tokenizer directly yields 14 input ids instead.
len(tokenizer.tokenize("HelloKitty Ransomware Lacks Stealth, But Still Strikes Home"))

12

In [29]:
# Encode the title. The result carries 'input_ids' and 'attention_mask'; the
# ids include two extra entries (101 and 102) around the 12 subword pieces,
# which decode to '[CLS]' and '[SEP]' in the following cell.
tokenizer("HelloKitty Ransomware Lacks Stealth, But Still Strikes Home")

{'input_ids': [101, 7592, 23615, 3723, 16540, 8059, 14087, 22150, 1010, 2021, 2145, 9326, 2188, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [32]:
# Map the encoded ids back to tokens to reveal the special tokens that the
# tokenizer call wraps around the subword pieces ('[CLS]' ... '[SEP]').
# The ids are derived from the tokenizer instead of being pasted from an
# earlier cell's output, so this cell cannot drift out of sync with the text
# or the checkpoint.
title_encoding = tokenizer(
    "HelloKitty Ransomware Lacks Stealth, But Still Strikes Home"
)
tokenizer.convert_ids_to_tokens(title_encoding["input_ids"])

['[CLS]',
 'hello',
 '##kit',
 '##ty',
 'ransom',
 '##ware',
 'lacks',
 'stealth',
 ',',
 'but',
 'still',
 'strikes',
 'home',
 '[SEP]']

In [31]:
# Exploratory introspection: list every attribute name on the tokenizer object
# (the output is long and includes private/dunder names).
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_batch_encode_plus',
 '_bos_token',
 '_cls_token',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_or_get_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_from_pretrained',
 '_get_padding_truncation_strategies',
 '_get_repo_url_from_name',
 '_mask_token',
 '_pad',
 '_pad_token',
 '_pad_token_type_id',
 '_push_to_hub',
 '_save_pretrained',
 '_sep_token',
 '_tokenizer',
 '_un