In [1]:
%load_ext lab_black
# %load_ext autotime

In [2]:
import sys

sys.path.append("../")

In [3]:
# from magpie.src.label import PulseLabel
from functional import seq

In [4]:
# Read the feed JSON file with PyFunctional's `seq.json` and convert the
# resulting sequence into a plain dict for the labeler classes defined below.
feed = seq.json("../data/hellokitty.feeds.json").to_dict()

In [5]:
feed

{'id': '6047944bd5bd6b6f323e59fa',
 'name': 'HelloKitty Ransomware Lacks Stealth, But Still Strikes Home',
 'description': 'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, Sentinel Labs analyzes a recent HelloKitty sample and outlines the basic behaviors and traits associated with this family of ransomware.',
 'author_name': 'AlienVault',
 'modified': '2021-03-09T15:29:14.999000',
 'created': '2021-03-09T15:29:14.999000',
 'revision': 1,
 'tlp': 'white',
 'public': 1,
 'adversary': '',
 'indicators': [{'id': 2892282605,
   'indicator': '136bd70f7aa98f52861879d7dca03cf2',
   'type': 'FileHash-MD5',
   'created': '2021-03-09T15:29:16',
   'content': '',
   'title': 'Ransom:Win32/Death.DB!MTB',
   'description': 'MD5 of fadd8d7c13a18c251ded1f645ffea18a37f1c2de',
   'expiration': None,
   'is_

In [6]:
from itertools import product
from enum import Enum
from collections import ChainMap, namedtuple
from functools import lru_cache
from textdistance import jaro

In [7]:
class PulseLabelGenerator(object):
    """Build ``string -> PulseLabel`` lookups from the pulse-level fields of a feed.

    Every non-empty string found in the configured fields becomes a key whose
    value is ``PulseLabel(category, group, role)``.  Pulse labels always use
    ``group=0`` and ``role=None``.

    Fields that are missing, ``None`` or empty are skipped, so a partial
    pulse dict never produces a ``None`` key and never raises.
    """

    # Fields whose value is a single string.
    SINGULAR_PARTS = ["name", "description", "adversary"]
    # Fields whose value is a list of strings.
    # NOTE: "PLRUAL" is a misspelling of "PLURAL"; the attribute name is kept
    # unchanged because external code may already reference it.
    PLRUAL_PARTS = [
        "tags",
        "targeted_countries",
        "malware_families",
        "attack_ids",
        "industries",
    ]

    # Plural field name -> singular category name stored in the label.
    SUBSTITUTION = {
        "tags": "tag",
        "targeted_countries": "targeted_country",
        "malware_families": "malware_family",
        "attack_ids": "attack_id",
        "industries": "industry",
    }

    PulseLabel = namedtuple("PulseLabel", ["category", "group", "role"])

    def __init__(self, pulse: dict):
        """Store the pulse dict whose fields will be turned into labels."""
        self.pulse = pulse

    @property
    def singular_label(self):
        """Return ``{field value: PulseLabel(field name, 0, None)}``.

        Only the fields listed in ``SINGULAR_PARTS`` are considered; a field
        that is absent, ``None`` or ``""`` contributes nothing.
        """
        labels = {}
        for key in self.SINGULAR_PARTS:
            value = self.pulse.get(key)
            # Truthiness check drops both "" and None (missing field).
            if value:
                labels[value] = self.PulseLabel(key, 0, None)
        return labels

    @property
    def plrual_label(self):
        """Return ``{item: PulseLabel(singular category, 0, None)}``.

        Each item of every list field in ``PLRUAL_PARTS`` becomes a key.  The
        category is the singular form of the field name (see
        ``SUBSTITUTION``).  Absent, ``None`` or empty lists, as well as empty
        items inside a list, are skipped.
        """
        labels = {}
        for key in self.PLRUAL_PARTS:
            category = self.SUBSTITUTION.get(key)
            # `or ()` turns a missing/None field into an empty iteration
            # instead of a TypeError.
            for value in self.pulse.get(key) or ():
                if value:
                    labels[value] = self.PulseLabel(category, 0, None)
        return labels

    def get_label(self):
        """Return a ``ChainMap`` of the singular labels over the plural labels.

        On a key present in both, the singular label wins because it is the
        first mapping of the chain.
        """
        return ChainMap(self.singular_label, self.plrual_label)

In [8]:
class IndicatorLabelGenerator(object):
    """Build ``string -> IndicatorLabel`` lookups from a feed's indicator list.

    Each indicator dict contributes up to three strings: its primary value
    (the ``indicator`` field, or the rule text for YARA rules), its ``title``
    and its ``description``.  The label's ``group`` is the 1-based position
    of the indicator in the list.
    """

    IndicatorLabel = namedtuple("IndicatorLabel", ["category", "group", "role"])

    def __init__(self, indicators):
        """Store the list of indicator dicts (``None`` is treated as empty)."""
        self.indicators = indicators

    @property
    def enumerated_indicators(self):
        """Return an iterator of ``(group, indicator)`` pairs, numbered from 1.

        Numbering starts at 1 so indicator groups stay distinct from the
        pulse-level labels.
        """
        return enumerate(self.indicators or [], 1)

    @staticmethod
    def extract(index, indicator):
        """Return three ``(string, (category, group, role))`` pairs for one indicator.

        Args:
            index: group number to record in every produced label.
            indicator: indicator dict; the keys read are ``content``,
                ``type``, ``indicator``, ``role``, ``title`` and
                ``description`` (all optional).

        Returns:
            A 3-tuple: the primary pair, the title pair and the description
            pair.  For a ``YARA`` indicator whose content starts with
            ``"rule"`` the primary string is the rule text and the role is
            ``"rule"``; otherwise it is the ``indicator`` field with the
            indicator's own ``role``.  Strings may be ``None`` or ``""``;
            filtering is left to the caller.
        """
        indicator_type = indicator.get("type")
        # Missing/None content is normalised so `.startswith` cannot fail.
        content = indicator.get("content") or ""

        # Compare by value (`==`): `is` against a string literal tests object
        # identity and is not reliable for strings parsed at runtime.
        is_yara_rule = (
            indicator_type == "YARA"
            and isinstance(content, str)
            and content.startswith("rule")
        )
        if is_yara_rule:
            primary = (content, (indicator_type, index, "rule"))
        else:
            primary = (
                indicator.get("indicator"),
                (indicator_type, index, indicator.get("role")),
            )

        return (
            primary,
            (indicator.get("title"), ("title", index, None)),
            (indicator.get("description"), ("description", index, None)),
        )

    def get_label(self):
        """Return ``{string: IndicatorLabel}`` for every non-empty extracted string.

        Empty and ``None`` strings are dropped.  When the same string is
        produced more than once, the last occurrence wins.
        """
        return {
            string: self.IndicatorLabel(*extraction)
            for index, indicator in self.enumerated_indicators
            for string, extraction in self.extract(index, indicator)
            if string  # drop "" and None
        }

In [9]:
class FeedLabeler(object):
    """Combine pulse-level and indicator-level labels of one feed and match strings against them.

    ``label`` is a ``ChainMap`` of ``string -> label`` (pulse labels first,
    indicator labels second).  ``match`` returns the label whose key is most
    similar to a query string according to Jaro similarity.
    """

    # Feed keys forwarded to PulseLabelGenerator.
    PULSE = [
        "name",  # pulse.name
        "description",  # pulse.description
        "adversary",  # pulse.adversary
        "tags",  # pulse.tags
        "targeted_countries",  # pulse.targeted_countries
        "malware_families",  # pulse.malware_families
        "attack_ids",  # pulse.attack_ids
        "industries",  # pulse.industries
    ]

    class MatchThreshold(Enum):
        """Minimum normalized similarity a key must reach, per matching method."""

        exactly_match = 1
        fuzzy_match = 0.9

    def __init__(self, feed):
        """Store the feed dict; labels are built lazily on first access."""
        self.feed = feed
        self._label = None  # per-instance cache filled by the `label` property

    @property
    def pulse_label(self):
        """Labels built from the pulse-level fields listed in ``PULSE``."""
        pulse = {key: self.feed.get(key) for key in self.PULSE}
        return PulseLabelGenerator(pulse).get_label()

    @property
    def indicator_label(self):
        """Labels built from the feed's ``indicators`` list."""
        indicators = self.feed.get("indicators")
        return IndicatorLabelGenerator(indicators).get_label()

    @property
    def label(self):
        """``ChainMap(pulse_label, indicator_label)``, computed once per instance.

        The result is cached on the instance rather than with ``lru_cache``:
        an ``lru_cache`` on a method is keyed by ``self`` in a class-level
        cache and therefore keeps every instance alive.
        """
        cached = getattr(self, "_label", None)
        if cached is None:
            cached = ChainMap(self.pulse_label, self.indicator_label)
            self._label = cached
        return cached

    def match(self, string, method="fuzzy_match"):
        """Return the label whose key is most similar to ``string``.

        Args:
            string: text to look up.
            method: name of a ``MatchThreshold`` member (``"fuzzy_match"`` or
                ``"exactly_match"``) selecting the minimum similarity.

        Returns:
            The label of the best-scoring key whose Jaro normalized
            similarity is at least the threshold, or ``None`` when no key
            qualifies.  On equal scores the key encountered first wins.

        Raises:
            KeyError: if ``method`` is not a ``MatchThreshold`` member name.
        """
        # Looked up once instead of once per candidate key.
        threshold = self.MatchThreshold[method].value

        best_label = None
        best_similarity = None
        for key, label in self.label.items():
            # Keys come from feed data; skip anything that is not text so the
            # similarity function is never handed None.
            if not isinstance(key, str):
                continue
            similarity = jaro.normalized_similarity(key, string)
            if similarity < threshold:
                continue
            # Strict `>` keeps the first key among ties.
            if best_similarity is None or similarity > best_similarity:
                best_label, best_similarity = label, similarity

        return best_label

    # Alias matching the `match_label` name used by later cells of this notebook.
    match_label = match

In [10]:
FeedLabeler(feed).label

ChainMap(ChainMap({'HelloKitty Ransomware Lacks Stealth, But Still Strikes Home': PulseLabel(category='name', group=0, role=None), 'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, Sentinel Labs analyzes a recent HelloKitty sample and outlines the basic behaviors and traits associated with this family of ransomware.': PulseLabel(category='description', group=0, role=None)}, {'HelloKitty': PulseLabel(category='tag', group=0, role=None), 'Ransomware': PulseLabel(category='tag', group=0, role=None), 'HelloKitty Ransomware': PulseLabel(category='malware_family', group=0, role=None), 'Win32:HelloKitty': PulseLabel(category='malware_family', group=0, role=None), 'Ransom.HelloKitty': PulseLabel(category='malware_family', group=0, role=None), 'T1005': PulseLabel(category='attack_id', group=0, role=

In [11]:
FeedLabeler(feed).match("6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad")

IndicatorLabel(category='domain', group=4, role=None)

---

In [12]:
import sys

sys.path.append("../")

In [13]:
from magpie.src.label import PulseLabelGenerator, IndicatorLabelGenerator
from magpie.src.labeler import FeedLabeler, ElementLabeler

from functional import seq

In [14]:
feed = seq.json("../data/hellokitty.feeds.json").to_dict()
# feed = seq.json("../data/hafnium.feeds.json").to_dict()

In [15]:
feed_labeler = FeedLabeler(feed)

In [16]:
feed_labeler.match_label("6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad")

IndicatorLabel(category='domain', group=4, role=None)

In [17]:
feed_labeler.label

ChainMap(ChainMap({'HelloKitty Ransomware Lacks Stealth, But Still Strikes Home': PulseLabel(category='name', group=0, role=None), 'HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, Sentinel Labs analyzes a recent HelloKitty sample and outlines the basic behaviors and traits associated with this family of ransomware.': PulseLabel(category='description', group=0, role=None)}, {'HelloKitty': PulseLabel(category='tag', group=0, role=None), 'Ransomware': PulseLabel(category='tag', group=0, role=None), 'HelloKitty Ransomware': PulseLabel(category='malware_family', group=0, role=None), 'Win32:HelloKitty': PulseLabel(category='malware_family', group=0, role=None), 'Ransom.HelloKitty': PulseLabel(category='malware_family', group=0, role=None), 'T1005': PulseLabel(category='attack_id', group=0, role=

---

In [18]:
from magpie.src.analyzer import HTMLElementAnalyzer
from magpie.src.extractor import ContentExtractor
from magpie.src.parser import EntityParser

In [19]:
# Read the saved HTML article into memory.  The `with` block guarantees the
# file handle is closed as soon as the content has been read.
with open(
    "../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html",
    # "../data/hafnium-targeting-exchange-servers.html",
    "r",
) as html_file:
    document = html_file.read()

In [20]:
# Analyzer over the raw HTML text loaded above.
analyzer = HTMLElementAnalyzer(document)
# Extractor used later to pull contents out of individual elements.
extractor = ContentExtractor()
# Entity parser configured with the keywords exposed by the analyzer.
parser = EntityParser(analyzer.keywords)

In [21]:
# Element-level labeler built from the analyzer's primary-subtree leaves,
# reusing the extractor, parser and feed labeler created in earlier cells.
element_labeler = ElementLabeler(
    analyzer.primary_subtree_leafs,
    content_extractor=extractor,
    entity_parser=parser,
    feed_labeler=feed_labeler,
)

---

In [22]:
from itertools import product

def product_by_contents(element):
    """Pair ``element`` with each content that ``extractor`` returns for it.

    Returns an iterator of ``(element, content)`` tuples.
    """
    contents = extractor.get_contents(element)
    return product([element], contents)


def product_by_entities(element, content):
    """Pair ``element`` and ``content`` with each match ``parser`` finds in ``content``.

    Returns an iterator of ``(element, content, match)`` tuples.
    """
    matches = parser.iterscan(content)
    return product([element], [content], matches)

In [23]:
def _to_row(element, content, text):
    """Flatten one ``(element, content, match)`` triple into a result row."""
    return (
        text.string,
        text.entity,
        # element.sourceline,
        element,
        content.property,
        content.span,
        text.span,
    )


def _attach_label(string, entity, element, prop, content_span, text_span):
    """Append a label to a row: the element-level label if any, else a fuzzy feed match."""
    # Look the element up once; fall back to the feed only when it has no label.
    label = element_labeler.match_label(element)
    if label is None:
        label = feed_labeler.match_label(string, method="fuzzy_match")
    return (string, entity, element, prop, content_span, text_span, label)


# Expand every leaf element into (element, content, entity match) rows, label
# each row, then keep only the distinct rows that received a label.
(
    seq(analyzer.primary_subtree_leafs)
    .map(product_by_contents)
    .flatten()
    .starmap(product_by_entities)
    .flatten()
    .starmap(_to_row)
    .starmap(_attach_label)
    .distinct()
    .filter(lambda row: row[6] is not None)
)

0,1,2,3,4,5,6
"HelloKitty Ransomware Lacks Stealth, But Still Strikes Home",,<Element h1 at 0x12b04ab48>,text,"(49, 108)","(0, 59)","PulseLabel(category='name', group=0, role=None)"
"HelloKitty is a ransomware family that emerged in late 2020. While it lacks the sophistication of some of the more well-known families such as Ryuk, REvil, and Conti, it has nevertheless struck some notable targets, including CEMIG0. In this post, we analyse a recent HelloKitty sample and outline the basic behaviors and traits associated with this family of ransomware.",,<Element p at 0x12b04b048>,text,"(3, 374)","(0, 371)","PulseLabel(category='description', group=0, role=None)"
6x7dp6h3w6q3ugjv4yv5gycj3femb24kysgry5b44hhgfwc5ml5qrdad.onion,domain,<Element code at 0x12b04b908>,text,"(6, 68)","(0, 62)","IndicatorLabel(category='domain', group=4, role=None)"
fadd8d7c13a18c251ded1f645ffea18a37f1c2de,FileHash-SHA1,<Element br at 0x12b04bb08>,tail,"(5, 46)","(1, 41)","IndicatorLabel(category='FileHash-SHA1', group=3, role=None)"
501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe,FileHash-SHA256,<Element br at 0x12b04bbc8>,tail,"(5, 70)","(1, 65)","IndicatorLabel(category='FileHash-SHA256', group=2, role=None)"
T1005,AttackTechnique,<Element a at 0x12b04bcc8>,text,"(45, 50)","(0, 5)","PulseLabel(category='attack_id', group=0, role=None)"
T1112,AttackTechnique,<Element a at 0x12b04bd48>,text,"(45, 50)","(0, 5)","PulseLabel(category='attack_id', group=0, role=None)"
T1012,AttackTechnique,<Element a at 0x12b04bdc8>,text,"(45, 50)","(0, 5)","PulseLabel(category='attack_id', group=0, role=None)"
T1082,AttackTechnique,<Element a at 0x12b04be48>,text,"(45, 50)","(0, 5)","PulseLabel(category='attack_id', group=0, role=None)"
T1486,AttackTechnique,<Element a at 0x12b04bec8>,text,"(45, 50)","(0, 5)","PulseLabel(category='attack_id', group=0, role=None)"


---