In [1]:
import sys
sys.path.append("../")

In [3]:
import pandas as pd
import numpy as np

In [9]:
from magpie.src.html_analyzer import Analyzer
article = "../data/Outbreak of Follina in Australia - Avast Threat Labs.html"
article_list = Analyzer.clean_article(article)

---
### Get Plain Text 

In [8]:
from readability import Document
from bs4 import BeautifulSoup
import unicodedata
import string
remove_words = '!"#$%&\'()*+,-.:;<=>?@[\\]^_`{|}~' + string.whitespace

In [13]:
class ModifyTags(object):

    def __init__(self, soup, tags):
        self.soup = soup
        self.replace_table_tag()
        self.delete_tags(tags)

    def replace_table_tag(self):
        table = self.soup.find_all('table')
        for row in table:
            allcols = row.findAll('tr')
            for col in allcols:
                thestrings = [s for s in col.findAll(string=True)]
                thestrings.append(' ')
                thetext = ' '.join(thestrings)
                new_tag = self.soup.new_tag('p')
                new_tag.string = thetext
                self.soup.find('table').append(new_tag)
            self.soup.find('tbody').decompose()

    def delete_tags(self, tags: list[str]):
        for drop_tags in self.soup.find_all(tags): # drop <pre> and <img> tag
            drop_tags.decompose()


In [14]:
def preprocess_article(article):
    with open(article, encoding="utf-8") as file:
        data = file.read()
    doc = Document(data)  # from python library readability
    summary = doc.summary(html_partial=True) # get readable content with html tags
    soup = BeautifulSoup(summary, "lxml")
    ModifyTags(soup, tags=['pre', 'img'])
    print(soup.prettify)
    # cleantext = unicodedata.normalize("NFKD",soup.text)
    # input = [i.rstrip(remove_words,) for i in cleantext.splitlines() if (i != "" and i != " ")] # sequence
    # return input

In [15]:
# article = "../data/hafnium-targeting-exchange-servers.html"
# article = "../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html"
# article = "../data/Emotet Returns With New Methods of Evasion.html"
# article = "../data/Hitching a ride with Mustang Panda - Avast Threat Labs.html"
# article = "../data/Outbreak of Follina in Australia - Avast Threat Labs.html"
# article = "../data/Dota 2 Under Attack_ How a V8 Bug Was Exploited in the Game - Avast Threat Labs.html"
article = "../data/new-horabot-targets-americas.html"
article_list = preprocess_article(article)
article_list

<bound method Tag.prettify of <html><body><div><div class="post-content">
<ul><li>Cisco Talos has observed a threat actor deploying a previously unidentified botnet program Talos is calling “Horabot,” which delivers a known banking trojan and spam tool onto victim machines in a campaign that has been ongoing since at least November 2020.</li><li>The threat actor appears to be targeting Spanish-speaking users in the Americas and, based on our analysis, may be located in Brazil.</li><li>Horabot enables the threat actor to control the victim’s Outlook mailbox, exfiltrate contacts’ email addresses, and send phishing emails with malicious HTML attachments to all addresses in the victim’s mailbox.</li><li>The banking trojan can collect the victim’s login credentials for various online accounts, operating system information and keystrokes. It also steals one-time security codes or soft tokens from the victim’s online banking applications.</li><li>The spam tool compromises Yahoo, Gmail and Out

---
### Import EntityParser

In [2]:
from magpie.src.entity import (
    attack_technique,
    bitcoin_address,
    cve,
    defender_threat,
    domain,
    email,
    filehash_md5,
    filehash_sha1,
    filehash_sha256,
    filepath,
    hostname,
    ipv4,
    ipv6,
    # keyword,
    sslcert_fingerprint,
    uri,
    url,
)

parser = (
    defender_threat.parser
    | url.parser
    | uri.parser
    | email.parser
    | hostname.parser
    | domain.parser
    | sslcert_fingerprint.parser
    | ipv6.parser
    | ipv4.parser
    | cve.parser
    | attack_technique.parser
    | filepath.parser
    | filehash_sha256.parser  # len = 64
    | filehash_sha1.parser  # len = 40
    | bitcoin_address.parser  # len = 34
    | filehash_md5.parser  # len = 32
    # | keyword.make_parser(self.extracted_keywords)  # generate keywords parser
)

In [6]:
from magpie.src.html_analyzer import Analyzer

# article = "../data/hafnium-targeting-exchange-servers.html"
# article = "../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html"
# article = "../data/Emotet Returns With New Methods of Evasion.html"
# article = "../data/Hitching a ride with Mustang Panda - Avast Threat Labs.html"
# article = "../data/Outbreak of Follina in Australia - Avast Threat Labs.html"
# article = "../data/Dota 2 Under Attack_ How a V8 Bug Was Exploited in the Game - Avast Threat Labs.html"
# article = "../data/Operation Magalenha _ Long-Running Campaign Pursues Portuguese Credentials and PII - SentinelOne.html"
# article = "../data/analysis-of-new-active-malware-mediaarena-pua.html"
article = "../data/new-horabot-targets-americas.html"

cleantext = Analyzer.clean_article(article)
cleantext

['',
 "Cisco Talos has observed a threat actor deploying a previously unidentified botnet program Talos is calling “Horabot,” which delivers a known banking trojan and spam tool onto victim machines in a campaign that has been ongoing since at least November 2020.The threat actor appears to be targeting Spanish-speaking users in the Americas and, based on our analysis, may be located in Brazil.Horabot enables the threat actor to control the victim’s Outlook mailbox, exfiltrate contacts’ email addresses, and send phishing emails with malicious HTML attachments to all addresses in the victim’s mailbox.The banking trojan can collect the victim’s login credentials for various online accounts, operating system information and keystrokes. It also steals one-time security codes or soft tokens from the victim’s online banking applications.The spam tool compromises Yahoo, Gmail and Outlook webmail accounts, enabling the threat actor to take control of those mailboxes, exfiltrate their contacts’

In [9]:
from functional import seq
result = list(parser.scanString(cleantext))
a = (
    seq(result)
    .starmap(lambda result, start, end: (
        result[0].rstrip(remove_words,), result.getName(), (start, end), result.asDict()))
    .cache()
    )
a

0,1,2,3
185[.]45[.]195[.]226,IPv4,"(2042, 2062)",{'IPv4': '185[.]45[.]195[.]226'}
216[.]238[.]70[.]224,IPv4,"(2194, 2214)",{'IPv4': '216[.]238[.]70[.]224'}
tributaria[.]website,Domain,"(2812, 2832)",{'Domain': 'tributaria[.]website'}
tributaria[.]website,Domain,"(3159, 3179)",{'Domain': 'tributaria[.]website'}
tributaria[.]website,Domain,"(3265, 3285)",{'Domain': 'tributaria[.]website'}
m9b4s2[.]site,Domain,"(3680, 3693)",{'Domain': 'm9b4s2[.]site'}
tributaria[.]website,Domain,"(3710, 3730)",{'Domain': 'tributaria[.]website'}
wiqp[.]xyz,Domain,"(3742, 3752)",{'Domain': 'wiqp[.]xyz'}
ckws[.]info,Domain,"(3766, 3777)",{'Domain': 'ckws[.]info'}
amarte[.]store,Domain,"(3792, 3806)",{'Domain': 'amarte[.]store'}


In [11]:
result[0].getName()

AttributeError: 'tuple' object has no attribute 'getName'

In [9]:
a.show(headers=('token', 'entity', 'location', 'dict'), n=70)

token                                                                                                                                                                                                                                                                                               entity     location        dict
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  ---------  --------------  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
185[.]45[.]195[.]226                                   

In [10]:
for line in article_list:
    entities = list(parser.scanString(line))
    if len(entities)>0:
        print(entities)

[(ParseResults(['CVE-2022-30190'], {'CVE': 'CVE-2022-30190'}), 135, 149)]
[(ParseResults(['CVE-2022-30190'], {'CVE': 'CVE-2022-30190'}), 171, 185)]
[(ParseResults(['robots.txt'], {'File Path': 'robots.txt'}), 79, 89), (ParseResults(['robots.txt'], {'File Path': 'robots.txt'}), 137, 147), (ParseResults(['msdt.exe'], {'File Path': 'msdt.exe'}), 284, 292), (ParseResults(['robots.txt'], {'File Path': 'robots.txt'}), 306, 316), (ParseResults(['Sihost.exe'], {'File Path': 'Sihost.exe'}), 356, 366)]
[(ParseResults(['Sihost.exe'], {'File Path': 'Sihost.exe'}), 14, 24)]
[(ParseResults(['robots.txt'], {'File Path': 'robots.txt'}), 18, 28), (ParseResults(['Sihost.exe'], {'File Path': 'Sihost.exe'}), 33, 43), (ParseResults(['msdt.exe'], {'File Path': 'msdt.exe'}), 63, 71), (ParseResults(['b63fbf80351b3480c62a6a5158334ec8e91fecd057f6c19e4b4dd3febaa9d447.'], {'FileHash-SHA256': 'b63fbf80351b3480c62a6a5158334ec8e91fecd057f6c19e4b4dd3febaa9d447.'}), 150, 215), (ParseResults(['favicon.svg'], {'File Pat

---
### Data Example:
```json
{
    "data": [
        {
            "tokens": ["fadd8d7c13a18c251ded1f645ffea18a37f1c2de"], 
            "ner_tags": ["B-sha1"]
        }, 
        {
            "tokens": ["Query", "Registry", "-", "T1012"], 
            "ner_tags": ["O", "O", "O", "B-attackID"]
        },  
        {
            "tokens": ["Ransomware"], 
            "ner_tags": ["O"]
        }
    ]
}
```

In [18]:
convert_cat_name = {
    'AttackTechnique': 'attackID', 'BitcoinAddress': 'bitcoinAddr', 'CVE': 'cve', 'MicrosoftDefenderThreat': 'defenderThreat',
    'domain': 'domain', 'email': 'email', 'FileHash-MD5': 'md5', 'FileHash-SHA1': 'sha1', 'FileHash-SHA256': 'sha256',
    'FilePath': 'filepath', 'hostname': 'hostname', 'IPv4': 'ipv4', 'IPv6': 'ipv6', 'SSLCertFingerprint': 'fingerprint',
    'URI': 'uri', 'URL': 'url', 'YARA': 'yara'
}

# ner_tags = {
#     'O': 0, 'B-attackID': 1, 'I-attackID': 2, 'B-bitcoinAddr': 3, 'I-bitcoinAddr': 4, 'B-cve': 5, 'I-cve': 6, 
#     'B-defenderThreat': 7, 'I-defenderThreat': 8, 'B-domain': 9, 'I-domain': 10, 'B-email': 11, 'I-email': 12, 'B-md5': 13, 
#     'I-md5': 14, 'B-sha1': 15, 'I-sha1': 16, 'B-sha256': 17,'I-sha256': 18, 'B-filepath': 19, 'I-filepath': 20, 
#     'B-hostname': 21, 'I-hostname': 22, 'B-ipv4': 23, 'I-ipv4': 24, 'B-ipv6': 25, 'I-ipv6': 26, 'B-fingerprint': 27, 'I-fingerprint': 28,
#     'B-uri': 29, 'I-uri': 30, 'B-url': 31, 'I-url': 32, 'B-yara': 33, 'I-yara': 34   
# }

#### Split by space

In [499]:
## create data dictionary
from nltk.tokenize import TreebankWordTokenizer, word_tokenize, casual_tokenize
tokenizer = TreebankWordTokenizer()
data = []
for idx, line in enumerate(article_list): # each sentence
    data_dict = {}
    entities = list(parser.scanString(line))
    entity_list = [entity[0][0] for entity in entities]  # ['taskkill.exe', 'net.exe']
    category = [entity[0].getName() for entity in entities]  # ['FilePath', 'FilePath']
    tokens = tokenizer.tokenize(line)  # tokenize
    
    if len(entities) > 0:
        data_dict.update({"id": idx, "tokens": tokens, "entity": entity_list, "category": category})
    else:
        data_dict.update({"id": idx, "tokens": tokens, "entity": ["None"], "category": ["None"]})
    data.append(data_dict)
# data

In [None]:
def mapBioTags(tags):
    next_element = 0
    bio_tags= []
    for x in range(len(tags)):
        element = tags[x]
        if element != 'O':
            if next_element == element:
                bio_tags.append(f'I-{element}')
            else:
                bio_tags.append(f'B-{element}')
        else:
            bio_tags.append(element)
        next_element = element
    bio_tags = list(map(ner_tags.get, bio_tags, bio_tags))
    return bio_tags

def bio_labelling(each_row, convert_cat_name):
    entity_index = {each_row["tokens"].index(ele): convert_cat_name[each_row["category"][idx]] for idx, ele in enumerate(each_row["entity"])}
    for k, v in entity_index.items(): # 替換ner_tags中，屬於entity的token (i.e 0 -> attackID)
        each_row["ner_tags"][k] = v
    bio_tags = mapBioTags(each_row["ner_tags"])
    return bio_tags

## save as a list
final_dataset = []
for each_row in data:
    each_row.update({"ner_tags": ['O' for count in range(len(each_row["tokens"]))]})
    if "None" not in each_row["entity"]:
        print(each_row)
        each_row["ner_tags"] = bio_labelling(each_row, convert_cat_name)
    final_dataset.append({"id": each_row["id"], "tokens": each_row["tokens"], "ner_tags": each_row["ner_tags"]})

## save a a dataframe and export as json file
df = pd.DataFrame.from_records(final_dataset)
df.to_json("../data/testSplit_data.json", orient='records', lines=True, force_ascii=False)

#### Tokenizer 1: NLTK

In [190]:
from nltk.tokenize import word_tokenize

In [1]:
## create data dictionary
data = []
for idx, line in enumerate(article_list): # each sentence
    data_dict = {}
    entities = list(parser.scanString(line))
    entity_list = [entity[0][0] for entity in entities]  # ['taskkill.exe', 'net.exe']
    category = [entity[0].getName() for entity in entities]  # ['FilePath', 'FilePath']
    tokens = word_tokenize(line)  # tokenize
    
    if len(entities) > 0:
        data_dict.update({"id": idx, "tokens": tokens, "entity": entity_list, "category": category})
    else:
        data_dict.update({"id": idx, "tokens": tokens, "entity": ["None"], "category": ["None"]})
    data.append(data_dict)
data

NameError: name 'article_list' is not defined

In [74]:
def mapBioTags(tags):
    next_element = 0
    bio_tags= []
    for x in range(len(tags)):
        element = tags[x]
        if element != 'O':
            if next_element == element:
                bio_tags.append(f'I-{element}')
            else:
                bio_tags.append(f'B-{element}')
        else:
            bio_tags.append(element)
        next_element = element
    # bio_tags = list(map(ner_tags.get, bio_tags, bio_tags))
    return bio_tags

def bio_labelling(each_row, convert_cat_name):
    entity_index = {each_row["tokens"].index(ele): convert_cat_name[each_row["category"][idx]] for idx, ele in enumerate(each_row["entity"])}
    for k, v in entity_index.items(): # 替換ner_tags中，屬於entity的token (i.e 0 -> attackID)
        each_row["ner_tags"][k] = v
    bio_tags = mapBioTags(each_row["ner_tags"])
    return bio_tags

## save as a list
final_dataset = {"data": []}
for each_row in data:
    each_row.update({"ner_tags": ['O' for count in range(len(each_row["tokens"]))]})
    if "None" not in each_row["entity"]:
        each_row["ner_tags"] = bio_labelling(each_row, convert_cat_name)
    final_dataset["data"].append({"tokens": each_row["tokens"], "ner_tags": each_row["ner_tags"]})

## export as json file
with open("../data/huggingface/testing.json", "w") as output:
    json.dump(final_dataset, output)

#### Tokenizer 2: spaCy
某些符號會被斷詞 i.e dsa* (會被分成兩個詞)

In [8]:
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [201]:
## create data dictionary
data = []
nlp = spacy.load("en_core_web_sm")
for idx, line in enumerate(article_list): # each sentence
    data_dict = {}
    entities = list(parser.scanString(line))
    entity_list = [entity[0][0] for entity in entities]  
    category = [entity[0].getName() for entity in entities] 
    doc = nlp(line)
    tokens = [token.text for token in doc]

    if len(entities) > 0:
        data_dict.update({"id": idx, "tokens": tokens, "entity": entity_list, "category": category})
    else:
        data_dict.update({"id": idx, "tokens": tokens, "entity": ["None"], "category": ["None"]})
    data.append(data_dict)

In [203]:
def mapBioTags(tags):
    next_element = 0
    bio_tags= []
    for x in range(len(tags)):
        element = tags[x]
        if element != 0:
            if next_element == element:
                bio_tags.append(f'I-{element}')
            else:
                bio_tags.append(f'B-{element}')
        else:
            bio_tags.append(element)
        next_element = element
    bio_tags = list(map(ner_tags.get, bio_tags, bio_tags))
    return bio_tags

def bio_labelling(each_row, convert_cat_name):
    entity_index = {each_row["tokens"].index(ele): convert_cat_name[each_row["category"][idx]] for idx, ele in enumerate(each_row["entity"])}
    for k, v in entity_index.items(): # 替換ner_tags中，屬於entity的token (i.e 0 -> attackID)
        each_row["ner_tags"][k] = v
    bio_tags = mapBioTags(each_row["ner_tags"])
    return bio_tags

## save as a list
final_dataset = []
for each_row in data:
    each_row.update({"ner_tags": [0 for count in range(len(each_row["tokens"]))]})
    if "None" not in each_row["entity"]:
        print(each_row)
        each_row["ner_tags"] = bio_labelling(each_row, convert_cat_name)
    final_dataset.append({"id": each_row["id"], "tokens": each_row["tokens"], "ner_tags": each_row["ner_tags"]})

## save a a dataframe and export as json file
df = pd.DataFrame.from_records(final_dataset)
df
# df.to_json("../data/testSpacy_data.json", orient='records', lines=True, force_ascii=False)

{'id': 6, 'tokens': ['The', 'vulnerabilities', 'recently', 'being', 'exploited', 'were', 'CVE-2021', '-', '26855', ',', 'CVE-2021', '-', '26857', ',', 'CVE-2021', '-', '26858', ',', 'and', 'CVE-2021', '-', '27065', ',', 'all', 'of', 'which', 'were', 'addressed', 'in', 'today', '’s', 'Microsoft', 'Security', 'Response', 'Center', '(', 'MSRC', ')', 'release', '–', 'Multiple', 'Security', 'Updates', 'Released', 'for', 'Exchange', 'Server', '.', 'We', 'strongly', 'urge', 'customers', 'to', 'update', 'on', '-', 'premises', 'systems', 'immediately', '.', 'Exchange', 'Online', 'is', 'not', 'affected', '.', 'We', 'have', 'established', 'a', 'resource', 'center', 'that', 'is', 'constantly', 'updated', 'as', 'more', 'information', 'becomes', 'available', 'at', 'https://aka.ms/ExchangeVulns', '.'], 'entity': ['CVE-2021-26855', 'CVE-2021-26857', 'CVE-2021-26858', 'CVE-2021-27065', 'https://aka.ms/ExchangeVulns.'], 'category': ['CVE', 'CVE', 'CVE', 'CVE', 'URI'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0

ValueError: 'CVE-2021-26855' is not in list

#### Tokenizer 3: BertTokenizer
建立資料集好像不太適合，會把特殊字詞拆分成好多個 (未來可以嘗試看看)

In [18]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize("taskkill.exe /f /PID '8512'")

['task',
 '##kill',
 '.',
 'ex',
 '##e',
 '/',
 'f',
 '/',
 'pi',
 '##d',
 "'",
 '85',
 '##12',
 "'"]

----
### Final Version: use spaCy tokenizer and add special case (entities)

In [34]:
def mapBioTags(tags):
    next_element = 0
    bio_tags= []
    for x in range(len(tags)):
        element = tags[x]
        if element != 'O':
            if next_element == element:
                bio_tags.append(f'I-{element}')
            else:
                bio_tags.append(f'B-{element}')
        else:
            bio_tags.append(element)
        next_element = element
    return bio_tags

def add_entities(entity):
    entity_list = []
    if entity:       
        for ents in entity:            
            if ents[0].getName() != 'URL' and ents[0].getName() != 'URI':
                special_case = [{ORTH: ents[0][0]}]
                nlp.tokenizer.add_special_case(ents[0][0], special_case)
                entity_list.append((ents[0][0], ents[0].getName()))
            else:
                if ents[0][0][-1] == '.':
                    special_case = [{ORTH: ents[0][0][:-1]}, {ORTH: "."}]
                    nlp.tokenizer.add_special_case(ents[0][0], special_case)
                    entity_list.append((ents[0][0][:-1], ents[0].getName()))
    return entity_list

In [None]:
import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")
final_dataset = {"data": []}
for idx, line in enumerate(article_list): # each sentence
    entities = list(parser.scanString(line))
    entity_list = add_entities(entities)
    doc = nlp(line)
    tokens = [w.text for w in doc]
    ner_tags = ["O" for w in doc]
    for idx, ele in enumerate(entity_list):
        ner_tags[tokens.index(ele[0])] = convert_cat_name[ele[1]]
    bio_tags = mapBioTags(ner_tags)
    final_dataset["data"].append({"tokens": tokens, "ner_tags": bio_tags})

import json
with open("../data/huggingface/hafnium_0607.json", "w") as output:
    json.dump(final_dataset, output, ensure_ascii=False)

---
## Extracting Steps: 
1. 網頁文章清洗 Readability
2. 萃取命名實體並建立dictionary
3. 修改ner_tags成代號 (tbd)
4. save as json file

## Read Reports

1. get ids from pulse_20210310.json.gz:

    `gunzip -c pulse_20210310.json.gz | head -n 5 | jq '.id'`

2. parallel processing:

    `gunzip -c pulse_20210310.json.gz | head -n 5 | jq '.id' | parallel xxx.py`

In [2]:
from os import listdir
from os.path import isfile, join, exists
my_path = "/Users/csti-user/Downloads/Projects/feed_references"

import sys
id = '603eb1abdd4812819c64e197' #sys.argv[1]

In [8]:
DIRPATH = join(my_path, id)
invalid = ['PDF', 'pdf', 'html']
if exists(DIRPATH) == True: 
    for f in listdir(DIRPATH):
        if all(ele not in f for ele in invalid):
            print(f'{id}: {f}')
            file_name = join(DIRPATH, f)
            article_list = preprocess_article(file_name)
            if len(article_list) > 0:
                print(article_list)
else:
    print(f'{id} not exists. {join(my_path, id)}')

603eb1abdd4812819c64e197: exchange-zero-day-proxylogon-and-hafnium
['Sweden+46 8 10 00 10hello@truesec.seHeadquartersStockholmOxtorgsgränd 2, 5 tr111 57 StockholmMalmöTorggatan 4, 7 tr211 40 MalmöKarlskronaDrottninggatan 54, 2 tr371 33 KarlskronaUnited States+1 (425) 818-8044info@truesec.comJacksonville50 North Laura Street, Suite 2500371 33 Florida 3220']
603eb1abdd4812819c64e197: aa21-062a
['Official websites use .gov', 'A .gov website belongs to an official government organization in the United States.', "Secure .gov websites use HTTPS A lock () or https:// means you've safely connected to the .gov website. Share sensitive information only on official, secure websites."]
603eb1abdd4812819c64e197: hafnium-targeting-exchange-servers
603eb1abdd4812819c64e197: microsoft-exchange-server-vulnerabilities
['Threat Assessment: Active Exploitation of Four Zero-Day Vulnerabilities in Microsoft Exchange Server']
603eb1abdd4812819c64e197: active-exploitation-of-microsoft-exchange-zero-day-vul