## Model Prediction

In [6]:
import unicodedata

from bs4 import BeautifulSoup
from readability import Document

def preprocess_article(article):
    """Extract the readable text of a saved HTML article as a list of lines.

    The file is run through ``readability.Document`` to isolate the main
    content, every ``<pre>`` and ``<code>`` element is removed, and the
    remaining text is NFKD-normalized and split into lines.

    Args:
        article: Path to a UTF-8 encoded HTML file.

    Returns:
        list[str]: The non-blank lines of the article, in document order.
        Lines are returned unmodified (not stripped), so character offsets
        computed on a line refer to the text exactly as extracted.
    """
    with open(article, encoding="utf-8") as file:
        data = file.read()

    doc = Document(data)  # from the python library `readability`
    summary = doc.summary(html_partial=True)  # readable content, still with HTML tags
    soup = BeautifulSoup(summary, "lxml")

    # Drop both <pre> and <code> elements before extracting the text.
    for drop_tag in soup.find_all(['pre', 'code']):
        drop_tag.decompose()

    cleantext = unicodedata.normalize("NFKD", soup.text)

    # Skip every whitespace-only line, not just "" and " ": runs of spaces or
    # tabs (and non-breaking spaces, which NFKD turns into plain spaces) carry
    # no text and should not be handed on as sequences.
    return [line for line in cleantext.splitlines() if line.strip()]

In [12]:
# Display the current value of `article` (the list of text lines returned by
# preprocess_article).
# NOTE(review): in the visible notebook `article` is first assigned in a cell
# further down, so this cell presumably fails on a fresh Restart & Run All --
# confirm and move it below that assignment if so.
article

['Update [03/16/2021]: Microsoft released updated tools and investigation guidance to help IT Pros and incident response teams identify, remediate, defend against associated attacks: Guidance for responders: Investigating and remediating on-premises Exchange Server vulnerabilities.',
 'Update [03/15/2021]: Microsoft released a new one-click mitigation tool, the Microsoft Exchange On-Premises Mitigation Tool, to help customers who do not have dedicated security or IT teams to apply security updates for Microsoft Exchange Server. ',
 'Update [03/08/2021]: Microsoft continues to see multiple actors taking advantage of unpatched systems to attack organizations with on-premises Exchange Server. To aid defenders in investigating these attacks where Microsoft security products and tooling may not be deployed, we are releasing a feed of observed indicators of compromise (IOCs). The feed of malware hashes and known malicious file paths observed in related attacks is available in both JSON and C

In [4]:
from transformers import pipeline
# Checkpoint used for token classification.
# Alternative checkpoint: "cynthiachan/finetuned-deberta-base-10pct"
model = "cynthiachan/finetune-deberta"

# NER pipeline; "simple" aggregation groups sub-word tokens into entity spans.
token_classifier = pipeline(
    task="ner",
    model=model,
    aggregation_strategy="simple",
)

Downloading: 100%|██████████| 2.25k/2.25k [00:00<00:00, 695kB/s]
Downloading: 100%|██████████| 529M/529M [03:36<00:00, 2.56MB/s]   
Downloading: 100%|██████████| 1.40k/1.40k [00:00<00:00, 531kB/s]
Downloading: 100%|██████████| 780k/780k [00:02<00:00, 358kB/s]  
Downloading: 100%|██████████| 446k/446k [00:01<00:00, 290kB/s] 
Downloading: 100%|██████████| 2.01M/2.01M [00:04<00:00, 427kB/s] 
Downloading: 100%|██████████| 963/963 [00:00<00:00, 324kB/s]


In [11]:
# Gather every predicted entity of the article as a one-item dict
# {entity_group: word}, keeping the order in which the lines are processed.
article = preprocess_article('../data/hafnium-targeting-exchange-servers.html')
returnList = []
for line in article:
    returnList.extend(
        {prediction['entity_group']: prediction['word']}
        for prediction in token_classifier(line)
    )
returnList

[{'cve': ' CVE-2021-26855'},
 {'cve': ' CVE-2021-26857'},
 {'cve': ' CVE-2021-26858'},
 {'cve': ' CVE-2021-27065'},
 {'cve': ' CVE-2021-26855'},
 {'cve': ' CVE-2021-26857'},
 {'filepath': ' service. Insecure'},
 {'cve': ' CVE-2021-26858'},
 {'cve': ' CVE-2021-27065'},
 {'cve': ' CVE-2021-26855'},
 {'cve': ' CVE-2021-26858'},
 {'cve': ' CVE-2021-26857'},
 {'filepath': ' System.InvalidCastException'},
 {'cve': ' CVE-2021-27065'},
 {'filepath': ' script. InternalUrl'},
 {'sha256': ' b75f163ca9b9240bf4b37ad92bc7556b40a17e27c2b8ed5c8991385fe07d17d0'},
 {'sha256': ' 097549cf7d0f76f0d99edf8b2d91c60977fd6a96e4b8c3c94b0b1733dc026d3e'},
 {'sha256': ' 2b6f1ebb2208e93ade4a6424555d6a8341fd6d9f60c25e44afe11008f5c1aad1'},
 {'sha256': ' 65149e036fff06026d80ac9ad4d156332822dc93142cf1a122b1841ec8de34b5'},
 {'sha256': ' 511df0e2df9bfa5521b588cc4bb5f8c5a321801b803394ebc493db1ef3c78fa1'},
 {'sha256': ' 4edc7770464a14f54d17f36dc9d0fe854f68b346b27b35a6f5839adf1f13f8ea'},
 {'sha256': ' 811157f9c7003ba8d17b45e

In [9]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/hafnium-targeting-exchange-servers.html')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'cve', 'score': 0.999722, 'word': ' CVE-2021-26855', 'start': 49, 'end': 64}, {'entity_group': 'cve', 'score': 0.99955887, 'word': ' CVE-2021-26857', 'start': 65, 'end': 80}, {'entity_group': 'cve', 'score': 0.9996373, 'word': ' CVE-2021-26858', 'start': 81, 'end': 96}, {'entity_group': 'cve', 'score': 0.99967283, 'word': ' CVE-2021-27065', 'start': 101, 'end': 116}]
[{'entity_group': 'cve', 'score': 0.9998477, 'word': ' CVE-2021-26855', 'start': 0, 'end': 14}]
[{'entity_group': 'cve', 'score': 0.9998457, 'word': ' CVE-2021-26857', 'start': 0, 'end': 14}, {'entity_group': 'filepath', 'score': 0.7159449, 'word': ' service. Insecure', 'start': 84, 'end': 102}]
[{'entity_group': 'cve', 'score': 0.9998282, 'word': ' CVE-2021-26858', 'start': 0, 'end': 14}]
[{'entity_group': 'cve', 'score': 0.9998396, 'word': ' CVE-2021-27065', 'start': 0, 'end': 14}]
[{'entity_group': 'cve', 'score': 0.9998816, 'word': ' CVE-2021-26855', 'start': 0, 'end': 14}]
[{'entity_group': 'cve', 's

In [3]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'md5', 'score': 0.9441428, 'word': ' fadd8d7c13a18c251ded1f645ffea18a37f1c2de', 'start': 0, 'end': 40}]
[{'entity_group': 'sha256', 'score': 0.9947377, 'word': ' 501487b025f25ddf1ca32deb57a2b4db43ccf6635c1edc74b9cff54ce0e5bcfe', 'start': 0, 'end': 64}]
[{'entity_group': 'attackID', 'score': 0.99644643, 'word': ' T1005', 'start': 24, 'end': 30}]
[{'entity_group': 'attackID', 'score': 0.9964326, 'word': ' T1112', 'start': 17, 'end': 23}]
[{'entity_group': 'attackID', 'score': 0.99631554, 'word': ' T1012', 'start': 16, 'end': 22}]
[{'entity_group': 'attackID', 'score': 0.9965721, 'word': ' T1082', 'start': 30, 'end': 36}]
[{'entity_group': 'attackID', 'score': 0.9964803, 'word': ' T1486', 'start': 27, 'end': 33}]
[{'entity_group': 'attackID', 'score': 0.99612683, 'word': ' T1070.004', 'start': 15, 'end': 25}]
[{'entity_group': 'attackID', 'score': 0.99629706, 'word': ' T1059.003', 'start': 58, 'end': 68}]
[{'entity_group': 'attackID', 'score': 0.99651986, 'word': ' T1047

## BERT

In [27]:
from transformers import pipeline
# Checkpoint used for token classification in this section.
model = "cynthiachan/hp-search-bert"

# NER pipeline; "simple" aggregation groups sub-word tokens into entity spans.
token_classifier = pipeline(
    task="ner",
    model=model,
    aggregation_strategy="simple",
)

In [31]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/Compromised Turkish Government Web site leads to malware-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'ipv4', 'score': 0.9139078, 'word': '103. 246. 115.', 'start': 75, 'end': 87}]
[{'entity_group': 'md5', 'score': 0.9994227, 'word': 'adc9cafbd4e2aa91e4aa75e10a948213', 'start': 5, 'end': 37}]
[{'entity_group': 'ipv4', 'score': 0.99973077, 'word': '103. 246. 115. 238', 'start': 85, 'end': 100}]
[{'entity_group': 'hostname', 'score': 0.9998227, 'word': 'qpqaaa. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998191, 'word': 'ohbkaa. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998233, 'word': 'wknqba. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.99982136, 'word': 'wnewca. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998214, 'word': 'arlrda. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998236, 'word': 'umozea. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 's

In [32]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/Mumblehard indicators-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'domain', 'score': 0.99952793, 'word': 'yellsoft. net', 'start': 369, 'end': 381}]
1


In [33]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/NetTraveler APT Targets Russian, European Interests-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'cve', 'score': 0.999541, 'word': 'CVE - 2012 - 0158', 'start': 620, 'end': 633}]
[{'entity_group': 'filepath', 'score': 0.9864459, 'word': 'PlugX. Palo Alto', 'start': 210, 'end': 226}]
[{'entity_group': 'filepath', 'score': 0.9989669, 'word': 'USA. doc', 'start': 148, 'end': 155}]
[{'entity_group': 'filepath', 'score': 0.90002304, 'word': 'equipment', 'start': 45, 'end': 54}, {'entity_group': 'domain', 'score': 0.64700997, 'word': '. scr', 'start': 54, 'end': 58}]
[{'entity_group': 'filepath', 'score': 0.9985293, 'word': 'state. scr', 'start': 26, 'end': 35}]
[{'entity_group': 'filepath', 'score': 0.9803484, 'word': 'commission. scr', 'start': 39, 'end': 53}]
[{'entity_group': 'domain', 'score': 0.60196173, 'word': '##c', 'start': 23, 'end': 24}]
[{'entity_group': 'filepath', 'score': 0.9996863, 'word': 'Bordyuzh. scr', 'start': 39, 'end': 51}]
[{'entity_group': 'filepath', 'score': 0.9991908, 'word': '20. 04. 2016', 'start': 54, 'end': 64}]
[{'entity_group': 'filep

## DeBERTa

In [34]:
from transformers import pipeline
# Checkpoint used for token classification in this section.
model = "cynthiachan/hp-search-deberta"

# NER pipeline; "simple" aggregation groups sub-word tokens into entity spans.
token_classifier = pipeline(
    task="ner",
    model=model,
    aggregation_strategy="simple",
)

In [37]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/Compromised Turkish Government Web site leads to malware-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'md5', 'score': 0.996152, 'word': 'adc9cafbd4e2aa91e4aa75e10a948213', 'start': 5, 'end': 37}, {'entity_group': 'filepath', 'score': 0.76296157, 'word': 'Heuristic. LooksLike', 'start': 86, 'end': 105}, {'entity_group': 'filepath', 'score': 0.68075883, 'word': 'Win32', 'start': 106, 'end': 111}]
[{'entity_group': 'ipv4', 'score': 0.9987283, 'word': '103. 246. 115. 238', 'start': 85, 'end': 100}]
[{'entity_group': 'hostname', 'score': 0.9998481, 'word': 'qpqaaa. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998465, 'word': 'ohbkaa. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.9998503, 'word': 'wknqba. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.99985045, 'word': 'wnewca. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score': 0.99984515, 'word': 'arlrda. best. volyn. ua', 'start': 0, 'end': 20}]
[{'entity_group': 'hostname', 'score'

In [35]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/NetTraveler APT Targets Russian, European Interests-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'cve', 'score': 0.9995779, 'word': 'CVE - 2012 - 0158', 'start': 620, 'end': 633}]
[{'entity_group': 'filepath', 'score': 0.72257155, 'word': 'USA. doc', 'start': 148, 'end': 155}]
[{'entity_group': 'filepath', 'score': 0.9997755, 'word': 'state. scr', 'start': 26, 'end': 35}]
[{'entity_group': 'filepath', 'score': 0.99941957, 'word': 'commission. scr', 'start': 39, 'end': 53}]
[{'entity_group': 'filepath', 'score': 0.6456349, 'word': 'Б', 'start': 14, 'end': 15}, {'entity_group': 'filepath', 'score': 0.7893932, 'word': 'scr', 'start': 22, 'end': 25}]
[{'entity_group': 'filepath', 'score': 0.9997961, 'word': 'Bordyuzh. scr', 'start': 39, 'end': 51}]
[{'entity_group': 'filepath', 'score': 0.99944705, 'word': '20. 04. 2016', 'start': 54, 'end': 64}]
[{'entity_group': 'filepath', 'score': 0.99967045, 'word': '20. 04. 2016. scr', 'start': 46, 'end': 60}]
[{'entity_group': 'cve', 'score': 0.9994048, 'word': 'CVE - 2012 - 0158', 'start': 125, 'end': 138}, {'entity_group': '

In [36]:
# Classify the article line by line: print the entity list of every line for
# which the classifier returned something, then print how many such lines there were.
article = preprocess_article('../data/Mumblehard indicators-1')
cnt = 0
for rows in article:
    entities = token_classifier(rows)
    if not entities:
        continue  # nothing detected on this line
    print(entities)
    cnt += 1
print(cnt)

[{'entity_group': 'domain', 'score': 0.99954545, 'word': 'yellsoft. net', 'start': 369, 'end': 381}]
1
