In [12]:
import pandas as pd
import os
from stanza.server import CoreNLPClient
import glob
from collections import Counter
import pickle
from gensim.models import Word2Vec
from fuzzywuzzy import fuzz
from scipy import spatial
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
from simpletransformers.ner import NERModel
import re
import spacy
from iocparser import IOCParser
import scipy
import neuralcoref
import sys
import stanza.protobuf.CoreNLP_pb2 as CoreNLP_pb2
from nltk.corpus import words
from tqdm.contrib import tzip


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes instead of being left to the garbage collector.
    """
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point this at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# English vocabulary from the NLTK `words` corpus.
word_set = set(words.words())
# Expose stanza's protobuf module under the bare name `CoreNLP_pb2`.
# NOTE(review): presumably required so objects pickled against that module
# name can be loaded again - confirm.
sys.modules['CoreNLP_pb2'] = CoreNLP_pb2
os.environ["CORENLP_HOME"] = 'C:/Users/H/Documents/stanford-corenlp-4.5.1'
w2v = _load_pickle('pkl/w2v_ref_blog_lemma.pkl')
nlp = spacy.load('en_core_web_lg', exclude=['ner', 'lemmatizer'])
neuralcoref.add_to_pipe(nlp)

mitre_groups = _load_pickle('pkl/mitre_groups.pkl')
mitre_malwares = _load_pickle('pkl/mitre_malwares.pkl')
programm_language = _load_pickle('pkl/programming_language.pkl')

# Lower-cased once here because simple_match() compares them against
# lower-cased candidate phrases.
mitre_groups = set(map(str.lower, mitre_groups))
mitre_malwares = set(map(str.lower, mitre_malwares))

malware = _load_pickle('pkl/malware.pkl')
software = _load_pickle('pkl/software.pkl')
actor = _load_pickle('pkl/actor.pkl')

# Generic words that are dropped from multi-word malware names.
mal_suffix = ['rat', 'downloader', 'trojan', 'ransomware', 'malware', 'virus', 'worm', 'backdoor']
for malname in list(malware):
    if ' ' not in malname:
        continue
    name_tokens = malname.split(' ')
    if any(tok in mal_suffix for tok in name_tokens):
        # Remove the original entry exactly once; removing it once per
        # matching suffix raised KeyError for names with two suffix words.
        malware.remove(malname)
        # The remaining tokens are concatenated without a separator.
        stripped = ''.join(tok for tok in name_tokens if tok not in mal_suffix)
        # A name made only of suffix words would collapse to '', and ''
        # is a substring of every phrase checked in fusion().
        if stripped:
            malware.add(stripped)

# Drop names that are ordinary English words (present in word_set) from
# each vocabulary.
for vocab in (malware, software, actor):
    for name in list(vocab):
        if name in word_set:
            vocab.remove(name)


class Triple():
    """A lower-cased subject / relation / object triple with entity labels.

    Equality is fuzzy: two triples are equal when their three entity labels
    are identical and each of subject, relation and object matches
    approximately (see ``_phrase_match``).  The hash is built only from the
    entity labels, which is the part of equality that is exact, so equal
    triples always hash alike.
    """

    def __init__(self, sub, rel, obj, sub_ent='', rel_ent='', obj_ent='', sub_ent_conf=0, obj_ent_conf=0, conf=0):
        self.sub = sub.lower()
        self.rel = rel.lower()
        self.obj = obj.lower()
        self.sub_ent = sub_ent
        self.rel_ent = rel_ent
        self.obj_ent = obj_ent
        self.sub_ent_conf = sub_ent_conf
        self.obj_ent_conf = obj_ent_conf
        self.confidence = conf

    def __hash__(self):
        # The previous implementation read simple_sub/simple_rel/simple_obj,
        # attributes that are never assigned, so hashing always failed.
        # NOTE: the labels are mutable; changing them while the triple sits
        # in a set or dict invalidates its slot.
        return hash((self.sub_ent, self.rel_ent, self.obj_ent))

    def __repr__(self):
        return (f'Triple({self.sub!r}, {self.rel!r}, {self.obj!r}, '
                f'sub_ent={self.sub_ent!r}, obj_ent={self.obj_ent!r})')

    def set_raw_article_doc(self, doc):
        self.article_doc = doc

    def set_raw_sent(self, sent):
        self.raw_sent = sent

    def set_raw_triple(self, triple_obj):
        self.triple_obj = triple_obj

    def set_sent_ind(self, sent_ind):
        self.sent_ind = sent_ind

    def set_article_ref(self, article):
        self.article_id = article

    def set_tactic(self, tactic):
        self.tactic = tactic

    def set_tactic_conf(self, tactic_conf):
        self.tactic_conf = tactic_conf

    def set_behave_conf(self, behave_conf):
        self.behave_conf = behave_conf

    def set_relative_sent_ind(self, relative_sent_order):
        self.relative_sent_order = relative_sent_order

    def set_count(self, count):
        self.count = count

    def add_cluster(self, cluster):
        """Record *cluster* as one of the clusters this triple came from."""
        # Create the set lazily so earlier additions are kept; it used to be
        # re-created on every call, discarding everything added before.
        if not hasattr(self, 'from_cluster'):
            self.from_cluster = set()
        self.from_cluster.add(cluster)

    @staticmethod
    def _phrase_match(left, right, use_fuzzy):
        """Return True when two lower-cased phrases count as the same.

        Checks, in order: substring containment either way; optionally a
        fuzzywuzzy ratio above 90; finally, if every space-separated token
        of both phrases is in the word2vec vocabulary, a cosine distance
        below 0.3 between the mean token vectors.
        """
        if left in right or right in left:
            return True
        if use_fuzzy and fuzz.ratio(left, right) > 90:
            return True
        left_tokens = left.split(' ')
        right_tokens = right.split(' ')
        if all(tok in w2v.wv for tok in left_tokens) and all(tok in w2v.wv for tok in right_tokens):
            left_vec = np.mean([w2v.wv[tok] for tok in left_tokens], axis=0)
            right_vec = np.mean([w2v.wv[tok] for tok in right_tokens], axis=0)
            return bool(spatial.distance.cosine(left_vec, right_vec) < 0.3)
        return False

    def __eq__(self, other):
        # Duck-typed so objects lacking the triple attributes compare
        # unequal instead of raising AttributeError.
        try:
            other_labels = (other.sub_ent, other.rel_ent, other.obj_ent)
            other_sub, other_rel, other_obj = other.sub, other.rel, other.obj
        except AttributeError:
            return NotImplemented

        # Cheap exact check first; the phrase comparison may hit word2vec.
        if (self.sub_ent, self.rel_ent, self.obj_ent) != other_labels:
            return False

        # Relations skip the fuzzy-ratio step, as before.
        return (self._phrase_match(self.sub, other_sub, True)
                and self._phrase_match(self.rel, other_rel, False)
                and self._phrase_match(self.obj, other_obj, True))

    def print(self):
        print(f'{self.sub}({self.sub_ent}) >> {self.rel} >> {self.obj}({self.obj_ent})')


def bert_ner(ann):
    """Run the module-level NER ``model`` over an annotated document.

    Each sentence of ``ann`` is turned into a list of its tokens' ``value``
    fields and the batch is passed to ``model.predict`` with
    ``split_on_space=False``.

    Returns the ``(predictions, raw_outputs)`` pair produced by the model.
    """
    sentences_as_tokens = []
    for sentence in ann.sentence:
        sentences_as_tokens.append([token.value for token in sentence.token])
    predictions, raw_outputs = model.predict(sentences_as_tokens, split_on_space=False)
    return predictions, raw_outputs


def simple_match(candidate, ioc_dic):
    """Classify a phrase with dictionary and IOC lookups.

    The phrase is lower-cased and stripped, then tested in this order:

    1. starts with ``cve``                      -> ``"CVE"``
    2. contains a name from ``mitre_groups``    -> ``"Known_Actor"``
    3. contains a name from ``mitre_malwares``  -> ``"Known_Malware"``
    4. equals an entry of ``programm_language`` -> ``"Programming_Language"``
    5. contains a placeholder key of ``ioc_dic`` -> the ``kind`` of the
       first result IOCParser returns for the value stored under that key

    Args:
        candidate: phrase to classify.
        ioc_dic: mapping of placeholder -> original IOC text, as returned
            by ``change_ioc``.

    Returns:
        The label string, or ``''`` when nothing matches.
    """
    candidate = candidate.lower().strip()
    if candidate.startswith("cve"):
        return "CVE"
    if any(group in candidate for group in mitre_groups):
        return "Known_Actor"
    if any(mal_name in candidate for mal_name in mitre_malwares):
        return "Known_Malware"
    if any(language == candidate for language in programm_language):
        return "Programming_Language"
    for placeholder, raw_value in ioc_dic.items():
        # `candidate` was lower-cased above, so the placeholder must be
        # lower-cased too; otherwise placeholders whose kind contains
        # upper-case letters can never be found.
        if placeholder.lower() in candidate:
            results = IOCParser(raw_value).parse()
            if len(results) > 0:
                return results[0].kind
    return ''


def process_triple_tokens(tokens, ann, predictions, raws, ioc_dic):
    """Simplify one side of an OpenIE triple and attach an entity label.

    Tokens whose POS tag starts with ``NN`` are always kept (lemmatised when
    the tag is in ``inflex``) and looked up in the NER output; other tokens
    are kept unless their tag is in ``redundant``.  The resulting phrase is
    then passed to ``simple_match``, whose non-empty result overrides the
    NER label and sets the confidence to 1.

    Args:
        tokens: token references carrying ``sentenceIndex`` / ``tokenIndex``.
        ann: annotated document; ``ann.sentence[i].token[j]`` must expose
            ``pos``, ``word`` and ``lemma``.
        predictions: per-sentence, per-token NER predictions (each a
            one-entry dict whose value is the tag).
        raws: raw model outputs with the same layout as ``predictions``.
        ioc_dic: placeholder -> IOC mapping forwarded to ``simple_match``.

    Returns:
        ``(phrase, entity_label, confidence)``, or ``(None, None, None)``
        when no token survives the filtering.
    """
    redundant = {'PRP$', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'DT'}
    inflex = {'VBD', 'VBG', 'VBP', 'VBZ', 'NNS', 'NNPS'}
    kept_words = []
    sub_entity = ''
    sub_ner_conf = 0
    for token_ref in tokens:
        sent_idx = token_ref.sentenceIndex
        tok_idx = token_ref.tokenIndex
        token_obj = ann.sentence[sent_idx].token[tok_idx]
        if token_obj.pos.startswith('NN'):
            ner_tag = 'O'
            try:
                ner_tag = list(predictions[sent_idx][tok_idx].values())[0]
                sub_ner_conf = max(
                    scipy.special.softmax(list(raws[sent_idx][tok_idx].values())[0][0]))
            except Exception:
                # Best effort: a token the model has no output for keeps
                # the values set so far.  `Exception` (not a bare except)
                # lets KeyboardInterrupt / SystemExit propagate.
                pass
            if ner_tag.startswith('B'):
                sub_entity = ner_tag.split('-')[1]
            if token_obj.pos in inflex:
                kept_words.append(token_obj.lemma)
            else:
                kept_words.append(token_obj.word)
        elif token_obj.pos not in redundant:
            kept_words.append(token_obj.word)

    if not kept_words:
        return None, None, None

    sub = " ".join(kept_words)
    simple_ent = simple_match(sub, ioc_dic)
    if simple_ent != '':
        # Dictionary / IOC matches take precedence over the model's label.
        sub_entity = simple_ent
        sub_ner_conf = 1
    return sub, sub_entity, sub_ner_conf


def extract_triples_with_ner(ann, predictions, raws, ioc_dic, doc_id=None, verbose=True):
    """Turn the OpenIE triples of an annotated document into ``Triple`` objects.

    Subject and object go through ``process_triple_tokens``; relation tokens
    are kept unless tagged ``RB`` and lemmatised when their tag is in
    ``inflex``.  Triples whose subject, object or relation ends up empty
    are skipped.  IOC placeholders are then replaced by their original text.

    Args:
        ann: annotated document exposing ``sentence[i].openieTriple``.
        predictions: NER predictions as returned by ``bert_ner``.
        raws: raw NER outputs as returned by ``bert_ner``.
        ioc_dic: placeholder -> original IOC text (see ``change_ioc``).
        doc_id: stored on every triple through ``set_article_ref``.
        verbose: when True (the default, matching the previous behaviour)
            print every raw triple and a ``"no rel"`` marker for triples
            dropped because of an empty relation.

    Returns:
        ``(triples, (subject_counter, object_counter), relation_counter)``.
    """
    inflex = {'VBD', 'VBG', 'VBP', 'VBZ', 'NNS', 'NNPS'}
    extracted = []
    sub_counter = Counter()
    obj_counter = Counter()
    relation_counter = Counter()

    # Placeholders are "<kind><index>", so one key can be a prefix of
    # another ("md51" / "md510").  Restoring the longest keys first stops
    # the short key from eating the front of the long one.
    ioc_items = sorted((ioc_dic or {}).items(), key=lambda kv: len(kv[0]), reverse=True)

    sentence_total = len(ann.sentence)
    for sent_ind, sent in enumerate(ann.sentence):
        for triple in sent.openieTriple:
            if verbose:
                print(triple)

            sub, sub_entity, sub_ner_conf = process_triple_tokens(
                triple.subjectTokens, ann, predictions, raws, ioc_dic)
            if not sub:
                continue

            obj, obj_entity, obj_ner_conf = process_triple_tokens(
                triple.objectTokens, ann, predictions, raws, ioc_dic)
            if not obj:
                continue

            rel_words = []
            for rel_token in triple.relationTokens:
                token_obj = ann.sentence[rel_token.sentenceIndex].token[rel_token.tokenIndex]
                if token_obj.pos == 'RB':
                    continue
                rel_words.append(token_obj.lemma if token_obj.pos in inflex else token_obj.word)
            if not rel_words:
                if verbose:
                    print("no rel")
                continue
            rel = " ".join(rel_words)

            for placeholder, original in ioc_items:
                sub = sub.replace(placeholder, original)
                obj = obj.replace(placeholder, original)
                rel = rel.replace(placeholder, original)

            tri = Triple(sub, rel, obj, sub_ent=sub_entity, obj_ent=obj_entity, sub_ent_conf=sub_ner_conf,
                         obj_ent_conf=obj_ner_conf, conf=triple.confidence)
            tri.set_sent_ind(sent_ind)
            # Position of the sentence within the document, in [0, 1).
            tri.set_relative_sent_ind(sent_ind / sentence_total)
            tri.set_article_ref(doc_id)
            tri.set_raw_triple(triple)
            extracted.append(tri)
            sub_counter.update([sub])
            obj_counter.update([obj])
            relation_counter.update([rel])
    return extracted, (sub_counter, obj_counter), relation_counter


def change_ioc(text):
    """Replace every IOC found by IOCParser with a placeholder.

    The placeholder for the n-th result (counting from 0) is the result's
    ``kind`` followed by ``n``; every occurrence of the result's ``value``
    in the text is replaced by it.

    Returns:
        ``(rewritten_text, mapping)`` where ``mapping`` maps each
        placeholder to the original IOC value.
    """
    placeholder_to_value = {}
    for index, hit in enumerate(IOCParser(text).parse()):
        placeholder = hit.kind + str(index)
        placeholder_to_value[placeholder] = hit.value
        text = text.replace(hit.value, placeholder)
    return text, placeholder_to_value

# Integer tactic id -> display name; draw_graph() uses it to label edges.
tactic_dic = dict(enumerate([
    'Initial Access',
    'Execution',
    'Defense Evasion',
    'Command and Control',
    'Privilege Escalation',
    'Persistence',
    'Lateral Movement',
    'DataLeak',
    'Exfiltration',
    'Impact',
]))


# Entity labels that filter_ner() accepts on both ends of a triple.
valid_ner = [
    'Known_Malware', 'PRO', 'PERSON',
    'filename', 'uri', 'md5',
    'ACTOR', 'Known_Actor', 'sha256',
    'URL', 'sha1', 'CVE',
]

def dedupe(article_extracts):
    """Remove duplicate triples within each sentence.

    Triples are grouped by ``sent_ind``.  A new triple is compared, in
    order, with the triples already kept for its sentence, and only the
    first one that compares equal is considered:

    * if the kept triple's subject and object are both at least as long as
      the new one's, the kept triple is dropped and the new one is added;
    * otherwise the new triple is discarded.

    Args:
        article_extracts: iterable of objects with ``sent_ind``, ``sub``,
            ``obj`` and an ``__eq__`` defining what a duplicate is.

    Returns:
        The surviving triples, grouped by sentence in order of first
        appearance.
    """
    per_sentence = {}
    for triple in article_extracts:
        bucket = per_sentence.setdefault(triple.sent_ind, [])

        keep = True
        for position, seen_triple in enumerate(bucket):
            if seen_triple == triple:
                if len(seen_triple.sub) >= len(triple.sub) and len(seen_triple.obj) >= len(triple.obj):
                    # Delete by position.  list.remove() would run __eq__
                    # again and, because that equality is fuzzy, could
                    # delete an earlier triple that merely resembles
                    # seen_triple.
                    del bucket[position]
                else:
                    keep = False
                break

        if keep:
            bucket.append(triple)

    unique = []
    for bucket in per_sentence.values():
        unique.extend(bucket)
    return unique
  
def _fuse_slot(trip, slot):
    """Apply the vocabulary lookups to one side (``'sub'`` or ``'obj'``) of *trip*.

    A malware name contained in the phrase replaces the phrase and labels
    it ``Known_Malware``; a contained software name labels it ``PRO``; a
    contained actor name labels it ``Known_Actor``.  The lookups run in
    that order, so a later match overrides the label of an earlier one.
    """
    ent_attr = slot + '_ent'
    conf_attr = slot + '_ent_conf'

    def _label(name):
        setattr(trip, ent_attr, name)
        setattr(trip, conf_attr, 1)

    # Sequential on purpose: once the phrase has been replaced, the
    # remaining malware names are tested against the replacement.
    for known in malware:
        if known in getattr(trip, slot):
            setattr(trip, slot, known)
            _label('Known_Malware')

    phrase = getattr(trip, slot)
    if any(known in phrase for known in software):
        _label('PRO')
    if any(known in phrase for known in actor):
        _label('Known_Actor')


def fusion(trip):
    """Overwrite subject / object labels of *trip* using the known-name vocabularies.

    A side whose current label is one of the deterministic IOC kinds in
    ``det_ner`` is left untouched.  The triple is modified in place;
    nothing is returned.
    """
    det_ner = ['filename', 'sha256', 'CVE', 'uri', 'md5', 'sha1']
    for slot in ('sub', 'obj'):
        if getattr(trip, slot + '_ent') not in det_ner:
            _fuse_slot(trip, slot)

def filter_ner(triples):
    """Keep the triples whose subject and object labels are both in ``valid_ner``.

    An empty label never qualifies.  Input order is preserved.
    """
    def _accepted(label):
        return label != '' and label in valid_ner

    return [t for t in triples if _accepted(t.sub_ent) and _accepted(t.obj_ent)]

def filter_conf(triples, behav_thresh=0.5, conf_thresh=0.8):
    """Drop triples with low extraction or behaviour confidence.

    Args:
        triples: iterable of objects exposing ``confidence`` and, for those
            that pass the first check, ``behave_conf``.
        behav_thresh: a triple whose ``behave_conf`` is below this is dropped.
        conf_thresh: a triple whose ``confidence`` is below this is dropped.
            Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        The surviving triples, in input order.
    """
    # `confidence` is tested first so that triples failing it are never
    # asked for `behave_conf`, which only exists after set_behave_conf().
    return [
        triple for triple in triples
        if not (triple.confidence < conf_thresh) and not (triple.behave_conf < behav_thresh)
    ]

def display_triples(triples_list):
    """Print one ``|``-separated line per triple.

    Columns: subject, relation, object, subject label, object label,
    subject label confidence, object label confidence, triple confidence.
    """
    for entry in triples_list:
        columns = (
            entry.sub, entry.rel, entry.obj,
            entry.sub_ent, entry.obj_ent,
            entry.sub_ent_conf, entry.obj_ent_conf,
            entry.confidence,
        )
        print(*columns, sep=' | ')


def build_graph(triples):
    """Build a ``networkx.MultiDiGraph`` from triples.

    Nodes are subject / object phrases with attributes ``appearance``
    (number of times the phrase occurred), ``ent`` (label from the first
    occurrence) and ``mentions`` (the triples it occurred in).

    Each distinct relation between an ordered pair of nodes gets its own
    parallel edge with attributes ``appearance``, ``relation`` and
    ``mentions``; a repeated relation increments the existing edge.

    Args:
        triples: iterable of objects with ``sub``, ``obj``, ``rel``,
            ``sub_ent`` and ``obj_ent``.

    Returns:
        The populated graph.
    """
    g = nx.MultiDiGraph()
    for triple in triples:
        for phrase, label in ((triple.sub, triple.sub_ent), (triple.obj, triple.obj_ent)):
            if phrase not in g.nodes:
                g.add_node(phrase, appearance=1, ent=label, mentions=[triple])
            else:
                g.nodes[phrase]['appearance'] += 1
                g.nodes[phrase]['mentions'].append(triple)

        # Check every parallel edge, not just key 0: otherwise a second
        # relation between the same nodes is added again each time it
        # recurs instead of being counted.
        existing = None
        if g.has_edge(triple.sub, triple.obj):
            for attrs in g[triple.sub][triple.obj].values():
                if attrs['relation'] == triple.rel:
                    existing = attrs
                    break

        if existing is None:
            g.add_edge(triple.sub, triple.obj, appearance=1, relation=triple.rel, mentions=[triple])
        else:
            existing['appearance'] += 1
            existing['mentions'].append(triple)
    return g

def extract_subgraph(g, nodes):
    """Take the subgraph induced by *nodes* and summarise its edge mentions.

    Returns:
        ``(sg, tactics, articles, linking_nodes)`` where

        * ``sg`` is ``g.subgraph(nodes)``;
        * ``tactics`` is the set of all values found in the ``tactic``
          attribute of the mentions on the subgraph's edges;
        * ``articles`` is the set of ``article_id`` values of those mentions;
        * ``linking_nodes`` is the set of edge endpoints whose incident
          edges mention more than one article.
    """
    sg = g.subgraph(nodes)
    sg_tactics = set()
    sg_articles = set()
    articles_by_node = {}

    for source, target, attrs in sg.edges(data=True):
        mentions = attrs['mentions']
        mention_articles = {mention.article_id for mention in mentions}
        sg_articles |= mention_articles
        for mention in mentions:
            sg_tactics.update(mention.tactic)
        for endpoint in (source, target):
            articles_by_node.setdefault(endpoint, set()).update(mention_articles)

    sg_linking_nodes = {
        node for node, article_ids in articles_by_node.items() if len(article_ids) > 1
    }
    return sg, sg_tactics, sg_articles, sg_linking_nodes

def draw_graph(g, scheme=False, legend=False):
    """Draw *g* with matplotlib using a spring layout.

    Nodes are labelled with their name and, unless ``scheme`` is true,
    their ``ent`` attribute on a second line; node size is proportional to
    degree.  Every edge is labelled
    ``(<tactic names>)-<relation>-<article ids>``.

    Args:
        g: graph whose nodes carry ``ent`` and whose edges carry
            ``mentions`` and ``relation`` (as produced by ``build_graph``).
        scheme: when true, omit the entity label from node captions.
        legend: currently unused; the legend code is disabled.
    """
    node_deg = nx.degree(g)
    layout = nx.spring_layout(g, k=2, iterations=80)
    plt.figure(num=None, figsize=(50, 50), dpi=80)

    if scheme:
        node_labels = {name: f"{name}" for name, _attrs in g.nodes(data=True)}
    else:
        node_labels = {name: f"{name}\n({attrs['ent']})" for name, attrs in g.nodes(data=True)}

    node_sizes = [int(degree) * 1000 for _name, degree in node_deg]
    label_box = dict(facecolor="skyblue", edgecolor='black', boxstyle='round,pad=0.1')
    nx.draw_networkx(
        g,
        pos=layout,
        labels=node_labels,
        with_labels=True,
        node_size=node_sizes,
        node_shape="s",
        node_color='white',
        edge_color='red',
        arrowsize=10,
        linewidths=1.5,
        font_size=13,
        bbox=label_box,
    )

    edge_labels = {}
    all_mentions = []
    for source, target, attrs in g.edges(data=True):
        mentions = attrs['mentions']
        article_part = ','.join(str(mention.article_id) for mention in mentions)

        tactic_names = set()
        for mention in mentions:
            if len(mention.tactic) > 0:
                tactic_names.update(tactic_dic[tac] for tac in mention.tactic)
        tactic_part = '(' + ','.join(tactic_names) + ')'

        # Keyed by the node pair only: for parallel edges the label of the
        # last edge visited replaces the earlier ones.
        edge_labels[(source, target)] = '-'.join([tactic_part, attrs['relation'], article_part])
        all_mentions.extend(mentions)

    # Article ids behind the drawn edges; only the disabled legend below
    # would use them.
    descriptions = {mention.article_id for mention in all_mentions}

    #texts = ['::'.join([str(x), articles.loc[x]['title'], articles.loc[x]['date']]) for x in descriptions]
    nx.draw_networkx_edge_labels(
        g,
        pos=layout,
        edge_labels=edge_labels,
        font_color='red',
        font_size=12,
    )
    #if legend:
        #plt.legend(texts, fontsize=10)

# NER model used by bert_ner(); CUDA is disabled.
# NOTE(review): "nermodel" is presumably a local model directory - confirm.
model = NERModel("roberta", "nermodel", use_cuda=False)

In [13]:
#read csv
df = pd.read_csv('top10_tor_data_frame.csv')

# Resolve coreferences one article (hardcode_id) at a time: the article's
# lines are joined with a '<SEP>' marker, resolved as a single text, then
# split on the marker again.
from tqdm import tqdm

# Rows are collected in a list and turned into a DataFrame once.
# DataFrame.append inside the loop copied the whole frame on every call
# and no longer exists in pandas >= 2.0.
coref_rows = []
for i in tqdm(df.hardcode_id.unique()):
    chunk_df = df[df.hardcode_id == i]
    fulltext = '<SEP>'.join(chunk_df.line.values)
    fulltext = nlp(fulltext)._.coref_resolved
    for line in fulltext.split('<SEP>'):
        coref_rows.append({'hardcode_id': i, 'line': line})

coref_resolved_df = pd.DataFrame(coref_rows, columns=['hardcode_id', 'line'])

100%|██████████| 98/98 [01:15<00:00,  1.29it/s]


In [15]:
# Save the coref-resolved lines as CSV.  encoding='utf-8-sig' is used to
# avoid problems with Chinese characters; index=False leaves out the
# leading index column.
coref_resolved_df.to_csv(
    'tor_top10_coref_resolved.csv',
    index=False,
    encoding='utf-8-sig',
)

In [None]:
# Scratch cell: opens a CoreNLP client (OpenIE pipeline, endpoint on port
# 9079) and, for every (doc_id, text) pair, replaces IOCs with placeholders
# and resolves coreferences.  The loop only rebinds `text`; `client` is
# never called and nothing is stored.
# NOTE(review): `doc_ids` and `texts` are not defined in the visible part
# of the notebook - they presumably come from an earlier session.
with CoreNLPClient(
        properties={'annotators': 'tokenize,mwt,ssplit,pos,lemma,ner,parse,depparse,natlog,openie',
                            'coref.algorithm' : 'neural',
                            # boundary-token regex is set to the empty string
                            'ssplit.boundaryTokenRegex': '''''',
                            'openie.resolve_coref': False, 
                            'openie.triple.strict': True, 
                            'openie.triple.all_nominals':False,
                            'openie.affinity_probability_cap': 1.0},
        timeout=30000,
        memory='4G',endpoint="http://localhost:9079") as client:    

        for doc_id, text in tzip(doc_ids, texts):
            # IOC -> placeholder substitution, then neuralcoref resolution.
            text, ioc_dic = change_ioc(text)
            text = nlp(text)._.coref_resolved;

In [None]:
# Debug helper: print the index of every sentence in the most recent
# annotation held in the notebook-level variable `ann`.
for sent_ind, sent in enumerate(ann.sentence):
    print(sent_ind)

In [None]:
# Work on the first ten coref-resolved lines only.
sentences = coref_resolved_df['line'].values[:10]

In [None]:
#extract triples
# One entry per processed sentence is appended to each of these lists.
extracted_list_list=[]
entity_count_list=[]
rel_count_list=[]
# NOTE(review): this cell uses port 9078 while the scratch cell above uses
# 9079 - confirm which one is intended.
with CoreNLPClient(
        properties={'annotators': 'tokenize,mwt,ssplit,pos,lemma,ner,parse,depparse,natlog,openie',
                            'coref.algorithm' : 'neural',
                            # boundary-token regex is set to the empty string
                            'ssplit.boundaryTokenRegex': '''''',
                            'openie.resolve_coref': False, 
                            'openie.triple.strict': True, 
                            'openie.triple.all_nominals':False,
                            'openie.affinity_probability_cap': 1.0},
        timeout=30000,
        memory='4G',endpoint="http://localhost:9078") as client:
    for text in sentences:
        # Replace IOCs with placeholders before annotation; ioc_dic maps
        # the placeholders back to the original values.
        text, ioc_dic = change_ioc(text)
        ann = client.annotate(text)
        ner, raw_out = bert_ner(ann)      
        # Every sentence is extracted with the same literal doc_id of 1.
        extracted_list, entity_count, rel_count = extract_triples_with_ner(ann, ner, raw_out, ioc_dic, doc_id = 1)
        extracted_list_list.append(extracted_list)
        entity_count_list.append(entity_count)
        rel_count_list.append(rel_count)
print('done');

In [None]:
# Show results: print every extracted triple that has an entity label on
# at least one side, one '|'-separated line per triple.
for extracted_list in extracted_list_list:
    for e in extracted_list:
        if e.sub_ent == '' and e.obj_ent == '':
            continue
        print(e.sub, e.rel, e.obj, e.sub_ent, e.obj_ent,
              e.sub_ent_conf, e.obj_ent_conf, e.confidence, sep=' | ')
