In [114]:
from pathlib import Path
import json

network = open("./data/jsons/Network/networks.json", 'r', encoding='utf-8')
microservices = open("./data/jsons/Microservices/microservices.json", 'r', encoding='utf-8')

network_documents = json.load(network)
microservices_documents = json.load(microservices)

network.close()
microservices.close()

documents = {}
documents["network"] = network_documents
documents["microservices"] = microservices_documents
network_documents[17]

{'id': None,
 'doi': '10.1109/TrustCom50675.2020.00150',
 'authors': 'Jehanzaib Yousuf Muhammad;Mingjun Wang;Zheng Yan;Fawad Khan',
 'documentTitle': 'Trusted Network Slicing Among Multiple Mobile Network Operators',
 'publicationTitle': '2020 IEEE 19th International Conference on Trust, Security and Privacy in Computing and Communications (TrustCom)',
 'workAbstract': '5G mobile networks are expected to be much bigger in size, faster in speeds and better in scalability, providing varied services to different users and businesses in contrast to previous networks. 5G will also help enabling new business models and use cases. “Network Slicing” is a driving architectural concept for multi-tenancy. Network Slicing enables Mobile Network Operators (MNOs) to deploy different services over shared physical infrastructure, increasing inter-operator resource sharing. As 5G is still in its nascent, inter operator cooperation is an area that requires immediate attention of research. Traditional in

In [16]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import torch
from torch.nn.functional import one_hot

class BERTComponent:
    tokenizer = None
    bert_model = None

    def __init__(self, model):
        self.bert_vector_size = 3072
        self.sent_vector_size = 768
        self.model = model
        self.tokenizer = BERTComponent.tokenizer if BERTComponent.tokenizer else BertTokenizer.from_pretrained(model)
        BERTComponent.tokenizer = self.tokenizer
        self.bert_model = BERTComponent.bert_model if BERTComponent.bert_model else BertModel.from_pretrained(model)
        BERTComponent.bert_model = self.bert_model
        self.bert_model.eval()


    def get_bert_spans(self, words, bert_tokens):
        if self.model == 'bert-large-uncased':
            words = [self._flat_word(word) for word in words]

        i = 0
        j = 1
        idx = 0

        bert_words_indexes = []
        bert_words = []
        while i < len(words):
            word = words[i]

            bert_word = bert_tokens[j]
            bert_word = bert_word[2:] if bert_word.startswith("##") else bert_word
            bert_word = bert_word[idx:]

            #Spacing control
            if word in [" ", "  ", "   "]:
                bert_words.append([word])
                bert_words_indexes.append([-1])

            #When the current word is [UNK] for bert
            elif bert_word == "[UNK]":
                bert_words.append(["[UNK]"])
                bert_words_indexes.append([j])
                j += 1
                idx = 0

            #When the current word is contained in bert token. Very weird
            elif len(word) < len(bert_word) and bert_word.find(word) >= 0:
                bert_words.append([bert_word])
                bert_words_indexes.append([j])

                idx = bert_word.find(word) + len(word)
                if idx == len(bert_word):
                    j += 1
                    idx = 0

            #Otherwise
            else:
                k = 0
                span = []
                span_indexes = []

                while k < len(word):
                    if word.find(bert_word, k) == k:
                        span.append(bert_word)
                        span_indexes.append(j)
                        k += len(bert_word)
                        j += 1
                        idx = 0
                        bert_word = bert_tokens[j]
                        bert_word = bert_word[2:] if bert_word.startswith("##") else bert_word
                    else:
                        print("Error")
                        return bert_words, bert_words_indexes

                bert_words.append(span)
                bert_words_indexes.append(span_indexes)

            i += 1

        assert len(bert_words_indexes) == len(words)

        return bert_words, bert_words_indexes

    def _flat_word(self, word):
        word = word.lower()
        word = word.replace("ñ", "n")
        word = word.replace("á", "a")
        word = word.replace("é", "e")
        word = word.replace("í", "i")
        word = word.replace("ó", "o")
        word = word.replace("ú", "u")
        word = word.replace("ä", "a")
        word = word.replace("ü", "u")
        word = word.replace("ö", "o")
        word = word.replace("ū", "u")
        word = word.replace("ā", "a")
        word = word.replace("ī", "i")
        word = word.replace("ș", "s")
        word = word.replace("ã", "a")
        word = word.replace("ô", "o")

        return word

    def _sum_merge(self, vectors):
        return torch.sum(torch.stack(vectors), dim=0)

    def _mean_merge(self, vectors):
        return torch.mean(torch.stack(vectors), dim=0)

    def _last_merge(self, vectors):
        return vectors[-1]

    def _get_merge_tensors(self, token_vec_sums, words_indexes):
        pad_tensor = torch.zeros(self.bert_vector_size)
        real_vec = []
        for word_indexes in words_indexes:
            vectors = [(token_vec_sums[idx] if idx != -1 else pad_tensor) for idx in word_indexes]
            real_vec.append(self._mean_merge(vectors))

        return real_vec

    def get_bert_embeddings(self, sentence, spans):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        tokenized_sentence = ['[CLS]'] + tokenized_sentence + ['[SEP]']
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        segments_ids = [1] * len(tokenized_sentence)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        with torch.no_grad():
            encoded_layers = self.bert_model(tokens_tensor, segments_tensors, output_hidden_states=True)

        #print("This is enconded layers: ", len(encoded_layers.hidden_states))
        
        token_embeddings = torch.stack(encoded_layers.hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        token_vec_sums = []
        for token in token_embeddings:
            cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=-1)
            token_vec_sums.append(cat_vec)

        words = [sentence[beg:end] for (beg, end) in spans]
        bert_words, bert_words_indexes = self.get_bert_spans(words, tokenized_sentence)

        bert_embeddings = self._get_merge_tensors(token_vec_sums, bert_words_indexes)
        sentence_embedding = torch.mean(torch.stack(token_vec_sums), dim=0)
        
        return bert_embeddings, sentence_embedding

In [24]:
from nltk.tokenize import TreebankWordTokenizer as twt
spans = twt().span_tokenize("Hi this is a test")
bert = BERTComponent('bert-large-uncased')
q_embeddings, q_sent_embedding = bert.get_bert_embeddings("Hi this is a test", spans)
q_sent_embedding

tensor([ 0.0484,  0.1071, -0.1547,  ...,  0.3583,  0.2960,  0.1679])

In [25]:
doc_spans = twt().span_tokenize("Hi this is a documment for test")
doc_bert = BERTComponent('bert-large-uncased')
doc_embeddings, doc_sent_embedding = bert.get_bert_embeddings("Hi this is a documment for test", spans)
doc_sent_embedding

tensor([ 0.0943, -0.1501,  0.1642,  ..., -0.0924,  0.0940,  0.3068])

In [26]:
dot = torch.dot(q_sent_embedding, doc_sent_embedding)
dot

tensor(728.5737)

In [105]:
from nltk.tokenize import TreebankWordTokenizer as twt
import torch
import torch.nn as nn
import torch.nn.functional as F

class DocumentRanker:
    
    def __init__(self, documents):
        self.documents = documents
        self.bert = BERTComponent('bert-large-uncased')
    
    def __get_info_rep(self, document):
        pass
    
    def __get_embedding(self, text):
        spans = twt().span_tokenize(text)
        text_word_embeddings, text_embedding = self.bert.get_bert_embeddings(text, spans)
        return text_embedding
    
    def get_related_documents(self, query, number_of_documents):
        index = {}
        last = 0
        related_documents = []
        
        q_sent_embedding = self.__get_embedding(query)
        
        for document in self.documents:
            abstract = document['workAbstract']
            abstract_embedding = self.__get_embedding(abstract)
            index[last] = torch.dot(q_sent_embedding, abstract_embedding)
            last += 1
        
        doc_scores = list(index.items())
        doc_scores = [(x[0], x[1].tolist()) for x in doc_scores]
        doc_scores= sorted(doc_scores, key = lambda x: x[1], reverse=True)
        #print("Scores: ", scores)
#         probs = F.softmax(scores, dim=0)
#         probs = [t.tolist() for t in probs]
#         probs.sort(reverse=True)
        #print("Probs: ", probs)
        print(doc_scores)
        
        if number_of_documents > len(doc_scores):
            return doc_scores
        else:
            return doc_scores[:number_of_documents]
        

In [None]:
docs = network_documents[:50]
ranker = DocumentRanker(docs)

In [118]:
related_docs = ranker.get_related_documents("Intel SGX based multi-MNO cooperation scheme for trusted, dynamic and efficient network slice sharing in order to support inter-operator trustworthy collaboration", 10)
related_docs

[(9, 749.80615234375), (16, 739.1793823242188), (42, 733.2034301757812), (0, 733.1528930664062), (32, 731.5643310546875), (14, 729.9841918945312), (23, 729.164306640625), (19, 727.5703125), (17, 725.63818359375), (3, 723.998779296875), (4, 722.0805053710938), (12, 722.0667724609375), (8, 721.5213623046875), (38, 718.4793090820312), (7, 717.9136352539062), (36, 717.0819702148438), (44, 716.21533203125), (25, 716.06005859375), (31, 715.6558227539062), (48, 715.2150268554688), (40, 715.1771850585938), (1, 714.4179077148438), (5, 712.82421875), (2, 712.6151123046875), (49, 711.911376953125), (41, 710.5901489257812), (26, 709.453857421875), (45, 709.44384765625), (13, 709.2012939453125), (33, 708.664794921875), (21, 706.9766845703125), (47, 703.2574462890625), (34, 702.8947143554688), (28, 701.8461303710938), (27, 701.268310546875), (6, 700.61767578125), (22, 699.0571899414062), (15, 697.817138671875), (43, 695.0389404296875), (11, 692.7727661132812), (46, 691.4434204101562), (24, 691.39514

[(9, 749.80615234375),
 (16, 739.1793823242188),
 (42, 733.2034301757812),
 (0, 733.1528930664062),
 (32, 731.5643310546875),
 (14, 729.9841918945312),
 (23, 729.164306640625),
 (19, 727.5703125),
 (17, 725.63818359375),
 (3, 723.998779296875)]

In [101]:
docs[14]

{'id': None,
 'doi': '10.1109/NOMS.2016.7502883',
 'authors': 'Joris Claassen;Ralph Koning;Paola Grosso',
 'documentTitle': 'Linux containers networking: Performance and scalability of kernel modules',
 'publicationTitle': 'NOMS 2016 - 2016 IEEE/IFIP Network Operations and Management Symposium',
 'workAbstract': 'Linux container virtualisation is gaining momentum as lightweight technology to support cloud and distributed computing. Applications relying on container architectures might at times rely on inter-container communication, and container networking solutions are emerging to address this need. Containers can be networked together as part of an overlay network, or with actual links from the container to the network via kernel modules. Most overlay solutions are not quite production ready yet; on the other hand kernel modules that can link a container to the network are much more mature. We benchmarked three kernel modules: veth, macvlan and ipvlan, to quantify their respective ra