In [1]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
class NerEncoder():
    def __init__(self):
        self.ner_dict = {}
        self.label_index = 0
        
        
    def translate(self, label, word):
        self.label_index += 1
        if label == 'NORP':
            self.ner_dict["Group" + str(self.label_index)] = word
            return "Group" + str(self.label_index)
        if label == 'FAC':
            self.ner_dict["Place" + str(self.label_index)] = word
            return "Place" + str(self.label_index)
        if label == 'ORG':
            self.ner_dict["Organization" + str(self.label_index)] = word
            return "Organization" + str(self.label_index)
        if label == 'GPE':
            self.ner_dict["State" + str(self.label_index)] = word
            return "State" + str(self.label_index)
        if label == 'LOC':
            self.ner_dict["Location" + str(self.label_index)] = word
            return "Location" + str(self.label_index)
        if label == 'PERCENT':
            self.ner_dict["Percentage" + str(self.label_index)] = word
            return "Percentage" + str(self.label_index)
        if label == 'ORDINAL':
            self.ner_dict["Number" + str(self.label_index)] = word
            return "Number" + str(self.label_index)
        if label == 'CARDINAL':
            self.ner_dict["first" + str(self.label_index)] = word
            return "first" + str(self.label_index)
        else:
            self.ner_dict[label + str(self.label_index)] = word
            return label + str(self.label_index)
        
    
    def encode(self, doc):
        res = ""
        b = False
        for i, token in enumerate(doc):
            for ent in doc.ents:
                if i == ent.start:
                    b = True
                    res += self.translate(ent.label_, ent.text) + " "
                if i == ent.end:
                    b = False
            if b==False:
                res += token.text +" "
        return res
    def decode(self, doc):
        res = ""
        for token in doc:
            if token.text in self.ner_dict:
                res += self.ner_dict[token.text] + " "
            else:
                res += token.text + " "
        return res

In [3]:
doc = nlp(u'ali ahmad ahmad loves Massachusetts Institute of Technology, he lives in the United States of America')

In [4]:
encoder = NerEncoder()
print(encoder.encode(doc))
print(encoder.decode(nlp(encoder.encode(doc))))

PERSON1 loves Organization2 , he lives in State3 
ali ahmad ahmad loves Massachusetts Institute of Technology , he lives in the United States of America 


In [5]:
from spacy import displacy 
print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")
for token in doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))
displacy.render(doc, style='dep')

Node (from)-->   Relation     -->Node (to)

loves             nsubj                 he
loves              ROOT              loves
job                poss                his
loves              dobj                job


In [6]:
simplified_doc = nlp(encoder.encode(doc))
from spacy import displacy 
print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")
for token in simplified_doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))
displacy.render(simplified_doc, style='dep')

Node (from)-->   Relation     -->Node (to)

loves             nsubj                 he
loves              ROOT              loves
job                poss                his
loves              dobj                job


In [50]:
from nltk.corpus import wordnet as wn
x = wn.synsets('love')
for i, synset in enumerate(x):
    print("synset_" + str(i))
    print(synset.definition())
    print("synonyms: ")
    for l in synset.lemmas():
        print(l.name())
    if len(synset.lemmas()[0].antonyms()) > 0:
        print("antonynom is:", synset.lemmas()[0].antonyms()[0])
    if len(synset.examples()) > 0:
        print(synset.examples())
    else:
        print("no examples")
    print('***********************')

synset_0
a strong positive emotion of regard and affection
synonyms: 
love
antonynom is: Lemma('hate.n.01.hate')
['his love for his work', 'children need a lot of love']
***********************
synset_1
any object of warm affection or devotion
synonyms: 
love
passion
['the theater was her first love', 'he has a passion for cock fighting']
***********************
synset_2
a beloved person; used as terms of endearment
synonyms: 
beloved
dear
dearest
honey
love
no examples
***********************
synset_3
a deep feeling of sexual desire and attraction
synonyms: 
love
sexual_love
erotic_love
['their love left them indifferent to their surroundings', 'she was his first love']
***********************
synset_4
a score of zero in tennis or squash
synonyms: 
love
['it was 40 love']
***********************
synset_5
sexual activities (often including sexual intercourse) between two people
synonyms: 
sexual_love
lovemaking
making_love
love
love_life
['his lovemaking disgusted her', "he hadn't had 

In [8]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import nltk
import torch
from scipy.spatial.distance import cosine

In [9]:
# Loading the pre-trained BERT model
###################################
# Embeddings will be derived from
# the outputs of this model
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

# Setting up the tokenizer
###################################
# This is the same tokenizer that
# was used in the model to generate 
# embeddings to ensure consistency
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [11]:
def dep_encoder(doc, ind):
    res = ""
    allowed = ["nsubj", "dobj", "cobj", "ROOT", "amod", "det", "poss"]
    for i, token in enumerate(doc):
        if str(token.head.text) == str(doc[ind]):
            if str(token.dep_) in allowed:
                res += str(token.dep_) + " "
    return res

def dep_encoder_similarity(dep_1, dep_2):
    res = 0
    dep1 = dep_1
    dep2 = dep_2
    if len(dep1) > len(dep2):
        dep1 = dep_2
        dep2 = dep_1
    for i, word in enumerate(dep1):
        if word == dep2[i]:
            res += 1
    return res/(max(len(dep1), len(dep2)))

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

def pos_extractor(pos):
    if pos == "NOUN":
        return "n"
    if pos == "VERB":
        return "v"
    if pos == "ADJ":
        return "a"
    if pos == "ADV":
        return "r"
    else:
        return "o"
    
    
def get_index(sent, words):
    ps = PorterStemmer()
    words = [ps.stem(w) for w in words]
    for i, word in enumerate(sent.split()):
        if ps.stem(word) in words:
            return i

In [89]:
def all_subsets(lst1, lst2):
    res = []
    for item1 in lst1:
        for item2 in lst2:
            f = item1.copy()
            f.append(item2)
            res.append(f)
    return res

def paraphrase(to_be_replaced):
    subsets = []
    for i in range(len(to_be_replaced[0])):
        subsets.append([i])
    for i, replace in enumerate(to_be_replaced):
        if i == 0:
            continue
        a = [i for i in range(len(replace))]
        subsets = all_subsets(subsets, a)
    res = []
    for subset in subsets:
        text = ""
        score = 1.0
        for i, word_index in enumerate(subset):
            text += to_be_replaced[i][word_index][0]
            text += " "
            score*=to_be_replaced[i][word_index][1]
        sent = tuple((text, score))
        res.append(sent)
    sorted_list = sorted(res, key=lambda x: x[1], reverse = True) 
    return sorted_list
        

#print(all_subsets([[0, 1], [0, 2]], [0, 1, 2]))

print(paraphrase([[tuple(("b", 0.2))], [tuple(("c", 0.2)), tuple(("d", 0.2))]]))

[('b c ', 0.04000000000000001), ('b d ', 0.04000000000000001)]


In [91]:
to_be_replaced = []
doc = nlp("he can achieve his goal")
# doc = nlp("he goes to work every day")
# doc = nlp("he loves playing football")
# doc = nlp("this process includes many types of activities")
# doc = nlp("do you like to have a cup of tea")
# doc = nlp("hw loves his job")

#doc = nlp("I want to win this game")
doc = nlp(encoder.encode(doc))

for i, token in enumerate(doc):
    
    if pos_extractor(token.pos_) == "o":
        to_be_replaced.append([tuple((token.text, 1.0))])
        continue
    synsets = wn.synsets(token.text, pos = pos_extractor(token.pos_))
    to_be_appended = [tuple((token.text, 1.0))]
    new_words = set(token.text)
    new_words.add(token.lemma_)
    for synset in synsets:
        names = [x.name() for x in synset.lemmas()]        
        for example in synset.examples():
            ind_head = get_index(encoder.encode(nlp(example)), names)
            if ind_head is not None:
                dep_example = dep_encoder(nlp(encoder.encode(nlp(example))), ind_head)
                dep_original = dep_encoder(doc, i)
                if len(dep_original) > 1:
                    simi = dep_encoder_similarity(dep_example, dep_original)
                    if simi > 0.5:
                        for name in names:
                            if name not in new_words:
                                new_words.add(name)
                                to_be_appended.append(tuple((name, simi)))
    to_be_replaced.append(to_be_appended)
print(to_be_replaced)
print(paraphrase(to_be_replaced))

[[('he', 1.0)], [('can', 1.0)], [('achieve', 1.0), ('accomplish', 1.0), ('attain', 1.0), ('reach', 1.0)], [('his', 1.0)], [('goal', 1.0), ('finish', 1.0), ('destination', 1.0)]]
[('he can achieve his goal ', 1.0), ('he can achieve his finish ', 1.0), ('he can achieve his destination ', 1.0), ('he can accomplish his goal ', 1.0), ('he can accomplish his finish ', 1.0), ('he can accomplish his destination ', 1.0), ('he can attain his goal ', 1.0), ('he can attain his finish ', 1.0), ('he can attain his destination ', 1.0), ('he can reach his goal ', 1.0), ('he can reach his finish ', 1.0), ('he can reach his destination ', 1.0)]


In [104]:
to_be_replaced = []
doc = nlp("he insisted to achieve his goal")
doc = nlp("he loves his job")

doc = nlp(encoder.encode(doc))
tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(doc.text, tokenizer)
list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
for i, token in enumerate(doc):
    if pos_extractor(token.pos_) == "o":
        to_be_replaced.append([tuple((token.text, 1.0))])
        continue
    if tokenized_text.count(token.text) > 0:
        word_index = tokenized_text.index(token.text)
        word_embedding = list_token_embeddings[word_index]
    else:
        to_be_replaced.append([tuple((token.text, 1.0))])
        continue
    synsets = wn.synsets(token.text, pos = pos_extractor(token.pos_))
    to_be_appended = [tuple((token.text, 1.0))]
    new_words = set(token.text)
    new_words.add(token.lemma_)
    for synset in synsets:
        names = [x.name() for x in synset.lemmas()]
        for name in names:
            context = doc.text.split(' ')
            context[i] = name
            tokenized_text_example, tokens_tensor_example, segments_tensors_example = bert_text_preparation(str(context), tokenizer)
            list_token_embeddings_example = get_bert_embeddings(tokens_tensor_example, segments_tensors_example, model)
            if tokenized_text_example.count(name) > 0:
                word_index_example = tokenized_text_example.index(name)
                word_embedding_example = list_token_embeddings_example[word_index_example]
                if name not in new_words:
                    to_be_appended.append(tuple((name, 1 - cosine(word_embedding, word_embedding_example))))

    to_be_replaced.append(to_be_appended)
print(to_be_replaced)
print(paraphrase(to_be_replaced))

[[('he', 1.0)], [('loves', 1.0), ('enjoy', 0.43151522449495616), ('know', 0.39090325277610716), ('screw', 0.3693842734242532), ('fuck', 0.3747362351408903), ('jazz', 0.3108570027979174), ('bed', 0.3044436655213504), ('bang', 0.3327514086942118)], [('his', 1.0)], [('job', 1.0), ('occupation', 0.4365978158609427), ('business', 0.4491671778335864), ('line', 0.33719778558596203), ('task', 0.4419834812851823), ('problem', 0.40334215120539607)]]
[('he loves his job ', 1.0), ('he loves his business ', 0.4491671778335864), ('he loves his task ', 0.4419834812851823), ('he loves his occupation ', 0.4365978158609427), ('he enjoy his job ', 0.43151522449495616), ('he loves his problem ', 0.40334215120539607), ('he know his job ', 0.39090325277610716), ('he fuck his job ', 0.3747362351408903), ('he screw his job ', 0.3693842734242532), ('he loves his line ', 0.33719778558596203), ('he bang his job ', 0.3327514086942118), ('he jazz his job ', 0.3108570027979174), ('he bed his job ', 0.30444366552135

In [105]:
def find_closest_embeddings(embedding):
    words = sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.cosine(embeddings_dict[word], embedding))
    dists = [1 - spatial.distance.cosine(embeddings_dict[word], embedding) for word in words]
    return words[:5], dists[:5]

In [23]:
import nltk
from nltk.corpus import stopwords
from scipy import spatial
stop_words = set(stopwords.words('english'))

In [34]:
doc = nlp("he insisted to achieve his goal")
doc = nlp("the king loves his job")

doc = nlp(encoder.encode(doc))
embeddings_dict = {}
line = doc.text
with open("glove.6B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [109]:
to_be_replaced = []
print(doc)
for word in doc.text.split(' '):
    if word not in stop_words:
        if word in embeddings_dict:
            to_be_appended = find_closest_embeddings(embeddings_dict[word])
            lst = []
            for i in range(len(to_be_appended[0])):
                lst.append(tuple((to_be_appended[0][i], to_be_appended[1][i])))
        else:
            lst = [tuple((word, 1.0))]
    else:
        lst = [tuple((word, 1.0))]

    
    to_be_replaced.append(lst)

he loves his job 


In [110]:
print(to_be_replaced)

[[('he', 1.0)], [('loves', 1.0), ('hates', 0.7345002889633179), ('likes', 0.6868317127227783), ('everybody', 0.6422836184501648), ('love', 0.6420262455940247)], [('his', 1.0)], [('job', 1.0), ('jobs', 0.757643461227417), ('doing', 0.6103168725967407), ('work', 0.60713791847229), ('working', 0.6018442511558533)], [('', 1.0)]]


In [111]:
print(paraphrase(to_be_replaced))

[('he loves his job  ', 1.0), ('he loves his jobs  ', 0.757643461227417), ('he hates his job  ', 0.7345002889633179), ('he likes his job  ', 0.6868317127227783), ('he everybody his job  ', 0.6422836184501648), ('he love his job  ', 0.6420262455940247), ('he loves his doing  ', 0.6103168725967407), ('he loves his work  ', 0.60713791847229), ('he loves his working  ', 0.6018442511558533), ('he hates his jobs  ', 0.5564893412027061), ('he likes his jobs  ', 0.5203735561080407), ('he everybody his jobs  ', 0.4866219837722525), ('he love his jobs  ', 0.4864269869107005), ('he hates his doing  ', 0.4482779192814945), ('he hates his work  ', 0.44594297655848436), ('he hates his working  ', 0.4420547763848859), ('he likes his doing  ', 0.4191849829092291), ('he likes his work  ', 0.4170015764032655), ('he likes his working  ', 0.41336571781373266), ('he everybody his doing  ', 0.39199652933262286), ('he love his doing  ', 0.3918394503359721), ('he everybody his work  ', 0.3899547391746836), ('