In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [None]:
!pip install pytorch-pretrained-bert==0.6.2
!pip install git+https://github.com/boudinfl/pke.git
!pip install flashtext
!python -m spacy download en
!python -m nltk.downloader universal_tagset

In [5]:
import re
import torch
from pytorch_pretrained_bert import BertTokenizer,BertForMaskedLM
import time
start = time.time()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
end = time.time()
print ("Time Elapsed to load BERT ",end-start)

100%|██████████| 231508/231508 [00:00<00:00, 905626.02B/s]
100%|██████████| 407873900/407873900 [00:10<00:00, 37319225.17B/s]


Time Elapsed to load BERT  22.36511468887329


In [7]:
def get_predicted_words(text):
    text = "[CLS] " + text.replace("____", "[MASK]") + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index('[MASK]')
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [0] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    k = 30
    predicted_index, predicted_index_values = torch.topk(predictions[0, masked_index], k)
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_index_values.tolist())
    filtered_tokens_to_remove_punctuation = []
    
    for token in predicted_tokens:
        if re.match("^[a-zA-Z0-9_]*$", token):
            filtered_tokens_to_remove_punctuation.append(token)
        
    return filtered_tokens_to_remove_punctuation

In [9]:
sentence = "i am so fat ____ i cannot"
print ("original sentence: ",sentence,"\n")
predicted_words = get_predicted_words(sentence)
print ("predicted choices: ", predicted_words)

original sentence:  i am so fat ____ i cannot 

predicted choices:  ['that', 'and', 'but', 'because', 'when', 'now', 'yet', 'even', 'as', 'so', 'if', 'here', 'for', 'sometimes', 'then', 'where', 'today', 'inside', 'tonight']


In [10]:
file_path = "sun.txt" 

def read_file(file_path):
    with open(file_path, 'r') as content_file:
        content = content_file.read()
        return content
    
text = read_file(file_path)
print(text)


Scientists know many things about the Sun. They know how old it is. The Sun is more than 4 billion years old. That would be too many candles to put on a birthday cake!  They also know the Sun's size. The Sun may seem small, but that is because it is so far away. It is about 93 million miles (150 million kilometers) away from the Earth. The Sun is so large that the diameter of the Sun is 109 times the Earth's diameter. The Sun also weighs as much as 333,000 Earths. The Sun is made up of gases: 75% hydrogen and 25% helium.  Hydrogen is the simplest and lightest of all of the known elements. When you combine hydrogen with oxygen, you get water. You probably know what helium is. It is the gas that can be put into balloons to make them stay in the air and float. Scientists also know the temperature of the Sun. The surface of the Sun is about 10,000 degrees Fahrenheit (5,600 degrees Celsius). That might sound hot, but the Sun's core is even hotter. The core is the central region where the te

In [19]:
#  We will extract some adpositions. An adposition is a cover term for prepositions and postpositions.
import pke
import string


def get_adpositions_multipartite(text):
    out=[]

    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text)
    pos = {'ADP'} 
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    extractor.candidate_selection(pos=pos, stoplist=stoplist)
    extractor.candidate_weighting(alpha=1.1,threshold=0.75,method='average')
    keyphrases = extractor.get_n_best(n=10)
    for key in keyphrases:
        out.append(key[0])
    return out
adpositions = get_adpositions_multipartite(text)
print ("Adpositions from the text: ",adpositions)

[('from', 0.19093882002737322), ('without', 0.14011457301844113), ('around', 0.1351766209398164), ('about', 0.0977403268953759), ('up of', 0.08715105891647905), ('over', 0.08225340594374793), ('out', 0.0680767762519501), ('across from', 0.05676373409692328), ('with', 0.052306613832667145), ('into', 0.04968229545441503)]
Adpositions from the text:  ['from', 'without', 'around', 'about', 'up of', 'over', 'out', 'across from', 'with', 'into']


In [21]:
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
from flashtext import KeywordProcessor


def tokenize_sentences(text):
    sentences = [sent_tokenize(text)]
    sentences = [y for x in sentences for y in x]
    # Remove any short sentences less than 20 letters.
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences
sentences = tokenize_sentences(text)


def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences

keyword_sentence_mapping_adpos = get_sentences_for_keyword(adpositions, sentences)

for word in keyword_sentence_mapping_adpos:
    print (word, " : ",keyword_sentence_mapping_adpos[word],"\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
from  :  ["One thing they do is to look at the amount of light from the Sun and the effect of the Sun's light on the Earth's climate.", 'That is why you need to be careful of the Sun and wear sunscreen and clothing to protect yourself from its rays.', 'It is about 93 million miles (150 million kilometers) away from the Earth.', 'They all look tiny because they are so far away from the Earth.'] 

without  :  ['Our planet would also be without people, animals, and plants because these things need sunlight and water to live.', 'Without it, there would be only darkness and our planet would be very cold and be without liquid water.', 'Without it, there would be only darkness and our planet would be very cold and be without liquid water.'] 

around  :  ['They orbit around the same center point and across from each other.', 'The Earth and other planets revolve around the Sun.'] 

abo

In [22]:
def get_best_sentence_and_options(word, sentences_array):
    keyword = word
    sentences = sentences_array
    sentences = sorted(sentences, key=len, reverse=False)
    max_no = min(5, len(sentences))
    sentences = sentences[:max_no]
    choices_filtered = []
    ordered_sentences = []
    for sentence in sentences:
        insensitive_line = re.compile(re.escape(keyword), re.IGNORECASE)
        no_of_replacements =  len(re.findall(re.escape(keyword),sentence,re.IGNORECASE))
        blanked_sentence = insensitive_line.sub("____", sentence)
        blanks = get_predicted_words(blanked_sentence)

        if blanks is not None:
            choices_filtered = blanks
            try:
                word_index = choices_filtered.index(keyword.lower())
                if no_of_replacements<2:
                    ordered_sentences.append((blanked_sentence, choices_filtered, word_index))
            except:
                pass

    ordered_sentences = sorted(ordered_sentences, key=lambda x: x[2])
    if len(ordered_sentences) > 0:
        return (ordered_sentences[0][0], ordered_sentences[0][1])
    else:
        return None, None
    
for each_adpos in adpositions:
    sentence, best_options = get_best_sentence_and_options(each_adpos, keyword_sentence_mapping_adpos[each_adpos])
    print (sentence)
    print (best_options)
    print ("\n\n")

They all look tiny because they are so far away ____ the Earth.
['from', 'on', 'in', 'off', 'beneath', 'to', 'above', 'under', 'than', 'underneath', 'below', 'into', 'with', 'across', 'over', 'behind', 'of', 'by', 'near', 'form', 'inside', 'like', 'at', 'and', 'toward', 'down', 'towards', 'for', 'within']



Our planet would also be ____ people, animals, and plants because these things need sunlight and water to live.
['without', 'for', 'producing', 'eating', 'like', 'missing', 'of', 'having', 'with', 'lacking', 'around', 'losing', 'barren', 'killing', 'protecting', 'mostly', 'containing', 'including', 'about', 'farming', 'raining', 'all', 'saving', 'destroying', 'mainly', 'growing', 'rich', 'just', 'supporting', 'housing']



The Earth and other planets revolve ____ the Sun.
['around', 'round', 'near', 'about', 'along', 'under', 'surrounding', 'on', 'in', 'above', 'within', 'at', 'outside', 'over', 'below', 'beneath', 'by', 'of', 'toward', 'towards', 'inside', 'surrounded', 'upon', 'u