In [1]:
# !pip install --quiet allennlp==0.9.0
# !pip install --quiet spacy==2.1.9
# !pip install --quiet https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

In [2]:
# !tar -xvzf en_core_web_sm-2.1.0.tar.gz

In [3]:
# !python -m spacy download en_core_web_sm

In [4]:
import spacy
from allennlp.predictors.predictor import Predictor
# Archive URL of the ELMo constituency-parser model handed to AllenNLP.
CONSTITUENCY_PARSER_ARCHIVE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
predictor = Predictor.from_path(CONSTITUENCY_PARSER_ARCHIVE)

In [5]:
# Load the `en_core_web_sm` spaCy model.
# NOTE(review): `nlp` is not referenced by any later cell visible in this notebook — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Trailing punctuation is stripped before parsing; presumably so that the parse
# ends in a phrase rather than in a punctuation token — TODO confirm.
test_sentence = "The old woman was sitting under a tree and sipping coffee.".rstrip('?:!.,;')
print(test_sentence)

# Run the constituency parser on the cleaned sentence.
parser_output = predictor.predict(sentence=test_sentence)

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


The old woman was sitting under a tree and sipping coffee


In [7]:
# The predictor result keeps the bracketed constituency parse under the "trees" key.
tree_string = parser_output["trees"]
print (tree_string)

(S (NP (DT The) (JJ old) (NN woman)) (VP (VBD was) (VP (VP (VBG sitting) (PP (IN under) (NP (DT a) (NN tree)))) (CC and) (VP (VBG sipping) (NP (NN coffee))))))


In [8]:
from nltk import tokenize
from nltk.tree import Tree

# Rebuild a tree object from the bracketed parse string so it can be traversed.
tree = Tree.fromstring(tree_string)
print(tree)
# pretty_print() writes the ASCII diagram to stdout itself and returns None, so it
# must not be wrapped in print() — that would emit a stray "None" after the diagram.
tree.pretty_print()

(S
  (NP (DT The) (JJ old) (NN woman))
  (VP
    (VBD was)
    (VP
      (VP (VBG sitting) (PP (IN under) (NP (DT a) (NN tree))))
      (CC and)
      (VP (VBG sipping) (NP (NN coffee))))))
                            S                                          
      ______________________|________                                   
     |                               VP                                
     |          _____________________|_______                           
     |         |                             VP                        
     |         |                  ___________|________________          
     |         |                 VP               |           |        
     |         |      ___________|___             |           |         
     |         |     |               PP           |           VP       
     |         |     |       ________|___         |      _____|____     
     NP        |     |      |            NP       |     |          NP  
  ___|____   

In [9]:
# Split the sentence at its right-most noun phrase or verb phrase.

def get_flattened(t):
    """Return the words spanned by a parse (sub)tree as one space-separated string.

    Args:
        t: A tree-like node exposing ``leaves()`` (e.g. ``nltk.tree.Tree``),
            or ``None``.

    Returns:
        The node's leaf tokens joined by single spaces, or ``None`` when ``t``
        is ``None``.
    """
    if t is None:
        return None
    # Ask the node itself for its leaves instead of calling ``leaves()`` on each
    # child: a child that is a bare token (plain string) has no ``leaves()``
    # method, so the per-child form raises AttributeError on pre-terminal nodes.
    return " ".join(t.leaves())

def get_right_most_VP_or_NP(parse_tree,last_NP = None,last_VP = None):
    """Walk down the right-most branch of a parse tree, tracking the last NP and VP.

    Starting at ``parse_tree`` (whose own label is not inspected), the walk
    repeatedly steps into the last child of the current node. Every subtree on
    that path labelled ``"NP"`` replaces ``last_NP`` and every one labelled
    ``"VP"`` replaces ``last_VP``, so the deepest match of each kind wins.
    The walk stops at a node spanning at most one leaf, or when the right-most
    child is a bare token rather than a subtree.

    Args:
        parse_tree: Tree-like node exposing ``leaves()``, ``label()`` and
            indexing (e.g. ``nltk.tree.Tree``).
        last_NP: Value returned for the noun phrase if none is found on the path.
        last_VP: Value returned for the verb phrase if none is found on the path.

    Returns:
        A ``(last_NP, last_VP)`` tuple of subtrees (or the supplied defaults).
    """
    node = parse_tree
    # `> 1` (rather than `!= 1`) also stops on a node without leaves, where
    # indexing the last child would raise IndexError.
    while len(node.leaves()) > 1:
        child = node[-1]
        if not callable(getattr(child, "label", None)):
            # Right-most child is a bare token: nothing further to descend into.
            break
        label = child.label()
        if label == "NP":
            last_NP = child
        elif label == "VP":
            last_VP = child
        node = child
    return last_NP, last_VP


# Locate the right-most noun phrase and verb phrase, then render each as plain text.
last_nounphrase, last_verbphrase = get_right_most_VP_or_NP(tree)
last_nounphrase_flattened, last_verbphrase_flattened = (
    get_flattened(phrase) for phrase in (last_nounphrase, last_verbphrase)
)

# Show the subtrees first, then their flattened text.
print("Original Sentence ", test_sentence)
print("last_nounphrase ", last_nounphrase)
print("last_verbphrase ", last_verbphrase)
print("\n ")
print("last_nounphrase ", last_nounphrase_flattened)
print("last_verbphrase ", last_verbphrase_flattened)

Original Sentence  The old woman was sitting under a tree and sipping coffee
last_nounphrase  (NP (NN coffee))
last_verbphrase  (VP (VBG sipping) (NP (NN coffee)))

 
last_nounphrase  coffee
last_verbphrase  sipping coffee


In [10]:
import re

# sub_string - sipping coffee
# main_string - The old woman was sitting under a tree and sipping coffee
# compare like below
# Theoldwomanwassittingunderatreeandsippingcoffee  || sippingcoffee
# oldwomanwassittingunderatreeandsippingcoffee || sippingcoffee
# womanwassittingunderatreeandsippingcoffee || sippingcoffee
# ...............
# andsippingcoffee || sippingcoffee
# sippingcoffee || sippingcoffee
def get_termination_portion(main_string, sub_string):
    """Return the part of ``main_string`` that precedes a trailing ``sub_string``.

    Spacing is ignored in the comparison: each suffix of the whitespace-split
    words of ``main_string`` (longest suffix first) is glued together and
    compared with ``sub_string`` stripped of spaces.

    Args:
        main_string: The full sentence.
        sub_string: The phrase expected at the end of ``main_string``.

    Returns:
        The words before the first matching suffix, joined by single spaces,
        or ``None`` when no suffix of ``main_string`` matches.
    """
    target = sub_string.replace(" ", "")
    words = main_string.split()

    # Index of the first word of the matching suffix, if any.
    cut = next(
        (start for start in range(len(words)) if "".join(words[start:]) == target),
        None,
    )
    return None if cut is None else " ".join(words[:cut])

# Pick the longer of the two candidate ending phrases. `key=len` is required:
# a bare max() on two strings compares them alphabetically, not by length, and
# raises TypeError when one candidate is None (no NP or no VP was found).
candidate_phrases = [
    phrase
    for phrase in (last_nounphrase_flattened, last_verbphrase_flattened)
    if phrase is not None
]
longest_phrase_to_use = max(candidate_phrases, key=len, default=None)
print ("Ending phrase: ", longest_phrase_to_use)

if longest_phrase_to_use is None:
    # No ending phrase was found, so there is nothing to cut off.
    split_sentence = None
else:
    # Turn the -LRB-/-RRB- bracket placeholders back into literal parentheses
    # (dropping the space next to them) before matching against the raw sentence.
    longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
    longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)
    split_sentence = get_termination_portion(test_sentence, longest_phrase_to_use)

print ("Original sentence : ",test_sentence)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

Ending phrase:  sipping coffee
Original sentence :  The old woman was sitting under a tree and sipping coffee
Original sentence after splitting at ending phrase:  The old woman was sitting under a tree and


In [11]:
# !conda uninstall h5py
# !pip install h5py

In [12]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Checkpoint shared by the tokenizer and the model ("distilgpt2" is the
# alternative that was previously left here commented out).
GPT2_CHECKPOINT = "gpt2"

GPT2tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
# The tokenizer's end-of-sequence id is reused as the padding id.
GPT2model = TFGPT2LMHeadModel.from_pretrained(
    GPT2_CHECKPOINT,
    pad_token_id=GPT2tokenizer.eos_token_id,
)


In [13]:
# Prompt GPT-2 with the sentence prefix computed earlier in the notebook
# (`split_sentence`) rather than a hard-coded copy of it, so the prompt follows
# `test_sentence` whenever that is changed.
partial_sentence = split_sentence
if not partial_sentence:
    raise ValueError("No sentence prefix to prompt GPT-2 with: split_sentence is empty or None.")

input_ids = GPT2tokenizer.encode(partial_sentence,return_tensors='tf')
print (input_ids)

# `max_length` is measured in tokens and includes the prompt, so the budget is
# based on the encoded prompt length, not on the whitespace word count (the two
# differ whenever a word is split into several tokens).
maximum_length = int(input_ids.shape[-1]) + 40

tf.Tensor([[ 464 1468 2415  373 5586  739  257 5509  290]], shape=(1, 9), dtype=int32)


In [14]:
# Sample with both top-k and top-p (nucleus) filtering: top_k=60 and top_p=0.80
# restrict which tokens may be drawn at each step.
# Seed TensorFlow first so that Restart & Run All yields the same continuations;
# without a seed every run produces different sentences.
tf.random.set_seed(42)
sample_outputs = GPT2model.generate(
    input_ids,
    do_sample=True,
    max_length=maximum_length,
    top_p=0.80, # 0.85
    top_k=60,   #30
    repetition_penalty=10.0,
    num_return_sequences=12
)

In [15]:
import nltk
nltk.download('punkt')
from nltk import tokenize

# Decode each sampled sequence lazily, keep only its first sentence, and echo it.
decoded_samples = (
    GPT2tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in sample_outputs
)

generated_sentences = []
for index, decoded_text in enumerate(decoded_samples):
    first_sentence = tokenize.sent_tokenize(decoded_text)[0]
    generated_sentences.append(first_sentence)
    print(index, ": ", first_sentence)

0 :  The old woman was sitting under a tree and looking at the trees, thinking of her own life.
1 :  The old woman was sitting under a tree and he took her hand.
2 :  The old woman was sitting under a tree and couldn't get her hair down.
3 :  The old woman was sitting under a tree and she said, "I'm not sure if you're talking to the young lady in your room or her parents.
4 :  The old woman was sitting under a tree and looking at me.
5 :  The old woman was sitting under a tree and had her hand on his shoulder.
6 :  The old woman was sitting under a tree and holding her hand, but she turned around to look at him.
7 :  The old woman was sitting under a tree and standing over it, her face covered in dust.
8 :  The old woman was sitting under a tree and she had just started to cry when the voice of her aunt came out.
9 :  The old woman was sitting under a tree and her arms were folded around herself, so she leaned back in the chair.
10 :  The old woman was sitting under a tree and listenin

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#Single sentence true or false
#Multiple sentence true or false

In [None]:
# https://www.geeksforgeeks.org/nlp-how-tokenizing-text-sentence-words-works/