In [1]:
import nltk
import time
import string
import numpy 
import random


from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tree import Tree

from nltk import CFG, Nonterminal, Production

from nltk.tag import pos_tag
from nltk.parse.generate import generate
from nltk.parse.corenlp import CoreNLPParser

In [2]:
#API connecten Terminal
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

In [3]:
#1. Preprocessing

In [4]:
# Voeg txt file toe
with open('HobbitKI.txt', 'r') as file:
    text = file.read()

In [5]:
illegal_words = ["\ufeffin", "/xad", ";"]

def preprocesser(given_text):
    #Doel is om gegeven text op te schonen van puncties en illegale woorden
    #Returnt text in een list met sentences
    
    # Lowercasing
    given_text = given_text.lower()
    sentences = sent_tokenize(given_text)
    
    # Alle puncties
    punctuations = str.maketrans('', '', string.punctuation)

    # Preprocessing
    preprocessed_sentences = []
    for sentence in sentences:
        # Puncties verwijderen
        sentence = sentence.translate(punctuations)
        words = word_tokenize(sentence)

        #Check of zin illegaal woord bevat, zo ja voeg zin niet toe.
        #Nadeel, 1 illegale woord zorgt dat een heel zin verwijderd wordt
        illegal_word = False
        for word in words:
            if word in illegal_words:
                illegal_word = True
                break

        #Als zin geen illegaal woord bevat voeg toe
        if not illegal_word:
            preprocessed_sentences.append(' '.join(words))
    
    return preprocessed_sentences
    
# Output
preprocessed = preprocesser(text)

for sentence in preprocessed:
    print(sentence)

not a nasty dirty wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with nothing in it to sit down on or to eat it was a hobbit hole and that means comfort
it had a perfectly round door like a porthole painted green with a shiny yellow brass knob in the exact middle
the door opened onto a tube shaped hall like a tunnel a very comfortable tunnel without smoke with panelled walls and floors tiled and carpeted provided with polished chairs and lots and lots of pegs for hats and coats ­ the hobbit was fond of visitors
the tunnel wound on and on going fairly but not quite straight into the side of the hill ­ the hill as all the people for many miles round called it ­ and many little round doors opened out of it first on one side and then on another
no going upstairs for the hobbit bedrooms bathrooms cellars pantries lots of these wardrobes he had whole rooms devoted to clothes kitchens dining rooms all were on the same floor and indeed on the same passag

In [6]:
#2 CFG maken met stanford API

In [7]:
parser = CoreNLPParser(url='http://localhost:9000')
#Zinnen:
#1. the architect obtained a sketchbook or a pencil from the landlord (1 normale zin, voor als basis)
#2. he lit up his wand as he did that day in bilbos dining room that seemed so long ago if you and by its light they explored the cave from end to end
#3. it had a perfectly round door like a porthole painted green with a shiny yellow brass knob in the exact middle
#4. i suppose hobbits need some description nowadays since they have become rare and shy of the big people as they call us

line = "he lit up his wand as he did that day in bilbos dining room that seemed so long ago if you and by its light they explored the cave from end to end"

# Parse tree
parse_tree = next(parser.raw_parse(line))

# Output
print(parse_tree)


(ROOT
  (S
    (PRN
      (S
        (NP (PRP he))
        (VP
          (VBD lit)
          (PRT (RP up))
          (NP (PRP$ his) (NN wand))
          (SBAR
            (IN as)
            (S
              (NP (PRP he))
              (VP
                (VBD did)
                (NP
                  (NP
                    (NP (DT that) (NN day))
                    (PP
                      (IN in)
                      (NP (NNS bilbos) (NML (NN dining) (NN room)))))
                  (SBAR
                    (WHNP (WDT that))
                    (S
                      (VP
                        (VBD seemed)
                        (ADVP (RB so) (RB long))
                        (PP
                          (PP (IN ago) (PP (IN if) (NP (PRP you))))
                          (CC and)
                          (PP (IN by) (NP (PRP$ its) (NN light))))))))))))))
    (NP (PRP they))
    (VP
      (VBD explored)
      (NP (DT the) (NN cave))
      (PP (IN from) (NP (NN end)))
     

In [8]:
def tree_to_cfg(tree, start_symbol='S'):
    cfg_rules = set()

    def extract_rules(t):
        if isinstance(t, Tree):
            if t.label() != "ROOT":

                rule = t.productions()[0]
                cfg_rules.add(rule)

            for child in t:
                extract_rules(child)

    extract_rules(tree)
    
    grammar = CFG(start=Nonterminal(start_symbol), productions=list(cfg_rules))
    return grammar

# Convert parse tree to CFG
cfg = tree_to_cfg(parse_tree)
print(cfg)

Grammar with 55 productions (start state = S)
    S -> VP
    VBD -> 'seemed'
    IN -> 'from'
    S -> NP VP
    NN -> 'wand'
    WHNP -> WDT
    VBD -> 'explored'
    PP -> PP CC PP
    PRP -> 'you'
    PRP -> 'they'
    VBD -> 'lit'
    SBAR -> WHNP S
    CC -> 'and'
    IN -> 'to'
    PRN -> S
    RB -> 'so'
    IN -> 'if'
    DT -> 'the'
    RB -> 'long'
    WDT -> 'that'
    S -> PRN NP VP
    NP -> DT NN
    NML -> NN NN
    NN -> 'day'
    SBAR -> IN S
    IN -> 'as'
    NP -> NP PP
    IN -> 'by'
    NN -> 'dining'
    RP -> 'up'
    NN -> 'cave'
    VP -> VBD PRT NP SBAR
    IN -> 'in'
    NP -> NP SBAR
    NN -> 'room'
    VP -> VBD NP PP PP
    DT -> 'that'
    NNS -> 'bilbos'
    NP -> NN
    VP -> VBD ADVP PP
    PP -> IN PP
    NP -> PRP
    VP -> VBD NP
    IN -> 'ago'
    PRT -> RP
    PRP -> 'he'
    NN -> 'end'
    VBD -> 'did'
    NP -> NNS NML
    NN -> 'light'
    PP -> IN NP
    NP -> PRP$ NN
    ADVP -> RB RB
    PRP$ -> 'his'
    PRP$ -> 'its'


In [9]:
def wordandtag_frequencies(sentences):
    #Doel is om het aantal word en bijbehorende tag frequencies te tellen
    #Om hier later bij generate_pcfg een probability van te maken
    
    word_frequencies = {}
    
    for sentence in sentences:
        tagged = nltk.pos_tag(word_tokenize(sentence))
        
        for word, tag in tagged:
            tag = tag.replace('$', 'DOLLAR')
            
            # Tel aantal woorden dat bij het tag behoort
            # Als tag nog nieet in dict, voeg toe.
            if tag not in word_frequencies:
                word_frequencies[tag] = {}
            
            # Value is een dictionary bestaande uit een woord en zijn "count/occurences)
            if word not in word_frequencies[tag]:
                word_frequencies[tag][word] = 0
            word_frequencies[tag][word] += 1
            
    return word_frequencies
    
# # Output
# word_frequencies = wordandtag_frequencies(preprocessed)
# print(word_frequencies)

def generate_pcfg(word_frequencies):
    # PCFG creeren, door de probability aand rules te voegen

    #Verkregen CFG uit stanford API
    grammar_dict = {
        'S': ["NP VP"],
        'NP': ["DT NN", "NP CC NP", "PRP", "NP PP", "DT JJ JJ NN NN", "DT JJ NN", "DT ADJP NN", "DT NN", "NNS", "PRP", "ADJP", "NN", "NP SBAR", "PRP NN"],
        'PP': ["IN NP", "IN PP", "IN NP"],
        'VP': ["VBD NP PP", "VBD NP SBAR", "VBD ADJP PP", "VBP NP", "VBP NP ADVP SBAR", "VBP SBAR", "VBN NP SBAR", "VBD NP", "VBD PRT NP SBAR", "VBD NP PP PP", "VBD NP PP"],
        'ADJP': ["RB JJ", "JJ", "JJ CC JJ"],
        'SBAR': ["IN S", "S", "WHNP S", "IN S"],
        'ADVP': ["RB", "RB RB"],
        'PRT': ["RP"],
        'NML': ["NN NN"],
        'PRN': ["S"],
        'WHNP': ["WDT"]
    }



    # Probabilities berekenen van zowel grammar als lexical rules
    prob_grammar_rules = []

    #Grammar rules probabilties
    for key, productions in grammar_dict.items():
        total = len(productions)
        for production in set(productions):
            count = productions.count(production)
            prob = count / total
            
            #Programma werkt niet met wetenschappelijke notatie,
            #Dus afgerond op 6 decimalen, kan nog aangepast worden.
            prob_grammar_rules.append(f"{key} -> {production} [{prob:.6f}]")

    #Lexical rules probabilities
    for tag, words in word_frequencies.items():
        total = sum(words.values())
        for word, count in words.items():
            probability = count / total
            prob_grammar_rules.append(f"{tag} -> '{word}' [{probability:.6f}]")
    

    return prob_grammar_rules
# # Output
# pcfg_rules = generate_pcfg(word_frequencies)
# print(pcfg_rules)

In [10]:
def top_down_expansion(pcfg, sentence, non_terminal):
    #Expands de grammar rules van links naar rechts, tot het een terminal bereikt,
    #die voegen we toe aan onze sentence, tot alle non_terminals een terminal bereiken

    # Zoek voor alle regels, die de non_terminal bevat 
    # VB. S -> NP VP | NP VBZ VP | etc.
    if non_terminal in pcfg._lhs_index:
        possible_rules = pcfg._lhs_index[non_terminal]

        # Neem de probabilities van elk production 
        # VB. S -> NP VP [1.0}
        probabilities = []
        for left_most_rule in possible_rules:
            probabilities.append(left_most_rule.prob())

        # Kies een RHS rule gebaseerd op kansen
        chosen_path = random.choices(possible_rules, weights=probabilities)[0]
        # Expand de meest linker non-terminal 
        # btw .rhs() werkt niet op list
        for left_most_rule in chosen_path.rhs():
            top_down_expansion(pcfg, sentence, left_most_rule)
    else:
        # Als geen mogelijke rules, dan is het een terminal/woord
        # Voeg toe aan sentence
        terminal = non_terminal
        sentence.append(str(terminal))

def generate_sentence(pcfg):
    # Genereerd de sentence, door top_down_expansion the runnen en de woorden toe te
    # voegen aan sentence, verder join het tot een string
    sentence = []
    top_down_expansion(pcfg, sentence, pcfg.start())
    return ' '.join(sentence)

def main(text):
    #Verkrijg processed sentences
    preprocessed_sentences = preprocesser(text)
    
    #Verkrijg de PCFG, met bijbehorende probabilities
    #Returns probabilties
    word_frequencies = wordandtag_frequencies(preprocessed_sentences)
    #Returns pcfg in list type
    pcfg_rules_list = generate_pcfg(word_frequencies)
    #Returns pcfg als str
    pcfg_rules_string = '\n'.join(pcfg_rules_list)  
    #Returns de PCFG als grammar
    pcfg = nltk.PCFG.fromstring(pcfg_rules_string)

    # Genereet de sentences
    sentences = []
    # Aanpasbaar, ligt eraan hoeveel sentences je wilt
    while len(sentences) < 10:
        sentence = generate_sentence(pcfg)

        # Dit is geen must, maar anders is er groot kans dat het programma
        # lange zinnen creert met veel recursion door gebruik van "and" en "but"
        # Dit valt ook aan te passen, ligt eraan hoe lang je de sentences wilt
        word_count = len(sentence.split())
        if 15 <= word_count <= 20:
            sentences.append(sentence)

    return sentences

# Example usage
generated_sentences = main(text)
for sentence in generated_sentences:
    print(sentence)

he cleaned here splintered torches with on in if it thought green of songs of between we till
course hope which thorin gripped the idea the i left you creaking in sure from by it
the road that before nori and great from of others was many and important after the elf
a tom am him sun; said a long gold whether then light grew grim­voiced about the angry thick time eye
while; go himself only of that steep; mistake believe them enough if he dwarf was few
nuts cried shapes up him and the full small air table the dont below; were they
greybeards say they in that great main use boom the dark other morning fire had the tied cave
they are him of with they up simply if firmly grim was the last breast
rock of the smell are the thought that the nasty fierce time steam are no shriek
the spring seen me none which great and wet shadowed forever light on me like them
