Gregorio Orlando

In [387]:
# import libraries needed for the project
import random
import re
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import brown
from nltk.corpus import shakespeare
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

# Data Preparation

### Load a text file containing Shakespeare's works.

In [388]:
# list the Shakespeare plays available in the NLTK corpus
shakespeare.fileids()

['a_and_c.xml',
 'dream.xml',
 'hamlet.xml',
 'j_caesar.xml',
 'macbeth.xml',
 'merchant.xml',
 'othello.xml',
 'r_and_j.xml']

I'll choose HAMLET, the famous play with the "to be or not to be" soliloquy.

In [None]:
# load the hamlet book
play = shakespeare.xml('hamlet.xml')
play

<Element 'PLAY' at 0x000001A51D397100>

This is the address of the root element of the play.

In [390]:
# print title of the book
print('%s: %s' % (play[0].tag, play[0].text))

TITLE: The Tragedy of Hamlet, Prince of Denmark


### Preprocess the text

1. parse the XML and extract the relevant text elements

In [391]:
# Function to recursively extract text from XML elements
def extract_text(element):
    """Return all text contained in an XML element, in document order.

    The result is the element's own text, followed, for every child in turn,
    by the child's recursively extracted text and then the child's tail (the
    text that sits between the child's closing tag and the next sibling).
    Missing text or tail values contribute nothing.

    Args:
        element: An XML element exposing ``.text``, ``.tail`` and iteration
            over its children (e.g. ``xml.etree.ElementTree.Element``).

    Returns:
        str: The concatenated text of the element and all its descendants.
    """
    pieces = [element.text or ""]
    for node in element:
        # Depth-first: the child's whole subtree comes before its tail.
        pieces.append(extract_text(node))
        pieces.append(node.tail or "")
    return "".join(pieces)

In [392]:
# Extract and print the text
play_text = extract_text(play)

In [393]:
len(play_text)

179465

In [394]:
type(play_text)

str

In [395]:
play_text[100:150]

' son to the late, and nephew to the present king.\n'

In [396]:
# Search for the "To be, or not to be" soliloquy
to_be_text = "To be, or not to be"
start_index = play_text.find(to_be_text)
end_index = start_index + len(to_be_text)
play_text[start_index:end_index]

'To be, or not to be'

2. convert it to lowercase

In [397]:
play_text_lower = play_text.lower()
play_text_lower[start_index:end_index]

'to be, or not to be'

In [398]:
type(play_text_lower)

str

3. splitting it into tokens.

In [399]:
# use 'word_tokenize' to tokenize the play text
# meaning splitting the text into words
play_tokenized = word_tokenize(play_text_lower)
len(play_tokenized)

39514

In [400]:
play_tokenized[100:110]

['gertrude',
 ',',
 'queen',
 'of',
 'denmark',
 ',',
 'and',
 'mother',
 'to',
 'hamlet']

4. removing punctuation & tokenize again

In [None]:
# use 'RegexpTokenizer' to tokenize the play text again and remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
play_words = tokenizer.tokenize(play_text_lower)
play_words[100:110]

['denmark',
 'hamlet',
 'act',
 'i',
 'scene',
 'i',
 'elsinore',
 'a',
 'platform',
 'before']

In [402]:
type(play_words)

list

### Create list of bigrams

In [403]:
# Create the vocabulary from the lower-cased, punctuation-free tokens
vocab = set(play_words)  # Set of unique words in play_words

# Initialize bigrams dictionary
# Each key is a bigram (two consecutive words), and the value is a dictionary:
#   - "occurrence": Total count of how many times this bigram appears
#   - "next_words": Dictionary of words that follow this bigram with their counts
bigrams = {}

In [404]:
# Sliding through the corpus to get bigram and next-word counts.
# The table is (re)created here, as the trigram and quadgram cells do, so that
# re-running this cell rebuilds the counts from scratch instead of adding a
# second pass on top of the first one.
bigrams = {}
for i, next_word in enumerate(play_words[2:]):  # next_word follows the bigram starting at i
    bigram = (play_words[i], play_words[i + 1])  # Current bigram

    # Create the entry the first time the bigram is seen, then update both
    # the total occurrence count and the count of this particular follower.
    record = bigrams.setdefault(bigram, {"occurrence": 0, "next_words": {}})
    record["occurrence"] += 1
    record["next_words"][next_word] = record["next_words"].get(next_word, 0) + 1

In [405]:
example_bigrams = list(bigrams.items())[:3]  # Show only the first 3 bigrams
for bigram, details in example_bigrams:
    print(f"{bigram}: {details}")

('the', 'tragedy'): {'occurrence': 1, 'next_words': {'of': 1}}
('tragedy', 'of'): {'occurrence': 1, 'next_words': {'hamlet': 1}}
('of', 'hamlet'): {'occurrence': 8, 'next_words': {'prince': 1, 's': 4, 'our': 1, 'sits': 1, 'do': 1}}


In [406]:
print(f"Example: Count for bigram ('to', 'be') is: {bigrams.get(('to', 'be'))}")
print(f"Example: Count for bigram ('be', 'or') is: {bigrams.get(('be', 'or'))}")
print(f"Example: Count for bigram ('airplane', 'house') is: {bigrams.get(('airplane', 'house'))}")

Example: Count for bigram ('to', 'be') is: {'occurrence': 34, 'next_words': {'done': 1, 'contracted': 1, 'disjoint': 1, 'near': 1, 'commanded': 1, 'a': 3, 'nothing': 1, 'honest': 1, 'one': 1, 'sounded': 1, 'or': 1, 'that': 1, 'wish': 1, 'considered': 1, 'played': 1, 'forestalled': 1, 'free': 1, 'too': 1, 'bless': 1, 'kind': 1, 'demanded': 1, 'last': 1, 'great': 1, 'spilt': 1, 'your': 1, 'heard': 1, 'buried': 2, 'made': 2, 'in': 1, 'damn': 1}}
Example: Count for bigram ('be', 'or') is: {'occurrence': 1, 'next_words': {'not': 1}}
Example: Count for bigram ('airplane', 'house') is: None


With this representation I have every bigram (a pair of two consecutive words) together with its number of occurrences, plus the number of times each word follows that bigram.

### Dictionary Bigram Count

This function shows the most frequent next words (at most three) that can follow the given gram.

In [407]:
def from_bigram_to_next_token_counts(gram, grams):
    """Return the most frequent followers of an n-gram with their raw counts.

    Args:
        gram: Tuple of words used as the lookup key.
        grams: Mapping from n-gram tuples to a dict holding a ``"next_words"``
            dictionary of ``{word: count}``.

    Returns:
        list: Up to three ``(word, count)`` pairs ordered by decreasing count
        (ties keep the order in which the words were first recorded). An empty
        list is returned, after printing a notice, when ``gram`` is not a key
        of ``grams``.
    """
    if gram in grams:
        followers = grams[gram]["next_words"]
        ranked = list(followers.items())
        # Stable sort: equally frequent words stay in insertion order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        del ranked[3:]  # keep only the three best candidates
        return ranked

    print("No suggestions available for this bigram.")
    return []

In [408]:
# Example usage
tokens = ('to', 'be')
top_suggestions = from_bigram_to_next_token_counts(tokens, bigrams)
print(f"Top suggestions: {top_suggestions}")

Top suggestions: [('a', 3), ('buried', 2), ('made', 2)]


In [409]:
# Example usage
tokens = ('be', 'or')
top_suggestions = from_bigram_to_next_token_counts(tokens, bigrams)
print(f"Top suggestions: {top_suggestions}")

Top suggestions: [('not', 1)]


After ('to', 'be') we can have [('a', 3), ('buried', 2), ('made', 2), ...]; this means that the most common word to occur after 'to be' is 'a'.

On the other hand, after 'be or' only one word ever occurs, 'not', so it has 100% probability.

# Probability Distribution

Given the gram and its possible next words, we can calculate the probability of occurrence of each possible word after the given gram.

In [410]:
def from_bigram_to_next_token_probs(gram, grams):
    """Return the most frequent followers of an n-gram with their probabilities.

    Each probability is the follower's count divided by the total count of
    *all* followers of the gram. Only the three most frequent followers are
    returned, so the listed probabilities need not sum to 1.

    Args:
        gram: Tuple of words used as the lookup key.
        grams: Mapping from n-gram tuples to a dict holding a ``"next_words"``
            dictionary of ``{word: count}``.

    Returns:
        list: ``[total_count, [(word, probability), ...]]`` when ``gram`` is a
        key of ``grams``. When it is not, a notice is printed and a bare empty
        list ``[]`` is returned (note: not a two-element list).
    """
    if gram in grams:
        followers = grams[gram]["next_words"]

        # Denominator: every follower counts, not only the ones returned.
        total_count = sum(followers.values())

        # Rank by decreasing count; ties keep insertion order.
        ranked = list(followers.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        top_probs = []
        for word, count in ranked[:3]:
            top_probs.append((word, count / total_count))
        return [total_count, top_probs]

    print("No suggestions available for this bigram.")
    return []

In [411]:
# Example usage
tokens = ('to', 'be')
top_suggestions = from_bigram_to_next_token_probs(tokens, bigrams)
print(f"Top suggestions for token: {tokens}\nare: {top_suggestions[1]}\nWith total number of possible words after the bigram of {top_suggestions[0]}")

Top suggestions for token: ('to', 'be')
are: [('a', 0.08823529411764706), ('buried', 0.058823529411764705), ('made', 0.058823529411764705)]
With total number of possible words after the bigram of 34


In [412]:
# Example usage
tokens = ('be', 'or')
top_suggestions = from_bigram_to_next_token_probs(tokens, bigrams)
print(f"Top suggestions for token: {tokens}\nare: {top_suggestions[1]}\nWith total number of possible words after the bigram of {top_suggestions[0]}")

Top suggestions for token: ('be', 'or')
are: [('not', 1.0)]
With total number of possible words after the bigram of 1


Same as before, but now we get a probability for each word instead of the number of times it occurs.

# Sampling Next Token

This function picks the next word with a weighted random choice. This means that even if the most typical word after 'to be' is 'a', the algorithm still has room to choose something else. This is why our bigram model can be less accurate than the quadgram model.

In [413]:
def sample_next_token(bigram, bigrams):
    """Sample the word that follows an n-gram, weighted by its probability.

    The candidates and weights come from ``from_bigram_to_next_token_probs``,
    so the draw is made among the followers that helper returns.

    Args:
        bigram: Tuple of words used as the lookup key (any n-gram length that
            matches the keys of ``bigrams``).
        bigrams: N-gram table passed through to
            ``from_bigram_to_next_token_probs``.

    Returns:
        The sampled next word, or ``None`` when the n-gram is unknown or has
        no recorded followers.
    """
    result = from_bigram_to_next_token_probs(bigram, bigrams)

    # For an unseen n-gram the helper returns a bare empty list rather than a
    # [total, probs] pair; unpacking it directly would raise ValueError, so
    # test for emptiness first.
    if not result:
        return None

    total_count, next_token_probs = result
    if total_count == 0 or not next_token_probs:
        return None

    # Split the (word, probability) pairs into parallel lists for sampling
    tokens = [token for token, _prob in next_token_probs]
    probabilities = [prob for _token, prob in next_token_probs]

    # random.choices normalises the weights, so they need not sum to 1
    return random.choices(tokens, weights=probabilities, k=1)[0]

In [414]:
tokens = ('to', 'be')
next_token = sample_next_token(tokens, bigrams)
print(f"Sampled next token for bigram {tokens}: {next_token}")

Sampled next token for bigram ('to', 'be'): buried


##### Test

I have never used weighted sampling before, so I wanted to be sure that how often each word is sampled reflects its probability.

In [415]:
# Empirical check of the weighted sampling done by sample_next_token
from collections import Counter

tokens = ('to', 'be')
results = []

# Draw 1000 samples for the same bigram.
# sample_next_token only chooses among the (at most three) followers returned
# by from_bigram_to_next_token_probs, so the observed frequencies reflect the
# relative weights of those words rather than their absolute probabilities.
for _ in range(1000):
    next_token = sample_next_token(tokens, bigrams)
    results.append(next_token)

# Count how often each word was drawn and report its empirical frequency
occurrences = Counter(results)
print("Sampling results after 1000 iterations:")
for word, count in occurrences.items():
    print(f"{word}: {count} times (probability ~ {count / 1000:.4f})")

Sampling results after 1000 iterations:
made: 276 times (probability ~ 0.2760)
a: 444 times (probability ~ 0.4440)
buried: 280 times (probability ~ 0.2800)


Weighted random choice is working quite well: as expected, it mostly gives us the word with the highest probability, but not always!

# Generating Text

This function takes the gram, samples the next word, builds a new gram that includes the newly generated word, and repeats the process. In this way we expect to recreate the famous passages of the play.

In [416]:
def generate_text_from_bigram(gram, grams, max_length=100):
    """Generate a word sequence by repeatedly sampling from an n-gram table.

    Starting from ``gram``, a next word is sampled with ``sample_next_token``,
    appended to the output, and the context window is slid forward by one
    word (drop the oldest word, add the new one).

    Args:
        gram: Tuple of seed words; it is sliced and concatenated with a
            one-element tuple, so it must be a tuple.
        grams: N-gram table passed through to ``sample_next_token``.
        max_length: Maximum number of words to generate after the seed.

    Returns:
        list: The seed words followed by the generated words. Generation stops
        early as soon as ``sample_next_token`` returns ``None``.
    """
    output = [*gram]
    window = gram
    for _step in range(max_length):
        picked = sample_next_token(window, grams)
        if picked is None:
            # No continuation is available for the current window.
            return output
        output.append(picked)
        # Slide the window: same length, shifted by one word.
        window = window[1:] + (picked,)
    return output

In [417]:
tokens = ('to', 'be')
generated_text = generate_text_from_bigram(tokens, bigrams)
print(f"Generated text from bigram {tokens}:\n{' '.join(generated_text)}")

Generated text from bigram ('to', 'be'):
to be buried quick with her gentleman she speaks much of water hast thou been a gentlewoman she should have fatted all the holy vows of heaven visit her face too roughly heaven and earth must i remember why she even she o god a mercy lord polonius what is the poison into the grave to tell the secrets of my father horatio where my abridgement comes enter four or five players you are not sterling tender yourself more dearly or not at all that suffers nothing a man as hamlet is a man take him for a certain term to walk


In [418]:
tokens = ('be', 'or')
generated_text = generate_text_from_bigram(tokens, bigrams)
print(f"Generated text from bigram {tokens}:\n{' '.join(generated_text)}")

Generated text from bigram ('be', 'or'):
be or not at all i would not hear your enemy say so nor shall you see there is a man might play but i thank you sir exit rosencrantz wilt please you to drink deep ere you go your servants tend laertes farewell ophelia o help him you sweet heavens hamlet if thou hast uphoarded in thy memory see thou character give thy thoughts no tongue nor any unproportioned thought his act be thou a spirit of health or goblin damn d and so without more circumstance at all that he is the matter my mother had not borne me i


# Exploration of Different N-grams

### Create list of Trigrams

In [419]:
# Trigram table: maps each run of three consecutive words to
#   - "occurrence": how many times the trigram is followed by another word
#   - "next_words": {following word: count}
trigrams = {}
for i, next_word in enumerate(play_words[3:]):  # next_word follows the trigram starting at i
    trigram = (play_words[i], play_words[i + 1], play_words[i + 2])  # Current trigram

    # Create the entry on first sight, then bump both counters
    record = trigrams.setdefault(trigram, {"occurrence": 0, "next_words": {}})
    record["occurrence"] += 1
    record["next_words"][next_word] = record["next_words"].get(next_word, 0) + 1

In [420]:
example_trigrams = list(trigrams.items())[:3]  # Show only the first 3 trigrams
for trigram, details in example_trigrams:
    print(f"{trigram}: {details}")

('the', 'tragedy', 'of'): {'occurrence': 1, 'next_words': {'hamlet': 1}}
('tragedy', 'of', 'hamlet'): {'occurrence': 1, 'next_words': {'prince': 1}}
('of', 'hamlet', 'prince'): {'occurrence': 1, 'next_words': {'of': 1}}


In [421]:
print(f"Example: Count for bigram ('to', 'be', 'or') is: {trigrams.get(('to', 'be', 'or'))}")
print(f"Example: Count for bigram ('airplane', 'house', 'car') is: {trigrams.get(('airplane', 'house', 'car'))}")

Example: Count for bigram ('to', 'be', 'or') is: {'occurrence': 1, 'next_words': {'not': 1}}
Example: Count for bigram ('airplane', 'house', 'car') is: None


In [422]:
tokens = ('to', 'be', 'or')

In [423]:
top_suggestions = from_bigram_to_next_token_counts(tokens, trigrams)
print(f"Top suggestions: {top_suggestions}")

Top suggestions: [('not', 1)]


In [424]:
top_suggestions = from_bigram_to_next_token_probs(tokens, trigrams)
print(f"Top suggestions for token: {tokens}\nare: {top_suggestions[1]}\nWith total number of possible words after the bigram of {top_suggestions[0]}")

Top suggestions for token: ('to', 'be', 'or')
are: [('not', 1.0)]
With total number of possible words after the bigram of 1


In [425]:
next_token = sample_next_token(tokens, trigrams)
print(f"Sampled next token for bigram {tokens}: {next_token}")

Sampled next token for bigram ('to', 'be', 'or'): not


In [426]:
generated_text = generate_text_from_bigram(tokens, trigrams, max_length=10)
print(f"Generated text from bigram {tokens}:\n{' '.join(generated_text)}")

Generated text from bigram ('to', 'be', 'or'):
to be or not to crack the wind of me as if you


### Quadgrams

In [427]:
# Quadgram table: maps each run of four consecutive words to
#   - "occurrence": how many times the quadgram is followed by another word
#   - "next_words": {following word: count}
quadgrams = {}
for i, next_word in enumerate(play_words[4:]):  # next_word follows the quadgram starting at i
    quadgram = (play_words[i], play_words[i + 1], play_words[i + 2], play_words[i + 3])  # Current quadgram

    # Create the entry on first sight, then bump both counters
    record = quadgrams.setdefault(quadgram, {"occurrence": 0, "next_words": {}})
    record["occurrence"] += 1
    record["next_words"][next_word] = record["next_words"].get(next_word, 0) + 1

In [428]:
example_quadgrams = list(quadgrams.items())[:3]  # Show only the first 3 quadgrams
for quadgram, details in example_quadgrams:
    print(f"{quadgram}: {details}")

('the', 'tragedy', 'of', 'hamlet'): {'occurrence': 1, 'next_words': {'prince': 1}}
('tragedy', 'of', 'hamlet', 'prince'): {'occurrence': 1, 'next_words': {'of': 1}}
('of', 'hamlet', 'prince', 'of'): {'occurrence': 1, 'next_words': {'denmark': 1}}


In [429]:
print(f"Example: Count for bigram ('to', 'be', 'or', 'not') is: {quadgrams.get(('to', 'be', 'or', 'not'))}")
print(f"Example: Count for bigram ('airplane', 'house', 'car', 'dog') is: {quadgrams.get(('airplane', 'house', 'car', 'dog'))}")

Example: Count for bigram ('to', 'be', 'or', 'not') is: {'occurrence': 1, 'next_words': {'to': 1}}
Example: Count for bigram ('airplane', 'house', 'car', 'dog') is: None


In [430]:
tokens = ('to', 'be', 'or', 'not')

In [431]:
top_suggestions = from_bigram_to_next_token_counts(tokens, quadgrams)
print(f"Top suggestions: {top_suggestions}")

Top suggestions: [('to', 1)]


In [432]:
top_suggestions = from_bigram_to_next_token_probs(tokens, quadgrams)
print(f"Top suggestions for token: {tokens}\nare: {top_suggestions[1]}\nWith total number of possible words after the bigram of {top_suggestions[0]}")

Top suggestions for token: ('to', 'be', 'or', 'not')
are: [('to', 1.0)]
With total number of possible words after the bigram of 1


In [433]:
next_token = sample_next_token(tokens, quadgrams)
print(f"Sampled next token for bigram {tokens}: {next_token}")

Sampled next token for bigram ('to', 'be', 'or', 'not'): to


In [436]:
generated_text = generate_text_from_bigram(tokens, quadgrams, max_length=20)
print(f"Generated text from bigram {tokens}:\n{' '.join(generated_text)}")

Generated text from bigram ('to', 'be', 'or', 'not'):
to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous


# Test case

In [452]:
goal = 'to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles'

In [453]:
def accuracy(goal, generated_text):
    """Return the fraction of goal words reproduced at the same position.

    Words are compared position by position; positions beyond the end of the
    generated text count as misses, and generated words beyond the length of
    the goal are ignored.

    Args:
        goal: Reference text as a whitespace-separated string.
        generated_text: Generated words, either as a sequence of words or as a
            whitespace-separated string (which is split the same way as
            ``goal``).

    Returns:
        float: Number of matching positions divided by the number of goal
        words, or ``0.0`` when ``goal`` contains no words.
    """
    goal_words = goal.split()
    if not goal_words:
        # Nothing to match against; avoid dividing by zero.
        return 0.0

    # A plain string would otherwise be compared character by character.
    if isinstance(generated_text, str):
        generated_text = generated_text.split()

    # zip stops at the shorter sequence, so no explicit truncation is needed.
    correct = sum(
        1
        for goal_word, generated_word in zip(goal_words, generated_text)
        if goal_word == generated_word
    )
    return correct / len(goal_words)

In [454]:
def test_grams():
    """Generate text with each n-gram model and print its accuracy.

    For the bigram, trigram and quadgram tables in turn, up to 30 words are
    generated from the opening words of the soliloquy, and the generated text
    is printed together with its accuracy against the module-level ``goal``.
    Reads the module-level ``bigrams``, ``trigrams``, ``quadgrams`` and
    ``goal``; returns nothing.
    """
    cases = (
        ("bigram", ('to', 'be'), bigrams),
        ("trigram", ('to', 'be', 'or'), trigrams),
        ("quadgram", ('to', 'be', 'or', 'not'), quadgrams),
    )
    for label, seed, table in cases:
        generated_text = generate_text_from_bigram(seed, table, max_length=30)
        print(f"Generated text from {label} {seed}:\n{' '.join(generated_text)}")
        print(f"Accuracy: {accuracy(goal, generated_text)}")
        print()
test_grams()

Generated text from bigram ('to', 'be'):
to be a villain kills my father s leave what says polonius lord polonius i would not this sir and therefore i forbid my tears but yet i hold my peace i
Accuracy: 0.058823529411764705

Generated text from trigram ('to', 'be', 'or'):
to be or not to crack the wind of me as if you would drive me into a towering passion horatio peace who comes here enter osric osric your lordship speaks most infallibly
Accuracy: 0.14705882352941177

Generated text from quadgram ('to', 'be', 'or', 'not'):
to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles
Accuracy: 1.0



# Human Evaluation

In [448]:
def gather_feedback():
    """Interactively demo the three n-gram models and collect user feedback.

    For the bigram, trigram and quadgram tables in turn, the user is asked for
    seed words, up to 30 words are generated from them and printed, and the
    user's free-text reaction is recorded. Reads the module-level ``bigrams``,
    ``trigrams`` and ``quadgrams`` tables.

    The typed seed is normalised the same way as the corpus (lower-cased and
    split into ``\\w+`` tokens), so capitalisation and punctuation in the input
    do not prevent a match. If the resulting seed is not a key of the table,
    no generation is attempted and only the seed words are echoed back.

    Returns:
        list: The three feedback strings, in bigram, trigram, quadgram order.
    """
    print("We will find text in Hamlet the famus play by William Shakespeare")
    print("The code will ask you to start a sentence with 2/3/4 words and I will complete the sentence")
    print('lets start')

    rounds = (("TWO", bigrams), ("THREE", trigrams), ("FOUR", quadgrams))
    feedbacks = []
    for count_label, table in rounds:
        raw_seed = input(f"Please enter the first {count_label} words of the sentence that Shakespeare will complete:\n")
        # Mirror the corpus preprocessing: lower-case, keep only word characters
        seed = tuple(re.findall(r"\w+", raw_seed.lower()))

        if seed in table:
            generated_text = generate_text_from_bigram(seed, table, max_length=30)
        else:
            # Unknown seed (typo, wrong number of words, empty input):
            # nothing can be generated, so show the seed on its own.
            generated_text = list(seed)

        print(f"-------------\nGenerated text from digital Shakespeare:\n{' '.join(generated_text)}\n")
        feedbacks.append(input("Did Shakespeare complete the sentence correctly? How do you feel about this digital poet? Does it resemble the master?\n"))

    return feedbacks

In [449]:
feedbacks = gather_feedback()

We will find text in Hamlet the famus play by William Shakespeare
The code will ask you to start a sentence with 2/3/4 words and I will complete the sentence
lets start
-------------
Generated text from digital Shakespeare:
to be a couch for luxury and damned light to their graves like beds fight for a state but to the king s ears hamlet he was mad he shall sir an

-------------
Generated text from digital Shakespeare:
to be or not to crack the wind of the poor phrase running it thus you ll tender me a fool ophelia my lord he s going to his mother s closet behind

-------------
Generated text from digital Shakespeare:
to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles



In [451]:
print('My roomate tested the digital poet.And I ask him to give me feedbacks\n\n-------------')
feedbacks

My roomate tested the digital poet.And I ask him to give me feedbacks

-------------


["this is not even close to what I was expecting! This digital poet is not starting well... As I thought... machine can't replace and not even eìimitate humans",
 'Mmmmh it started good, but then it start pasting random words... Gringo what is this ?!?!',
 'Wow!! This are the exact words I was expecting... but still... everyone can find the test and start reading it... ']