In [None]:
# Name: Edilberto F. Carrizales
# Date: Thurs, Feb 22, 2024

In [None]:
# installing nltk (Natural Language ToolKit) Library
%pip install nltk

Python interpreter will be restarted.
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25 tqdm-4.66.2
Python interpreter will be restarted.


In [None]:
# 1. WordCount for Named Entities
# In this part we will compute the word frequency for named entities in a large file.
import nltk
import math
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

book_rdd = sc.textFile("dbfs:/FileStore/tables/PrideAndPrejudice.txt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# We use map and word_tokenize to split the whole text into tokens (results is list of lists where each list is a sentence that has been turned into tokens)
book_words_rdd = book_rdd.map(lambda text: word_tokenize(text))
#book_words_rdd.collect()

In [None]:
# the each token in each of the lists is then tagged based on the type of word it is
tagged_words_rdd = book_words_rdd.map(lambda token: nltk.pos_tag(token))
#tagged_words_rdd.collect()

In [None]:
# after they are tagged, they are turned into a tree like structure this can allow us to determine named entities
entities_rdd = tagged_words_rdd.map(lambda tagged: nltk.ne_chunk(tagged))
#entities_rdd.collect()

In [None]:
# will turn the nltk tree structure into a tuple having: (word, position, chunk)
tree_tagged_tuples = entities_rdd.map(lambda tree: nltk.chunk.tree2conlltags(tree))
#tree_tagged_tuples.collect()

In [None]:
# helper function to collect the named entities from the tuples we got from the tree
def get_named_entities(tags):
    named_entities = [] # complete named entity
    current_entity = [] # current named entity (the reason for this is because there could be extended named entities such as "Mr. Darcy" or "Rocky Balboa")

    # we will loop through each tuple tag (each tuple has a word, a position, and a chunk)
    for word, pos, chunk in tags:
        # if the chunk == "O" that means it is NOT a named entity, if it is != "O", then it is a named entity 
        # (Note: "O" stands for "Outside" and is used to mark words that are not part of any named entity)
        if chunk != "O":
            current_entity.append(word) # add the current entity
        
        # if the next entity is NOT a named entity, then we have to add it to the current entity because it means it is an extended entity
        elif chunk == "O" and len(current_entity) != 0: 
            named_entities.append(" ".join(current_entity))
            current_entity = [] # clear current entity

    # this is just to handle the end of a sentence, in case there is an entity at the end
    if len(current_entity) != 0:
        named_entities.append(" ".join(current_entity))
    
    return named_entities

# runs our helper function on the cluster
named_entities = tree_tagged_tuples.map(get_named_entities)
#named_entities.collect()

In [None]:
# now that we have our named entities, we can call flatMap to put all named entities in a single list
named_entities = named_entities.flatMap(lambda entity: entity)
#named_entities.collect()

In [None]:
# next we give a count of 1 to each named entity
named_entities = named_entities.map(lambda x: (x, 1))
#named_entities.collect()

In [None]:
# finally we reduce by key meaning me "count the number of keys" and put the counter next to each key
named_entities = named_entities.reduceByKey(lambda x, y: x + y)
#named_entities.collect()

In [None]:
# we sort the count of each named entity in descending order from most frequent at the top to least frequent at the bottom.
named_entities.sortBy(lambda x: -x[1]).collect()

Out[10]: [('Elizabeth', 612),
 ('Jane', 293),
 ('Mr. Darcy', 245),
 ('Lydia', 151),
 ('Mr. Collins', 141),
 ('Darcy', 137),
 ('Bingley', 123),
 ('Wickham', 117),
 ('Mr. Bingley', 106),
 ('Lady Catherine', 103),
 ('Mr. Bennet', 85),
 ('Bennet', 84),
 ('Longbourn', 83),
 ('Miss Bingley', 79),
 ('Lizzy', 77),
 ('Kitty', 70),
 ('Netherfield', 70),
 ('Charlotte', 69),
 ('Mr. Wickham', 60),
 ('Miss Bennet', 57),
 ('Meryton', 57),
 ('London', 56),
 ('Pemberley', 51),
 ('Rosings', 39),
 ('Miss Darcy', 38),
 ('Hertfordshire', 38),
 ('George', 35),
 ('Mary', 34),
 ('Sir William', 31),
 ('Mr. Gardiner', 31),
 ('Miss', 30),
 ('Collins', 28),
 ('Bourgh', 28),
 ('Project', 28),
 ('Gardiner', 28),
 ('Catherine', 26),
 ('Derbyshire', 25),
 ('Brighton', 24),
 ('Colonel', 24),
 ('Miss Lucas', 23),
 ('Miss Austen', 22),
 ('Mr.', 21),
 ('Hunsford', 21),
 ('Colonel Fitzwilliam', 21),
 ('Lady', 19),
 ('Maria', 18),
 ('Project Gutenberg', 17),
 ('Lady Lucas', 16),
 ('Philips', 14),
 ('Georgiana', 14),
 ('Col