# Homework 3

### Step 1: Import necessary libraries

In [1]:
import nltk.data
from os import listdir
from os.path import isfile, join
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()

In [2]:
# This will be the corpus we work from
from nltk.corpus import reuters

In [3]:
# I will assume you are using Spacy as a default entity recognizer.
import spacy

In [4]:
# note, the model load can be odd. In some instances your model might have the full name or the short name here.
# if you run into issues here, check the spacy model page at https://spacy.io/usage/models
nlp = spacy.load("en_core_web_sm")

### Step 2: FIll in the following function to extract the entity, document id, and relevant sentence text from the input

In [22]:
def extract_entities(doc_id, doc_text):
    analyzed_doc = nlp(doc_text)
    
    doc_persons = {}
    doc_locations = {}
    
    for entity in analyzed_doc.ents:
        if entity.text.strip() != "":
            # The .label_ property will provide information on the type of entity tagged
            #print(" -> ", entity.label_)
            # The .text property will display the actual text of the entity in the text
            #print("->", entity.text.strip(), "<-")
            # You can also access the sentence that the entity is contained in by using the .sent property
            # inside the sentence you can then use the .text property
            #print("->", entity.sent.text, "<-")
            
            # add the relevant document id and sentence to the entity record
            if entity.label_ == "PERSON":
                if entity.text.strip() not in doc_persons.keys():
                    doc_persons[entity.text.strip()] = [doc_id,[entity.sent.text]]
                else:
                    doc_persons[entity.text.strip()][1].append(entity.sent.text)
                    
            if entity.label_ in ["LOC","GPE"]: # Here, I considered both geopolitical and non_geopolitical locations.
                if entity.text.strip() not in doc_locations.keys():
                    doc_locations[entity.text.strip()] = [doc_id,[entity.sent.text]]
                else:
                    doc_locations[entity.text.strip()][1].append(entity.sent.text)
            
    return doc_persons, doc_locations

### Step 3: Adjust the following code to run the document entity extraction function

### Also, add the entity records you are constructing to your master list of entities

In [23]:
num_docs = len(reuters.fileids())

# these two dictionaries will incorporate all the referneces to 
combined_persons = {}
combined_locations = {}

for doc_id in reuters.fileids(): 
    # this doc_text variable will give you a text version of the news article. This could be tokenized.
    persons, locations = extract_entities(doc_id, reuters.open(doc_id).read())
    
    # you will need to write something here to put the persons and locations found in a document into the 
    # combined_persons and combined_locations dictionaries.
    # here you will need to consider how to extend the values already in the dictionaries
    # maybe something like:
    
    for person in persons.keys():
        if person not in combined_persons.keys():
            combined_persons[person] = [persons[person]]
        else:
            combined_persons[person].append(persons[person])
            
    for location in locations.keys():
        if location not in combined_locations.keys():
            combined_locations[location] = [locations[location]]
        else:
            combined_locations[location].append(locations[location])


### Step4: Fill in the following method to look through the content of an entity dictionary to determine the most popular based on number of mentions

In [8]:
import operator

In [24]:
# now that we have the text associated with the entities, 
# you will want to focus on the 500 top entities in each category
# Identify the top 500 entities by the count of their occurrences

def find_most_popular_entities(entity_dictionary):
    list_of_dictionary_keys_with_most_mentions = []
    temp_dict = {}
    # sort through the entities in the dictionary by the number of sentences
    for entity_key in entity_dictionary.keys():
        count = 0
        for doc in entity_dictionary[entity_key]:
            count += len(doc[1])
        temp_dict[entity_key] = count
    
    sorted_temp_dict = sorted(temp_dict.items(), key=operator.itemgetter(1), reverse=True)
    
    for item in sorted_temp_dict[:500]:
        list_of_dictionary_keys_with_most_mentions.append(item[0])
    
    return list_of_dictionary_keys_with_most_mentions


### Step 5: Now invoke your top entity mention finder

In [25]:
# simply get the top persons and locations
top_persons = find_most_popular_entities(combined_persons)
top_locations = find_most_popular_entities(combined_locations)

### Step 6: Analyze the most popular entities to determine what words they most frequently occur with

In [26]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
from nltk.stem.porter import *
porter_stemmer = PorterStemmer()

In [27]:
# This function is used for identifying all possible noise words
min_token_length = 2

def isNoise(token):     
    is_noise = False
    if porter_stemmer.stem(token) in stopWords:
        is_noise = True
    elif len(token.strip()) < min_token_length:
        is_noise = True
    elif re.findall(r"\,|\`|\'", token):
        is_noise = True
    return is_noise 

In [33]:
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return [occurence_count.most_common(4)[0],occurence_count.most_common(4)[1],
            occurence_count.most_common(4)[2],occurence_count.most_common(4)[3]]  

In [35]:
# use these two dictionaries to store the most frequent terms associated with the entities
person_most_popular_terms = {}
location_most_popular_terms = {}

# finally, now find the most frequent tokens associated with the entities
for person in top_persons:
    temp_words = []
    for doc in combined_persons[person]:
        for sent in doc[1]:
            temp_words += treebank_tokenizer.tokenize(str(sent))
    temp_words  = [word.strip("\\") for word in temp_words if not isNoise(word)] 
    person_most_popular_terms[person] = []
    for pair in most_frequent(temp_words):  
        if (pair[0] != person) & (len(person_most_popular_terms[person]) < 3):
            person_most_popular_terms[person].append(pair)
    
for location in top_locations:
    temp_words = []
    for doc in combined_locations[location]:
        for sent in doc[1]:
            temp_words += treebank_tokenizer.tokenize(sent)
    temp_words  = [word.strip("\\") for word in temp_words if not isNoise(word)] 
    location_most_popular_terms[location] = []
    for pair in most_frequent(temp_words):
        if (pair[0] != location) & (len(location_most_popular_terms[location]) < 3):
            location_most_popular_terms[location].append(pair)        

### Step 7: Present your results of the most popular entities and their associated terms

In [36]:
# My way of finding the associated terms is to find the top 3 most frequent terms which cooccur with the certain entity in the same sentences.
print(person_most_popular_terms)

{'Oper': [('vs', 1377), ('cts', 853), ('loss', 739)], 'Record': [('cts', 809), ('vs', 411), ('April', 405)], 'Reagan': [('said', 171), ('U.S.', 140), ('President', 124)], 'Baker': [('said', 115), ('was', 52), ('Hughes', 35)], 'LOSS': [('loss', 263), ('vs', 172), ('cts', 139)], '4TH': [('vs', 196), ('cts', 147), ('QTR', 132)], 'Lawson': [('said', 78), ('was', 28), ('would', 24)], 'NET': [('vs', 253), ('cts', 176), ('Shr', 111)], 'Avg': [('vs', 251), ('mln', 163), ('billion', 129)], 'James Baker': [('James', 95), ('Treasury', 94), ('Baker', 93)], 'PAYOUT': [('cts', 57), ('SETS', 32), ('lt', 30)], 'Qtr': [('vs', 134), ('cts', 85), ('Shr', 45)], 'Poehl': [('said', 22), ('would', 15), ('rate', 14)], 'BAKER': [('TREASURY', 38), ('SAYS', 28), ('U.S.', 20)], 'Yeutter': [('said', 48), ('U.S.', 24), ('trade', 16)], 'Revs': [('vs', 121), ('loss', 72), ('mln', 54)], 'Clayton Yeutter': [('U.S.', 61), ('Trade', 45), ('Yeutter', 43)], 'Kiichi Miyazawa': [('Finance', 41), ('Minister', 38), ('Kiichi', 




In [37]:
print(location_most_popular_terms)

{'U.S.': [('said', 1861), ('trade', 575), ('dlrs', 500)], 'Shr': [('vs', 2889), ('cts', 1932), ('loss', 1854)], 'Japan': [('said', 648), ('U.S.', 434), ('trade', 322)], 'U.K.': [('said', 142), ('MONEY', 103), ('MARKET', 102)], 'Brazil': [('said', 224), ('mln', 99), ('dlrs', 78)], 'the United States': [('United', 411), ('States', 410), ('said', 227)], 'Paris': [('said', 182), ('accord', 93), ('nations', 90)], 'Canada': [('said', 147), ('U.S.', 93), ('pct', 70)], 'China': [('said', 169), ('tonnes', 78), ('mln', 64)], 'Washington': [('said', 140), ('U.S.', 90), ('trade', 84)], 'London': [('said', 145), ('market', 52), ('was', 44)], 'West Germany': [('West', 280), ('Germany', 277), ('said', 123)], 'New York': [('New', 305), ('York', 282), ('said', 157)], 'Taiwan': [('U.S.', 150), ('said', 115), ('billion', 91)], 'Iran': [('said', 150), ('Gulf', 71), ('U.S.', 67)], 'Britain': [('said', 101), ('Japan', 77), ('France', 65)], 'Gulf': [('said', 126), ('U.S.', 83), ('oil', 58)], 'JAPAN': [('Japa




### Output Analysis

##### From the output above, we can see all top 500 popular persons and locations in Reuters with their top 3 frequently associated terms. Despite little number of meaningless words like "said", "it", etc, the majority of the associated terms are informatically useful. In addition, we can see there is strong association between popular persons and locations, which leads me to choose the first one in the Extra Credit questions.

## Extra Credit

### Determine which persons and locations most frequently occur in the same sentences.

In [61]:
# Here, I want to store all possible occurred locations associated with a person and vice versa.
locations_associated_with_persons = {}
persons_associated_with_locations ={}

for person in person_most_popular_terms.keys():
    locations_associated_with_persons[person] = []
    all_sentences = []
    for doc in combined_persons[person]:
        for sent in doc[1]:
            all_sentences.append(sent)
    analyzed_all_sents = nlp(str(all_sentences))
    for entity in analyzed_all_sents.ents:
        if entity.text.strip() != "":
            if entity.label_ in ["LOC","GPE"]:
                locations_associated_with_persons[person].append(entity.text.strip())
                
for location in location_most_popular_terms.keys():
    persons_associated_with_locations[location] = []
    all_sentences = []
    for doc in combined_locations[location]:
        for sent in doc[1]:
            all_sentences.append(sent)
    analyzed_all_sents = nlp(str(all_sentences))
    for entity in analyzed_all_sents.ents:
        if entity.text.strip() != "":
            if (entity.label_ == "PERSON") & (entity.text.strip()!= '\\n\\n'):
                persons_associated_with_locations[location].append(entity.text.strip())

In [62]:
# Find out the most frequently cooccurred location with a person and vice versa.
most_freq_locations_associated_with_persons = {}
most_freq_persons_associated_with_locations = {}

for person in locations_associated_with_persons.keys():
    occurence_count = Counter(locations_associated_with_persons[person])
    if len(occurence_count.most_common(1)) > 0:
        most_freq_locations_associated_with_persons[person] = occurence_count.most_common(1)[0]
    
for location in persons_associated_with_locations.keys():
    occurence_count = Counter(persons_associated_with_locations[location])
    if len(occurence_count.most_common(1)) > 0:
        most_freq_persons_associated_with_locations[location] = occurence_count.most_common(1)[0]

In [63]:
most_freq_locations_associated_with_persons

{'Oper': ('dlrs\\n', 221),
 'Record': ('Mthly', 18),
 'Reagan': ('U.S.', 99),
 'Baker': ('U.S.', 15),
 'LOSS': ('Shr', 84),
 '4TH': ('Shr', 44),
 'Lawson': ('the United States', 5),
 'NET': ('Shr', 54),
 'Avg': ('dlrs\\n', 19),
 'James Baker': ('U.S.', 34),
 'PAYOUT': ('Qtly', 4),
 'Qtr': ('Shr', 4),
 'Poehl': ('and\\n', 4),
 'BAKER': ('U.S.', 18),
 'Yeutter': ('U.S.', 16),
 'Revs': ('dlrs\\n', 19),
 'Clayton Yeutter': ('U.S.', 36),
 'Kiichi Miyazawa': ('Japan', 13),
 'Allegheny': ('Sunter', 1),
 'Johnson': ('U.S.', 4),
 'Brown': ('dlrs\\n', 1),
 'DIVIDEND': ('HIKES', 1),
 'TONNES': ('U.S.', 9),
 'Icahn': ('Piedmont', 3),
 'Crazy Eddie': ('Crazy Eddie', 4),
 'Yasuhiro Nakasone': ('Japan', 11),
 'Williams': ('Norcros', 2),
 'Prev Wk': ('E.O.A', 4),
 'Bass': ('Texas', 5),
 'Reuter': ('Japan', 2),
 'REAGAN': ('U.S.', 11),
 '3RD': ('Shr', 10),
 '3RD QTR FEB': ('Shr', 4),
 'Baldrige': ('U.S.', 9),
 'Nigel Lawson': ('U.K.', 8),
 'Miyazawa': ('Japan', 4),
 'Subroto': ('Indonesia', 3),
 'Richa

In [64]:
most_freq_persons_associated_with_locations

{'U.S.': ('the\\n', 107),
 'Shr': ('mln\\n', 262),
 'Japan': ('Reagan', 49),
 'U.K.': ('LAWSON', 16),
 'Brazil': ('the\\n', 12),
 'the United States': ('the\\n', 18),
 'Paris': ('Baker', 15),
 'Canada': ('by\\n', 8),
 'China': ('mln\\n', 8),
 'Washington': ('the\\n', 10),
 'London': ('Lawson', 5),
 'West Germany': ('the\\n', 11),
 'New York': ('the\\n', 7),
 'Taiwan': ('by\\n', 5),
 'Iran': ('the\\n', 13),
 'Britain': ('Lawson', 8),
 'Gulf': ('Reagan', 16),
 'JAPAN': ('an\\n', 3),
 'Tokyo': ('the\\n', 8),
 'France': ('Kiichi Miyazawa', 5),
 'Indonesia': ('the\\n', 8),
 'Ecuador': ('Javier Espinosa', 9),
 'Australia': ('the\\n', 11),
 'Europe': ('he\\n', 2),
 'Nine': ('Avg', 26),
 'Texas': ('Jim Wright', 4),
 'U.S': ('the\\n', 6),
 'Italy': ('the\\n', 4),
 'the Soviet Union': ('Reagan', 7),
 'Saudi Arabia': ('Nazer', 9),
 'South Korea': ('the\\n', 4),
 'Iraq': ('the\\n', 4),
 'Avg': ('mln\\n', 89),
 'Malaysia': ('the\\n', 3),
 'India': ('Cargill', 3),
 'Kuwait': ('Kuwait', 9),
 'Colombi

### Output Analysis

##### Here I identified the most frequent location cooccurred with a certain person in the same sentences and vice versa. From the output above, for example, the name "Reagan" cooccurred with "U.S." most and it's 99 times. Similar cases exist like "Yasuhiro Nakasone" and "Japan", "James Baker" and "U.S.", which all make a lot of sense. However, the output for my second dictionary is not ideal, where I need to get rid of the noisy words.  
##### If I was given further time, I will do two things: First, I want to consider all frequent terms rather than top 3, although under many cases top 3 would be enough. Second, I will go back and figure out how to improve my code to be able to get rid of the noisy words showing up in my "most_freq_persons_associated_with_locations".

##### Thank you! Best, Guangji.