In [1]:
# Import the dependencies
import spacy
from nltk.corpus import reuters
from spacy import displacy
from collections import Counter
import pandas as pd

# Load the small English language model for spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Analyze a sentence using spacy
doc = nlp(u"""Patrick Mahomes is a quarterback for the Kansas City Chiefs in the American Conference, 
which is one of two conferences in the National Football League.""")

# Access the tagged entities with .text and .label_
[ent.text +" ---> "+ ent.label_ for ent in doc.ents]

['Patrick Mahomes ---> PERSON',
 'the Kansas City Chiefs ---> ORG',
 'the American Conference ---> ORG',
 'one ---> CARDINAL',
 'two ---> CARDINAL',
 'the National Football League ---> ORG']

In [3]:
# Get all the categories in the Reuters corpus. 
categories = reuters.categories()
print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [4]:
# Locate and store a single article from the Reuters stories with the category "coffee".
article = reuters.raw(fileids = reuters.fileids(categories='coffee')[3])
print(article)

COLOMBIA COFFEE REGISTRATIONS REMAIN OPEN
  Colombia's coffee export registrations
  remain open and there are no plans to close them since a new
  marketing policy means an unlimited amount can be registered,
  Gilberto Arango, president of the private exporters'
  association said.
      "The philosophy of the new policy is not to close
  registrations. Nobody so far said may would be closed," he told
  Reuters.
      On March 13, Colombia opened registrations for April and
  May for an unlimited amount.
      Without giving breakdowns, Arango said private exporters
  had registered 1,322,804 bags this calendar year up to April 6,
  or roughly 440,000 bags per month, slightly lower than the
  average in recent years.
      He estimated the amount of bags registered by the national
  coffee growers' federation at about the same, meaning a total
  of about 900,000 bags registered and sold per month by
  Colombia.
      "The only change that could happen is, because of the
  volume, we 

In [5]:
# Analyze the article with spacy
doc = nlp(article)

# Render NER visualization with displacy to determine entities for extraction
displacy.render(doc, style='ent')

In [6]:
# Store all Reuters articles with category "coffee".
articles = reuters.raw(categories='coffee')

# Analyze the articles with spaCy
doc = nlp(articles)

In [7]:
# Extract geopolitical "GPE" and organizational entities "ORG" using a list comprehension.
geo_org_entities = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'ORG']]

In [8]:
# Print the first 20 entities.
geo_org_entities[:20]

['INDONESIAN COMMODITY',
 'The Indonesian Commodity Exchange',
 'Reuters',
 'CPO',
 'Trade',
 'Indonesia',
 'Trade',
 'Indonesia',
 'Nainggolan',
 'South Korea',
 'Taiwan',
 'Mexico',
 'Colombia',
 'the Foreign Trade\n  Institute',
 'Colombia',
 'The National Planning Department',
 'Colombia',
 "National Coffee Growers' Federation",
 'New York',
 'Colombia']

In [9]:
# Using a list comprehension convert each entity to lowercase and remove the newline character. 
entities = [i.lower().replace('\n','') for i in geo_org_entities]

# Print the entities
print(entities)

['indonesian commodity', 'the indonesian commodity exchange', 'reuters', 'cpo', 'trade', 'indonesia', 'trade', 'indonesia', 'nainggolan', 'south korea', 'taiwan', 'mexico', 'colombia', 'the foreign trade  institute', 'colombia', 'the national planning department', 'colombia', "national coffee growers' federation", 'new york', 'colombia', 'reuters', 'colombia', 'arango', 'colombia', 'umuarama nil', 'paranavai nil', 'nil', 'nil', 'sao paulo', 'nil', 'votuporanga nil', 'nil', 'sao', 'nil', 'sao simao nil', 'nil', 'tres', 'generally', 'nairobi', 'ab  ', 'the coffee board', 'kenya', 'kenya', 'the national planning department', 'colombia', 'the national planning department', 'colombia', 'uganda', 'uganda', 'coffee marketing board', 'cmb', 'ugandan', 'cmb', 'mombasa', 'uganda', 'new vision', 'cmb', 'kampala', 'uganda', 'cmb', 'uganda', 'u.s.', 'co inc  ', 'colombia', "national coffee growers'", 'colombia', 'guatemala', 'the  international coffee organisation', 'ico', 'mexico', 'the dominican 

In [10]:
# Create a variable, most_freq_entities, that stores the most frequent entities 
# using the most_common() function from the Counter module.
most_freq_entities = Counter(entities).most_common()

# Print the first 10 most frequent entities
print(most_freq_entities[:10])

[('brazil', 172), ('ico', 125), ('u.s.', 84), ('colombia', 81), ('london', 59), ('ibc', 41), ('reuters', 32), ('indonesia', 28), ('india', 25), ('uganda', 21)]


In [11]:
# Use list comprehensions to retrieve each entity and the number of occurrences for each entity in separate lists.
entity = [most_freq_entities[i][0] for i, _ in enumerate(most_freq_entities)]
frequency = [most_freq_entities[i][1] for i, _ in enumerate(most_freq_entities)]

In [12]:
# Create a DataFrame that has columns to hold each entity and the number of times each entity appears.
common_entities_df = pd.DataFrame(
    {
        'entity':entity,
        'frequency':frequency
    }
)

# Sort the DataFrame
common_entities_df.sort_values(by=['frequency'], ascending=False).reset_index(drop=True)

# Display the first ten rows. 
common_entities_df.head(10)

Unnamed: 0,entity,frequency
0,brazil,172
1,ico,125
2,u.s.,84
3,colombia,81
4,london,59
5,ibc,41
6,reuters,32
7,indonesia,28
8,india,25
9,uganda,21


In [13]:
# Display the last ten rows. 
common_entities_df.tail(10)

Unnamed: 0,entity,frequency
410,the international coffee agreement,1
411,quotas,1
412,the national coffee growers' federation,1
413,back,1
414,syndarma,1
415,resettle,1
416,farms,1
417,the federal statistics office,1
418,attend ico executive board,1
419,dutch coffee roasters' association,1
