# imports

In [1]:
import en_core_web_lg
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Get food data and organize

In [2]:
# Path to the food CSV, kept in one named constant so it is easy to repoint
# on another machine (the value is the author's local download location).
FOOD_CSV_PATH = "/Users/ElizabethHealy/Downloads/food.csv"

# read in the food csv file
food_df = pd.read_csv(FOOD_CSV_PATH)

# preview the first rows (rich display as the cell's last expression)
food_df.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,356425,branded_food,MOCHI ICE CREAM BONBONS,,2019-04-01
1,356426,branded_food,CHIPOTLE BARBECUE SAUCE,,2019-04-01
2,356427,branded_food,HOT & SPICY BARBECUE SAUCE,,2019-04-01
3,356428,branded_food,BARBECUE SAUCE,,2019-04-01
4,356429,branded_food,BARBECUE SAUCE,,2019-04-01


In [3]:
# Keep only descriptions made purely of ASCII letters and spaces, lower-cased.
# Missing descriptions are treated as "contains a forbidden character" so they drop out.
has_forbidden_char = food_df["description"].str.contains(r"[^a-zA-Z ]", na=True)
foods = food_df.loc[~has_forbidden_char, "description"].str.lower()

# <=3 words, no dups
at_most_three_words = foods.str.split().str.len() <= 3
foods = foods[at_most_three_words].drop_duplicates()

In [4]:
# Bucket the food names by how many whitespace-separated words they contain.
words_per_food = foods.str.split().str.len()
one_worded_foods = foods[words_per_food == 1]
two_worded_foods = foods[words_per_food == 2]
three_worded_foods = foods[words_per_food == 3]

In [6]:
# All one-worded foods are kept and are meant to be 45% of the final mix,
# so the target size of the whole set is derived from their count.
total_num_foods = round(one_worded_foods.size / 45 * 100)

# mix: shuffle so the slices taken below are random samples
two_worded_foods = two_worded_foods.sample(frac=1)
three_worded_foods = three_worded_foods.sample(frac=1)

# combine: every one-worded food + 30% two-worded + 25% three-worded.
# Series.append was removed in pandas 2.0; pd.concat is the supported way
# to stack Series and yields the same result.
foods = pd.concat([
    one_worded_foods,
    two_worded_foods[:round(total_num_foods * 0.30)],
    three_worded_foods[:round(total_num_foods * 0.25)],
])

# report how many entities of each word length ended up in the mix
for i in range(3):
    print(f"{i+1}-worded food entities:", foods[foods.str.split().apply(len) == i + 1].size)

1-worded food entities: 1231
2-worded food entities: 821
3-worded food entities: 684


In [24]:
# Sentence templates used to synthesise FOOD-annotated sentences.
# Every "{}" is a placeholder that the generation loop replaces with one food
# name; each template contains one, two or three placeholders.
food_templates = [
    "I ate my {}",
    "I'm eating a {}",
    "I just ate a {}",
    "I only ate the {}",
    "I'm done eating a {}",
    "I've already eaten a {}",
    "I just finished my {}",
    "When I was having lunch I ate a {}",
    "I had a {} and a {} today",
    "I ate a {} and a {} for lunch",
    "I made a {} and {} for lunch",
    "I ate {} and {}",
    "today I ate a {} and a {} for lunch",
    "I had {} with my husband last night",
    "I brought you some {} on my birthday",
    "I made {} for yesterday's dinner",
    "last night, a {} was sent to me with {}",
    "I had {} yesterday and I'd like to eat it anyway",
    "I ate a couple of {} last night",
    "I had some {} at dinner last night",
    "Last night, I ordered some {}",
    "I made a {} last night",
    "I had a bowl of {} with {} and I wanted to go to the mall today",
    "I brought a basket of {} for breakfast this morning",
    "I had a bowl of {}",
    "I ate a {} with {} in the morning",
    "I made a bowl of {} for my breakfast",
    "There's {} for breakfast in the bowl this morning",
    "This morning, I made a bowl of {}",
    "I decided to have some {} as a little bonus",
    "I decided to enjoy some {}",
    "I've decided to have some {} for dessert",
    "I had a {}, a {} and {} at home",
    "I took a {}, {} and {} on the weekend",
    "I ate a {} with {} and {} just now",
    "Last night, I ate an {} with {} and {}",
    "I tasted some {}, {} and {} at the office",
    "There's a basket of {}, {} and {} that I consumed",
    "I devoured a {}, {} and {}",
    "I've already had a bag of {}, {} and {} from the fridge"
]

In [25]:
# to store data
# Generated sentences are bucketed by how many food entities they contain;
# the train and test containers share the same three bucket names.
TRAIN_FOOD_DATA = {bucket: [] for bucket in ("one_food", "two_foods", "three_foods")}

TEST_FOOD_DATA = {bucket: [] for bucket in ("one_food", "two_foods", "three_foods")}

In [26]:
# Maximum number of sentences kept per bucket in the training set.
FOOD_SENTENCE_LIMIT = 167

def get_food_data(count):
    """Return the list that the next sentence with ``count`` foods belongs in.

    The training bucket is returned while it holds fewer than
    FOOD_SENTENCE_LIMIT sentences; after that the matching test bucket is
    returned instead.

    Args:
        count: number of food entities in the sentence (1, 2 or 3).

    Returns:
        The list object from TRAIN_FOOD_DATA or TEST_FOOD_DATA (not a copy),
        so callers can append to it directly.

    Raises:
        KeyError: if ``count`` is not 1, 2 or 3.
    """
    bucket = {1: "one_food", 2: "two_foods", 3: "three_foods"}[count]
    train_bucket = TRAIN_FOOD_DATA[bucket]
    if len(train_bucket) < FOOD_SENTENCE_LIMIT:
        return train_bucket
    return TEST_FOOD_DATA[bucket]

# Placeholder token inside each template that is swapped for a food name.
pattern_to_replace = "{}"

# Shuffle the food names so they are consumed in random order.
foods = foods.sample(frac=1)

# Cursor into `foods`: starts at the last position and is counted down
# as food names are used up.
food_entity_count = len(foods) - 1

In [27]:
# Build annotated sentences until fewer than three food names are left
# (a template can consume up to three names, taken from the end of `foods`).
while food_entity_count >= 2:
    entities = []

    # pick a random template
    sentence = random.choice(food_templates)

    # one match per "{}" placeholder in the template
    matches = re.findall(pattern_to_replace, sentence)

    for match in matches:
        food = foods.iloc[food_entity_count]
        food_entity_count -= 1

        # Record where the placeholder sits BEFORE substituting it. Searching
        # the finished sentence for the food text instead would return the
        # first occurrence, which can be template wording or a previously
        # inserted food (e.g. "chips" inside "potato chips") and so yield a
        # wrong or duplicated entity span.
        start = sentence.find(match)
        sentence = sentence.replace(match, food, 1)

        entities.append((start, start + len(food), "FOOD"))

    # file the sentence under train or test depending on how full train is
    get_food_data(len(matches)).append((sentence, {"entities": entities}))

In [28]:
# Show the size of every training bucket together with its first sample.
for key, samples in TRAIN_FOOD_DATA.items():
    print("{} {} sentences: {}".format(len(samples), key, samples[0]))

167 one_food sentences: ('I ate my tropicals pops', {'entities': [(9, 23, 'FOOD')]})
167 two_foods sentences: ('last night, a chips was sent to me with ju ju hearts', {'entities': [(14, 19, 'FOOD'), (40, 52, 'FOOD')]})
167 three_foods sentences: ('I devoured a breakfast sausage kit, pickle and choco grains biscuits', {'entities': [(13, 34, 'FOOD'), (36, 42, 'FOOD'), (47, 68, 'FOOD')]})


In [29]:
# Show the size of every test bucket together with its first sample.
for key, samples in TEST_FOOD_DATA.items():
    print("{} {} items: {}".format(len(samples), key, samples[0]))

864 one_food items: ('I had fried potatoes with my husband last night', {'entities': [(6, 20, 'FOOD')]})
197 two_foods items: ('I ate cadbury and zebra cakes', {'entities': [(6, 13, 'FOOD'), (18, 29, 'FOOD')]})
158 three_foods items: ('I devoured a organic macaroni product, spearmint and whipped heavy cream', {'entities': [(13, 37, 'FOOD'), (39, 48, 'FOOD'), (53, 72, 'FOOD')]})


# Get Article Data and Organize

In [13]:
# Path to the news-articles CSV, kept in one named constant so it is easy to
# repoint on another machine (the value is the author's local download location).
ARTICLES_CSV_PATH = "/Users/ElizabethHealy/Downloads/articles1.csv"

articles_df = pd.read_csv(ARTICLES_CSV_PATH)

# preview the first rows
articles_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [14]:
# Load the en_core_web_lg spaCy pipeline; its "ner" component is the one
# that gets the extra FOOD label and further training later in the notebook.
nlp = en_core_web_lg.load()

In [16]:
# Collect medium-length sentences (41-79 characters) from the first 6000
# articles; only sentence segmentation is needed here, so tagger, NER and
# lemmatizer are switched off.
revision_texts = []

for doc in tqdm(nlp.pipe(articles_df["content"][:6000], batch_size=30, disable=["tagger", "ner", "lemmatizer"])):
    for sentence in doc.sents:
        if 40 < len(sentence.text) < 80:
            # Collapse every whitespace run to a single space. The pattern is a
            # raw string: "\s" in a normal literal is an invalid escape sequence.
            revision_texts.append(" ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE)))




In [19]:
# Annotate the collected sentences with the model's existing NER predictions;
# sentences with no entity at all are skipped.
revisions = []

ner_docs = nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser", "lemmatizer"])
for doc in tqdm(ner_docs):
    if not doc.ents:
        continue
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    revisions.append((doc.text, {"entities": spans}))




In [30]:
print(revisions[0][0])

print(revisions[0][1])

And across the Bronx, investigative resources are squeezed.
{'entities': [(15, 20, 'GPE')]}


In [31]:
# Annotated article sentences, split into train and test.
TRAIN_REVISION_DATA, TEST_REVISION_DATA = [], []

# Per-label entity tallies for each split.
TRAIN_ENTITY_COUNTER, TEST_ENTITY_COUNTER = {}, {}

# Per-label entity count above which a label starts voting a sentence into test.
REVISION_SENTENCE_SOFT_LIMIT = 100

In [32]:
def increment_revision_counters(entity_counter, entities):
    """Add one to ``entity_counter`` for the label of every entity, in place.

    Args:
        entity_counter: dict mapping entity label -> count; mutated in place.
        entities: iterable of entity records whose element at index 2 is the label.

    Returns:
        None.
    """
    for span in entities:
        label = span[2]
        entity_counter[label] = entity_counter.get(label, 0) + 1

In [33]:
random.shuffle(revisions)
for revision in tqdm(revisions):
    entities = revision[1]["entities"]

    # Every entity votes: -1 when its label is already above the soft limit
    # in the training tally, +1 otherwise.
    should_append_to_train_counter = sum(
        -1
        if (label in TRAIN_ENTITY_COUNTER
            and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT)
        else 1
        for _, _, label in entities
    )

    # A negative net vote sends the sentence to the test split; ties go to train.
    if should_append_to_train_counter < 0:
        target_data, target_counter = TEST_REVISION_DATA, TEST_ENTITY_COUNTER
    else:
        target_data, target_counter = TRAIN_REVISION_DATA, TRAIN_ENTITY_COUNTER

    target_data.append(revision)
    increment_revision_counters(target_counter, entities)




# Training

In [34]:
TRAIN_ENTITY_COUNTER

{'CARDINAL': 172,
 'DATE': 220,
 'EVENT': 101,
 'FAC': 101,
 'GPE': 165,
 'LANGUAGE': 61,
 'LAW': 50,
 'LOC': 101,
 'MONEY': 104,
 'NORP': 115,
 'ORDINAL': 104,
 'ORG': 177,
 'PERCENT': 101,
 'PERSON': 262,
 'PRODUCT': 101,
 'QUANTITY': 101,
 'TIME': 105,
 'WORK_OF_ART': 106}

In [35]:
TEST_ENTITY_COUNTER

{'CARDINAL': 5575,
 'DATE': 7713,
 'EVENT': 174,
 'FAC': 241,
 'GPE': 5891,
 'LANGUAGE': 18,
 'LAW': 7,
 'LOC': 554,
 'MONEY': 854,
 'NORP': 2468,
 'ORDINAL': 1071,
 'ORG': 5936,
 'PERCENT': 462,
 'PERSON': 16245,
 'PRODUCT': 219,
 'QUANTITY': 259,
 'TIME': 836,
 'WORK_OF_ART': 831}

In [36]:
# Flatten the three food buckets (one, two, three foods - in that order).
TRAIN_FOOD_DATA_COMBINED = [
    sample
    for bucket in ("one_food", "two_foods", "three_foods")
    for sample in TRAIN_FOOD_DATA[bucket]
]

print("FOOD", len(TRAIN_FOOD_DATA_COMBINED))

print("REVISION", len(TRAIN_REVISION_DATA))

# Final training set: article sentences first, then the food sentences.
TRAIN_DATA = [*TRAIN_REVISION_DATA, *TRAIN_FOOD_DATA_COMBINED]
print("COMBINED", len(TRAIN_DATA))

FOOD 501
REVISION 1434
COMBINED 1935


In [37]:
# Register the new label on the existing NER component.
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")

# Every pipeline component not named here is collected in `other_pipes`.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = []
for pipe_name in nlp.pipe_names:
    if pipe_name not in pipe_exceptions:
        other_pipes.append(pipe_name)

In [39]:
from spacy.training import Example

In [42]:
epochs = 30
optimizer = nlp.resume_training()
# Update only the components left enabled; everything in `other_pipes` is
# switched off for the duration of the block. select_pipes is the spaCy v3
# replacement for the deprecated disable_pipes.
with nlp.select_pipes(disable=other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    # Batch-size schedule; created once, so it is shared by all epochs
    # rather than restarted for each one.
    sizes = compounding(1.0, 4.0, 1.001)

    for epoch in tqdm(range(epochs)):
        # reshuffle the training set in place at the start of every epoch
        random.shuffle(TRAIN_DATA)
        batches = minibatch(TRAIN_DATA, size=sizes)
        losses = {}

        for batch in batches:
            # Convert the (text, annotations) pairs of this batch to Example
            # objects. A dedicated name is used so the dataset variable is
            # not rebound to a batch halfway through the epoch.
            batch_examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]
            nlp.update(batch_examples, sgd=optimizer, drop=0.35, losses=losses)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/30) {'ner': 2377.8630048280183}
Losses (2/30) {'ner': 1410.7387512310331}
Losses (3/30) {'ner': 1144.2987907257325}
Losses (4/30) {'ner': 1069.0466717254194}
Losses (5/30) {'ner': 919.1210094992783}
Losses (6/30) {'ner': 808.7935200928196}
Losses (7/30) {'ner': 732.6207742005149}
Losses (8/30) {'ner': 774.4898677134069}
Losses (9/30) {'ner': 724.468754730585}
Losses (10/30) {'ner': 601.6654691512047}
Losses (11/30) {'ner': 572.536107361625}
Losses (12/30) {'ner': 649.9299391393989}
Losses (13/30) {'ner': 584.3419035830849}
Losses (14/30) {'ner': 567.2139042014884}
Losses (15/30) {'ner': 531.8329892524305}
Losses (16/30) {'ner': 448.25248885973065}
Losses (17/30) {'ner': 516.5044994210555}
Losses (18/30) {'ner': 410.8888742322903}
Losses (19/30) {'ner': 404.0416318042879}
Losses (20/30) {'ner': 383.4759168651294}
Losses (21/30) {'ner': 492.532485882208}
Losses (22/30) {'ner': 388.9639382295721}
Losses (23/30) {'ner': 412.9968572465007}
Losses (24/30) {'ner': 390.8926224759371}

# Validation

In [43]:
spacy.displacy.render(nlp("Apple is looking at buying U.K. startup for $1 billion"), style="ent")

In [44]:
# Visual spot-check of the FOOD label on a few hand-written sentences.
for demo_sentence in (
    "I had a hamburger and chips for lunch today.",
    "I decided to have chocolate ice cream as a little treat for myself.",
    "I ordered basmati rice, leaf spinach and cheese from Tesco yesterday",
):
    spacy.displacy.render(nlp(demo_sentence), style="ent")

In [45]:
# dictionaries to hold our evaluation data

# tallies per sentence bucket (how many foods the sentence contains)
food_evaluation = {
    bucket: {"correct": 0, "total": 0}
    for bucket in ("one_food", "two_foods", "three_foods")
}

# tallies per entity length (how many words the food name has)
word_evaluation = {
    f"{n_words}_worded_foods": {"correct": 0, "total": 0}
    for n_words in (1, 2, 3)
}

In [46]:
# Score the model on the held-out food sentences: a gold entity counts as
# correct when the model predicts an entity with the same label and text.
for key in TEST_FOOD_DATA:
    # named `samples` so the `foods` Series built earlier is not overwritten
    samples = TEST_FOOD_DATA[key]

    for sentence, annotation in tqdm(samples):
        entities = annotation["entities"]

        # Run the pipeline once per sentence: the prediction does not depend
        # on which gold entity is being checked.
        doc = nlp(sentence)

        for entity in entities:
            correct_text = sentence[entity[0]:entity[1]]
            n_worded_food = len(correct_text.split())
            word_key = f"{n_worded_food}_worded_foods"

            found = any(
                ent.label_ == entity[2] and ent.text == correct_text
                for ent in doc.ents
            )
            if found:
                food_evaluation[key]["correct"] += 1
                if n_worded_food > 0:
                    word_evaluation[word_key]["correct"] += 1

            food_evaluation[key]["total"] += 1
            if n_worded_food > 0:
                word_evaluation[word_key]["total"] += 1

In [47]:
# accuracy per entity length
for key, counts in word_evaluation.items():
    correct, total = counts["correct"], counts["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

food_total_sum = 0
food_correct_sum = 0

print("---")
# accuracy per sentence bucket, accumulating the overall totals on the way
for key, counts in food_evaluation.items():
    correct, total = counts["correct"], counts["total"]

    food_total_sum += total
    food_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {food_correct_sum/food_total_sum * 100:.2f}%")

1_worded_foods: 97.93%
2_worded_foods: 98.47%
3_worded_foods: 96.09%
---
one_food: 97.22%
two_foods: 98.22%
three_foods: 97.89%

Total: 97.63%


In [48]:
# label -> {"correct": n, "total": n}
entity_evaluation = {}

def update_results(entity, metric):
    """Add one to ``metric`` for ``entity`` in ``entity_evaluation``.

    An entry with both tallies at zero is created the first time a label is seen.

    Args:
        entity: entity label used as the key in ``entity_evaluation``.
        metric: "correct" or "total".

    Raises:
        KeyError: if ``metric`` is neither "correct" nor "total".
    """
    tally = entity_evaluation.setdefault(entity, {"correct": 0, "total": 0})
    tally[metric] += 1

# Score the model on the held-out article sentences: a gold entity counts as
# correct when the model predicts an entity with the same label and text.
for data in tqdm(TEST_REVISION_DATA):
    sentence = data[0]
    entities = data[1]["entities"]

    # Run the pipeline once per sentence instead of once per gold entity;
    # the prediction is the same for every entity of the sentence.
    doc = nlp(sentence)

    for entity in entities:
        correct_text = sentence[entity[0]:entity[1]]

        if any(ent.label_ == entity[2] and ent.text == correct_text for ent in doc.ents):
            update_results(entity[2], "correct")

        update_results(entity[2], "total")

In [49]:
sum_total = 0
sum_correct = 0

# accuracy per entity label, accumulating the overall totals on the way
for entity, counts in entity_evaluation.items():
    total, correct = counts["total"], counts["correct"]

    sum_total += total
    sum_correct += correct

    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

DATE | 79.75%
PERSON | 91.03%
WORK_OF_ART | 67.27%
GPE | 90.21%
ORDINAL | 96.64%
NORP | 87.93%
CARDINAL | 84.83%
ORG | 66.27%
MONEY | 89.58%
QUANTITY | 84.56%
TIME | 68.18%
PERCENT | 96.10%
FAC | 75.93%
LOC | 71.66%
EVENT | 65.52%
PRODUCT | 67.12%
LAW | 28.57%
LANGUAGE | 100.00%

Overall accuracy: 84.17%


In [51]:
# Name the pipeline in its metadata and serialise it to disk; the same
# directory is read back with spacy.load later in the notebook.
# NOTE(review): absolute, machine-specific output path.
nlp.meta["name"] = "food_entity_extractor_v1"
nlp.to_disk("/Users/ElizabethHealy/Desktop/models")

# Try it out

In [80]:
# Read the whole script as one string. The encoding is given explicitly so
# decoding does not depend on the platform's default locale encoding.
with open("movie_scripts/Pulp Fiction.txt", 'r', encoding="utf-8") as f:
    pulp_str = f.read()

In [71]:
# NOTE(review): when the notebook is run top to bottom this assignment replaces
# the script text read from "movie_scripts/Pulp Fiction.txt" in the preceding
# cell. Looks like a leftover scratch sample - confirm whether it should stay.
pulp_str = "Hello. My name is Liz and I eat cheese burgers. Also, I bought a computer from Apple and then got a churro from the mall."

In [85]:
from nltk import sent_tokenize

In [97]:
# Split the script into sentences; only sentence boundaries are needed, so
# tagger, NER and lemmatizer are switched off.
pulp_texts = []
for doc in tqdm(nlp.pipe([pulp_str], batch_size=1, disable=["tagger", "ner", "lemmatizer"])):
    for sentence in doc.sents:
        # Collapse every whitespace run to a single space. The pattern is a raw
        # string: "\s" in a normal literal is an invalid escape sequence.
        pulp_texts.append(" ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE)))




In [98]:
print(len(pulp_texts))

4968


In [102]:
pulp_texts[10]

'YOUNG MAN '

In [103]:
# Text of every entity the model labels FOOD, sentence by sentence.
pulp_food_phrases = []

for sent in tqdm(pulp_texts):
    doc = nlp(sent)
    pulp_food_phrases.extend(
        str(ent.text) for ent in doc.ents if ent.label_ == "FOOD"
    )




In [105]:
#import pytest
import nltk
import ssl

# NOTE(security): this swaps ssl's default HTTPS context factory for the
# unverified one, i.e. certificate verification is turned off for every HTTPS
# request made through the default context in this process from here on.
# Presumably a workaround for certificate errors during nltk.download - confirm
# it is still needed before keeping it.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl module without _create_unverified_context: leave the default untouched
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
# fetch the NLTK resources (already-present packages are reported as up-to-date)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def if_food_top2(word, top_n=2):
    """Tell whether one of the first WordNet synsets of ``word`` is a food sense.

    A synset counts as a food sense when the string 'food' occurs in its
    ``lexname()``. Only the first ``top_n`` synsets returned by
    ``wn.synsets`` are inspected.

    Args:
        word: the word to look up; it is converted with ``str()`` first.
        top_n: how many leading synsets to inspect. Defaults to 2, which is
            the original behaviour; values below 1 inspect nothing.

    Returns:
        1 if a food sense is found among the inspected synsets, otherwise 0
        (also when the word has no synsets at all).
    """
    # slicing copes with words that have fewer than top_n synsets
    for syn in wn.synsets(str(word))[:max(top_n, 0)]:
        if 'food' in syn.lexname():
            return 1
    return 0

In [109]:
# Break the FOOD phrases into lower-cased word tokens and keep the purely
# alphabetic ones that if_food_top2 accepts.
pulp_food_2 = []
for phrase in pulp_food_phrases:
    for word in word_tokenize(phrase.lower()):
        if word.isalpha() and if_food_top2(word):
            pulp_food_2.append(word)

# On movies

In [116]:
import os

# Collect the .txt scripts in sorted order: os.listdir returns entries in
# arbitrary order, so sorting makes the run reproducible.
all_files = sorted(
    file_name for file_name in os.listdir("movie_scripts/") if file_name.endswith(".txt")
)

scripts = []

for file_name in tqdm(all_files):
    pth = os.path.join("movie_scripts", file_name)
    # explicit encoding so decoding does not depend on the platform default
    with open(pth, 'r', encoding="utf-8") as f:
        scripts.append(f.read())
print(len(scripts))


1176


In [118]:
# script title = file name without its 4-character ".txt" suffix
script_names = [file_name[:-len(".txt")] for file_name in all_files]

In [120]:
# script title -> list of whitespace-normalised sentences
script_texts = {}
# nlp.pipe yields one doc per input text, in input order, so position idx in
# the stream corresponds to script_names[idx]. enumerate replaces the
# hand-maintained counter.
for idx, doc in enumerate(tqdm(nlp.pipe(scripts, batch_size=1, disable=["tagger", "ner", "lemmatizer"]), total=len(scripts))):
    name = script_names[idx]
    # Collapse every whitespace run to a single space. The pattern is a raw
    # string: "\s" in a normal literal is an invalid escape sequence.
    script_texts[name] = [
        " ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE))
        for sentence in doc.sents
    ]




In [121]:
# script title -> food words found in that script
script_food_words = {}

for title, sentences in tqdm(script_texts.items()):
    # every script gets an entry, even when no food word is found
    food_words = script_food_words[title] = []
    for sentence_text in sentences:
        for ent in nlp(sentence_text).ents:
            if ent.label_ != "FOOD":
                continue
            # keep only alphabetic tokens that if_food_top2 accepts
            for word in word_tokenize(str(ent.text).lower()):
                if word.isalpha() and if_food_top2(word):
                    food_words.append(word)




In [122]:
script_food_words["Forrest Gump"]

['chocolates',
 'chocolates',
 'chocolate',
 'liquor',
 'chocolates',
 'chocolates',
 'food',
 'soda',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'kabobs',
 'shrimp',
 'shrimp',
 'pineapple',
 'shrimp',
 'lemon',
 'shrimp',
 'coconut',
 'shrimp',
 'shrimp',
 'shrimp',
 'soup',
 'shrimp',
 'stew',
 'shrimp',
 'salad',
 'shrimp',
 'potatoes',
 'shrimp',
 'burger',
 'shrimp',
 'sandwich',
 'honey',
 'beer',
 'steaks',
 'beer',
 'beer',
 'shrimp',
 'shrimp',
 'steaks',
 'rice',
 'shrimp',
 'cream',
 'cream',
 'cream',
 'cream',
 'cream',
 'cream',
 'sugar',
 'wine',
 'shrimp',
 'shrimp',
 'shrimp',
 'sandwich',
 'shrimp',
 'shrimp',
 'cocktail',
 'shrimp',
 'shrimp',
 'liquor',
 'shrimp',
 'shrimps',
 'shrimp',
 'cocktails',
 'barbecues',
 'chocolates',
 'shrimp',
 'milk',
 'coffee',
 'sandwich',
 'orange',
 'orange']

In [123]:
script_food_words["Shrek"]

['berries',
 'milk',
 'cookie',
 'gumdrop',
 'beer',
 'beer',
 'beer',
 'beer',
 'beer',
 'drink',
 'parfait',
 'potatoes',
 'crisp',
 'eggs',
 'eggs',
 'tea',
 'fish',
 'candy']

In [124]:
script_food_words["Antz"]

['beer',
 'beer',
 'beer',
 'beer',
 'mixture',
 'crumpets',
 'canapes',
 'jello',
 'egg',
 'salt',
 'potato',
 'salad',
 'sandwich',
 'meal',
 'potato',
 'salad',
 'coke',
 'sweet',
 'nectar',
 'lemonade',
 'lemon',
 'peel',
 'food',
 'honeydew',
 'honeydew']

In [125]:
import json

In [126]:
# Persist the per-script food words as JSON (title -> list of words).
# The "data" directory is not created here, so it has to exist already.
with open("data/movie_food_words_tagged_top2.json", "w") as outfile: 
    json.dump(script_food_words, outfile)

# Trying Again

In [4]:
# load model: read back the pipeline from the directory it was written to
# with nlp.to_disk earlier in the notebook
# NOTE(review): absolute, machine-specific path.
nlp = spacy.load("/Users/ElizabethHealy/Desktop/models")

In [5]:
spacy.displacy.render(nlp("Apple is looking at buying U.K. startup for $1 billion"), style="ent")

In [6]:
# Visual spot-check of the FOOD label with the reloaded model.
for demo_sentence in (
    "I had a hamburger and chips for lunch today.",
    "I decided to have chocolate ice cream as a little treat for myself.",
    "I ordered basmati rice, leaf spinach and cheese from Tesco yesterday",
):
    spacy.displacy.render(nlp(demo_sentence), style="ent")

In [7]:
import nltk
import ssl

# NOTE(security): this swaps ssl's default HTTPS context factory for the
# unverified one, i.e. certificate verification is turned off for every HTTPS
# request made through the default context in this process from here on.
# Presumably a workaround for certificate errors during nltk.download - confirm
# it is still needed before keeping it.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl module without _create_unverified_context: leave the default untouched
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
# fetch the NLTK resources (already-present packages are reported as up-to-date)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ElizabethHealy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
import os

# Collect the .txt scripts in sorted order: os.listdir returns entries in
# arbitrary order, so sorting makes the run reproducible.
all_files = sorted(
    file_name for file_name in os.listdir("movie_scripts/") if file_name.endswith(".txt")
)

scripts = []

for file_name in tqdm(all_files):
    pth = os.path.join("movie_scripts", file_name)
    # explicit encoding so decoding does not depend on the platform default
    with open(pth, 'r', encoding="utf-8") as f:
        scripts.append(f.read())
print(len(scripts))


1176


In [10]:
# script title = file name without its 4-character ".txt" suffix
script_names = [file_name[:-len(".txt")] for file_name in all_files]

In [11]:
# script title -> list of whitespace-normalised sentences
script_texts = {}

# nlp.pipe yields one doc per input text, in input order, so position idx in
# the stream corresponds to script_names[idx]. enumerate replaces the
# hand-maintained counter.
for idx, doc in enumerate(tqdm(nlp.pipe(scripts, batch_size=1, disable=["tagger", "ner", "lemmatizer"]), total=len(scripts))):
    name = script_names[idx]
    # Collapse every whitespace run to a single space. The pattern is a raw
    # string: "\s" in a normal literal is an invalid escape sequence.
    script_texts[name] = [
        " ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE))
        for sentence in doc.sents
    ]




In [43]:
len(script_texts)

1176

In [44]:
# script title -> text of every entity the model labels FOOD
script_food_NER = {}

# To debug on a single script, iterate over
# {"Forrest Gump": script_texts["Forrest Gump"]}.items() instead.
for title, sentences in tqdm(script_texts.items()):
    # every script gets an entry, even when nothing is tagged FOOD
    food_phrases = script_food_NER[title] = []
    for sentence_text in sentences:
        food_phrases.extend(
            str(ent.text) for ent in nlp(sentence_text).ents if ent.label_ == "FOOD"
        )




In [48]:
print(script_food_NER["Pulp Fiction"])

['moist', 'munching', 'bacon', 'eating eggs', 'cream', 'sugar', 'fuckin', 'servin', "nothin'", 'wetback gettin', "goin'", "havin'", 'black �', 'long green dusters', 'roll', 'joint', 'hash bar', '�', "goin'", 'paper cup', 'beer', 'french fries', 'nigger gonna', "nothin'", 'eatin', 'foot massage ai', 'stickin', 'holyies', "nothin'", 'foot massage', '�', 'glass- motherfuckin-house', "kill'a motherfucker", "goin'", "somethin'", 'hamburgers', 'french fries', 'soda pops laid', 'blow-dry', 'Hawaiian burger joint', 'tasty burgers', 'burger', 'tasty burger', 'vegetarian', 'fast food drink cup', 'your tasty beverage', "C'm", 'black �', '�', '1964 cherry red', 'food', "pack'a Red Apples", "somethin'", '�', 'red flannel', 'nigger', 'casa', 'su casa', 'dickless piece', "It'a", 'chicken shit', '�', '�', '�', '�', '�', '�', 'scotch', '�', 'scotch', '�', '�', 'drink', 'neon', 'sqaure', 'wannabe beboppers', 'crisp', 'vanilla coke', 'five-dollar shake', 'five-dollar shake', 'ice cream', 'juicy hamburger

In [47]:
# Persist the raw FOOD entity texts as JSON (title -> list of phrases).
# The "data" directory is not created here, so it has to exist already.
with open("data/movie_food_NER.json", "w") as outfile: 
    json.dump(script_food_NER, outfile)

In [46]:
import json

In [30]:
def if_food_top1(word):
    """Tell whether the first WordNet synset of ``word`` is a food sense.

    A synset counts as a food sense when the string 'food' occurs in its
    ``lexname()``.

    Args:
        word: the word to look up; it is converted with ``str()`` first.

    Returns:
        1 if the first synset is a food sense, otherwise 0 (also when the
        word has no synsets at all).
    """
    # [:1] gives an empty list for unknown words instead of raising
    first_synset = wn.synsets(str(word))[:1]
    return int(any('food' in syn.lexname() for syn in first_synset))

In [34]:
def if_phrase_top2(phrase):
    """Tell whether any token of ``phrase`` passes ``if_food_top2``.

    Args:
        phrase: text that is split into tokens with ``word_tokenize``.

    Returns:
        True as soon as one token is accepted by ``if_food_top2``,
        False otherwise (including for a phrase with no tokens).
    """
    return any(if_food_top2(token) for token in word_tokenize(phrase))
    

In [49]:
# script title -> FOOD phrases that contain at least one word accepted by
# if_phrase_top2, lower-cased and stripped of non-ASCII characters
script_food_phrases = {}

for title, ner_phrases in tqdm(script_food_NER.items()):
    kept = script_food_phrases[title] = []
    for phrase in ner_phrases:
        lowered = phrase.lower()
        if not if_phrase_top2(lowered):
            continue
        # drop every character outside the 7-bit ASCII range
        kept.append(re.sub(r'[^\x00-\x7f]', r'', lowered))




In [50]:
script_food_phrases["Forrest Gump"]

['chocolates',
 'chocolates',
 'chocolate',
 'liquor bottle',
 'chocolates',
 'chocolates',
 'food',
 'soda',
 'real shrimp',
 'shrimp catchin',
 'momma cooked shrimp',
 'shrimp',
 'her cooked shrimp',
 'shrimp',
 'momma cooked shrimp',
 'shrimp',
 'shrimp',
 'shrimp',
 'shrimp kabobs',
 'shrimp creole',
 'shrimp gumbo',
 'pineapple shrimp',
 'lemon shrimp',
 'coconut shrimp',
 'pepper shrimp',
 'shrimp soup',
 'shrimp stew',
 'shrimp salad',
 'shrimp',
 'potatoes',
 'shrimp burger',
 'shrimp sandwich',
 'honey',
 'beer',
 'barbecuing steaks',
 'drinking beer',
 'beer cans',
 'shrimp',
 'shrimp',
 'steaks burnin',
 'rice paddy',
 'shrimp',
 'ice cream',
 'ice cream',
 'ice cream cone',
 'ice cream',
 'ice cream',
 'ice cream cone',
 'sugar cube',
 'wine',
 'shrimp',
 'shrimp',
 'shrimp',
 'turkey sandwich',
 'one shrimp',
 'shrimp',
 'cocktail',
 'shrimp',
 'shrimp',
 'liquor bottle',
 'shrimp falls',
 'shrimps',
 'shrimp cocktails',
 'barbecues',
 'chocolates',
 'shrimp',
 'milk',
 'c

In [22]:
import re
import string

In [23]:
# One or more non-word characters or underscores. Written as a raw string:
# "\W" in a normal literal is an invalid escape sequence.
pattern = re.compile(r'[\W_]+')

In [27]:
re.sub(r'\W+', '', "five-dollar shake")

'fivedollarshake'

In [37]:
re.sub(r'[^\x00-\x7f]',r'', "food)")

'food)'

In [51]:
# Persist the filtered phrases (title -> list of phrases). The original cell
# dumped script_food_NER here, which made this file a duplicate of
# data/movie_food_NER.json instead of holding the filtered result.
with open("data/movie_food_NER_phrase.json", "w") as outfile:
    json.dump(script_food_phrases, outfile)