In [1]:
import spacy
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm

In [2]:
input_path = Path("../../data/processed/nyt-data")
output_path = Path("../../data/processed/entities")
corpus = pd.read_csv(input_path / "corpus_all.csv")

# Named Entity Recognition

In [3]:
corpus = corpus[["text"]]

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [6]:
data = pd.DataFrame(columns=["entities"])

In [7]:
tqdm.pandas(desc="Processing NER")
data['entities'] = corpus["text"].progress_apply(extract_entities)

Processing NER:   0%|          | 0/98360 [00:00<?, ?it/s]

In [8]:
exploded_data = data.explode('entities')
exploded_data[['token', 'entity']] = pd.DataFrame(exploded_data['entities'].tolist(), index=exploded_data.index)
exploded_data = exploded_data.drop(columns=['entities'])

In [9]:
exploded_data.to_csv(output_path / 'entities.csv', index=False)

In [38]:
result = pd.read_csv(output_path / 'entities.csv')

In [39]:
result

Unnamed: 0,token,entity
0,third,ORDINAL
1,207,CARDINAL
2,eastern conference league los angeles,ORG
3,216,CARDINAL
4,next 15,DATE
...,...,...
4955278,maxwell,ORG
4955279,winter,DATE
4955280,monaco grand prix first,ORG
4955281,1929,DATE


In [40]:
filtered_result = result[result['token'].str.contains('merkel', case=False, na=False)]

In [41]:
result['token'] = result['token'].str.split()
result_exploded = result.explode('token')

In [47]:
entity_counts = result_exploded.groupby('token')['entity'].value_counts().unstack(fill_value=0)

In [48]:
entity_counts['entity'] = entity_counts.idxmax(axis=1)

In [59]:
entity_counts = entity_counts.reset_index()

In [60]:
entity_counts = entity_counts[['token', 'entity']]

In [63]:
entity_counts

entity,token,entity.1
0,0,CARDINAL
1,00,CARDINAL
2,01,CARDINAL
3,02,CARDINAL
4,03,CARDINAL
...,...,...
19157,zucchini,ORG
19158,zucker,PERSON
19159,zuckerberg,PERSON
19160,zuckerman,PERSON


In [64]:
entity_counts.columns

Index(['token', 'entity'], dtype='object', name='entity')

In [65]:
entity_counts.to_csv(output_path / 'entities_most_freq.csv', index = False)