In [1]:
import pandas as pd

from geotext import GeoText

from tqdm import tqdm
# Enables the `progress_apply` calls used by later cells.
tqdm.pandas()

import warnings
# NOTE(review): this silences every warning for the whole session, which
# presumably also hides library deprecation warnings — consider narrowing it.
warnings.filterwarnings("ignore")

import spacy
# Load the spaCy pipeline once at module level; `NER` reads this `nlp` object.
nlp=spacy.load('en_core_web_lg')
def NER(text):
    """Run the module-level spaCy pipeline on ``text`` and collect its entities.

    Returns a dict keyed by each entity's position in ``doc.ents``. Every value
    is a dict with the entity's ``text``, its label under ``tag``, and its
    ``(start_char, end_char)`` character offsets under ``span``.
    """
    entities = {}
    for position, entity in enumerate(nlp(text).ents):
        entities[position] = {
            "text": entity.text,
            "tag": entity.label_,
            "span": (entity.start_char, entity.end_char),
        }
    return entities

In [2]:
# Load the news dump into `df`, the frame the later cells read from and extend.
NEWS_DUMP_PATH = r"12JAN24 - One week news dump.parquet"
df = pd.read_parquet(NEWS_DUMP_PATH)

- ## Time
    - Extracted from doc
    - Publish date
- ## Location
    - Geonym/Toponym extraction
    - Disambiguation
    - Mordecai? 
- ## Actor
    - NER
    - Disambiguation
- ## Topic
    - Atomic Tagging

------
## Geoparsing
- This is fast: about 4 seconds for 24k documents in testing.
- It identifies countries and cities mentioned in the text as candidate Locations.

In [7]:
# Parse every document once and keep the GeoText object for reuse.
df['GeoText'] = df['text'].progress_apply(GeoText)

# Copy each GeoText attribute into a column of the same name.
for geo_attr in ('cities', 'country_mentions', 'countries', 'nationalities'):
    df[geo_attr] = df['GeoText'].apply(lambda parsed, attr=geo_attr: getattr(parsed, attr))

100%|██████████████████████████████████████████████████████████████████████████| 24103/24103 [00:04<00:00, 5516.23it/s]


## Named Entity Recognition
- This takes a while: about 32 minutes for 24k documents in testing.
- It surfaces candidates for Actors, and potentially for Locations as well.

In [28]:
# Run NER on each document's text (with a progress bar) and store the
# resulting per-document entity dict in the 'ner' column.
df['ner'] = df['text'].progress_apply(NER)

100%|████████████████████████████████████████████████████████████████████████████| 24103/24103 [32:13<00:00, 12.47it/s]


In [72]:
df_ent = pd.DataFrame()

def extend_df_ent(docid, ner_object):
    """Add one document's entities to the module-level ``df_ent`` frame.

    Parameters
    ----------
    docid
        Value written to the ``docid`` column of every row that is added.
    ner_object : dict
        Mapping whose values are per-entity dicts (one row each). An empty
        mapping adds nothing.

    Returns
    -------
    None
        The result is published by rebinding the global ``df_ent``.
    """
    global df_ent

    # Documents without entities contribute no rows; skipping them also avoids
    # concatenating empty frames.
    if not ner_object:
        return

    doc_entities = pd.DataFrame([ner_object[key] for key in ner_object])
    doc_entities['docid'] = docid

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Row labels are deliberately left untouched (0..k-1 within each document),
    # which is what the append-based version produced.
    if df_ent.empty:
        df_ent = doc_entities
    else:
        df_ent = pd.concat([df_ent, doc_entities])

In [None]:
## Build the df_ent object: one row per extracted entity, tagged with its docid.
## Rows are gathered in plain lists and the frame is constructed once, so the
## cell runs in linear time and re-running it rebuilds df_ent from scratch
## instead of adding duplicate rows to an existing frame.
ent_rows = []
ent_positions = []
for docid, ner_object in tqdm(zip(df['docid'], df['ner']), total=len(df)):
    for position, key in enumerate(ner_object):
        ent_rows.append({'docid': docid, **ner_object[key]})
        # Keep the entity's position within its document as the row label,
        # matching the labels produced by appending one frame per document.
        ent_positions.append(position)
## Reorder columns to be something more useful; passing `columns` also keeps
## the frame well-formed when no entities were found at all.
df_ent = pd.DataFrame(ent_rows, index=ent_positions,
                      columns=['docid', 'tag', 'span', 'text'])

In [76]:
# Distinct entity labels present in df_ent, in order of first appearance.
df_ent.loc[:, 'tag'].unique()

array(['DATE', 'ORG', 'GPE', 'PERSON', 'TIME', 'CARDINAL', 'PRODUCT',
       'FAC', 'MONEY', 'PERCENT', 'WORK_OF_ART', 'QUANTITY', 'LOC',
       'ORDINAL', 'NORP', 'LAW', 'LANGUAGE', 'EVENT'], dtype=object)

In [77]:
# Number of extracted entities per label, most frequent first.
df_ent.loc[:, 'tag'].value_counts()

PERSON         396368
ORG            352781
DATE           290134
GPE            218641
CARDINAL       151793
NORP            65876
ORDINAL         36477
TIME            31383
WORK_OF_ART     30767
MONEY           28638
PRODUCT         22068
LOC             21943
PERCENT         17223
FAC             16359
QUANTITY        14975
EVENT           13707
LAW              4919
LANGUAGE         1119
Name: tag, dtype: int64

In [80]:
# The 25 most frequent surface strings among entities labelled PERSON.
df_ent.loc[df_ent['tag'].eq('PERSON'), 'text'].value_counts().head(25)

Trump              12424
Biden               7646
Haley               4133
Epstein             3244
Donald Trump        2964
Austin              2660
Barbie              2575
DeSantis            2380
Joe Biden           2170
Kate                1282
Johnson             1263
Nikki Haley         1239
Taylor Swift        1211
Blinken             1145
Max                 1091
Derek               1043
Ron DeSantis         921
Instagram            919
Margot Robbie        863
Smith                794
Swift                792
Taylor               776
Jeffrey Epstein      768
Hunter               738
Prince Andrew        734
Name: text, dtype: int64