In [1]:
import spacy

spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [6]:
import pandas as pd

# Load the feather file into a pandas dataframe
df_data = pd.read_feather("results.feather")

df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         3 non-null      int64 
 1   year          3 non-null      int64 
 2   title         3 non-null      object
 3   url           3 non-null      object
 4   text_content  3 non-null      object
dtypes: int64(2), object(3)
memory usage: 252.0+ bytes


In [13]:
from tqdm.notebook import tqdm

# Filtering by type
def filtrer_par_type(dataframe, type_recherche):
    masque = dataframe["Type"] == type_recherche
    return dataframe[masque]

# Create a dataframe to store the results
df_result = pd.DataFrame(columns=["entities"])

# Browse the data ...
for index, row in tqdm(df_data.iterrows(), total = df_data.shape[0]):
    # Extract the text_content
    text = row.text_content
    # Analyse text with Spacy
    doc = nlp(text)
    # List to store entities
    entites = []
    # Browse named entities in the doc
    for entite in doc.ents:
        entites.append((entite.text, entite.label_))
    
    # Create a dataframe with the list of entities
    df_entities = pd.DataFrame(entites, columns=["Entité", "Type"])
    
    # Filtering on type "PERSON"
    df_person = df_entities.loc[df_entities["Type"] == "PERSON"]
    
    # Filtering on type "GPE"
    df_gpe = df_entities.loc[df_entities["Type"] == "GPE"]
    
    # Filtering on type "ORG"
    df_org = df_entities.loc[df_entities["Type"] == "ORG"]
    
    # Append the data to the result dataframe
    dict_entities = {
        'url': row.url,
        'PERSON': df_person["Entité"].tolist(),
        'GPE': df_gpe["Entité"].tolist(),
        'ORG': df_org["Entité"].tolist()
    }
    nouvelle_ligne = pd.DataFrame( data={'entities': dict_entities} )
    
    df_result = pd.concat([df_result, nouvelle_ligne])                                                     

df_result.info()
print("-----------------------------------")

# Write to a JSON file
print(" ... Write to JSON file ...")
df_result.to_json('ner.json', orient='records')

print("-----------------------------------")
print("  Process complete !")

  0%|          | 0/3 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, GPE to url
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   entities  12 non-null     object
dtypes: object(1)
memory usage: 192.0+ bytes
-----------------------------------
 ... Write to JSON file ...
-----------------------------------
  Process complete !
