In [30]:
import pandas as pd

def _load_and_report(banner, reader, path):
    """Print ``banner``, read ``path`` with ``reader``, show ``.info()`` and return the frame."""
    print(banner)
    frame = reader(path)
    frame.info()
    return frame


# Download results, stored as a Feather file
df_data = _load_and_report(
    "\n------------- Results of downloads -----------------",
    pd.read_feather,
    "results.feather",
)

# NY-Times tags, stored as a JSON file
df_nyt_tags = _load_and_report(
    "\n-------------- NYT tags ----------------",
    pd.read_json,
    "results_tags.json",
)

# NER results, stored as a JSON file
df_entities = _load_and_report(
    "\n-------------- NER ----------------",
    pd.read_json,
    "ner.json",
)



------------- Results of downloads -----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         3 non-null      int64 
 1   year          3 non-null      int64 
 2   title         3 non-null      object
 3   url           3 non-null      object
 4   text_content  3 non-null      object
dtypes: int64(2), object(3)
memory usage: 252.0+ bytes

-------------- NYT tags ----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   article  3 non-null      object
 1   tags     3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes

-------------- NER ----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 1 columns):
 #   Column    Non-Null

## Prepare the data

In [26]:
from tqdm.notebook import tqdm

# Join the download results with their NYT tags on the article URL.
# pd.merge defaults to an inner join, so only URLs present in both frames are kept.
print("\n-------------- NYT <-> URL ----------------")
df_joint = pd.merge(df_data, df_nyt_tags, left_on='url', right_on='article')
df_joint.info()

# Build a NER dictionary with one entry per key.
# df_entities is consumed in consecutive groups of four rows: the first three
# rows supply the GPE, ORG and PERSON values, and the fourth row's value is used
# as the dictionary key (the fusion step looks these keys up in df_joint['url']).
print("\n-------------- NER <-> URL ----------------")
NER_LABELS = ('GPE', 'ORG', 'PERSON')
dict_ner = {}
tempo_dict = {}

# Group by row *position* rather than by index label, so the grouping does not
# depend on df_entities carrying a default 0..n-1 index. Iterating the column
# directly also avoids building a Series per row as iterrows() does.
entity_values = tqdm(df_entities['entities'], total=df_entities.shape[0])
for position, value in enumerate(entity_values):
    slot = position % (len(NER_LABELS) + 1)
    if slot < len(NER_LABELS):
        tempo_dict[NER_LABELS[slot]] = value
    else:
        # Last row of the group: file the collected entities under this key
        # and start a fresh dict for the next group.
        dict_ner[value] = tempo_dict
        tempo_dict = {}
print(len(dict_ner), "entries")


-------------- NYT <-> URL ----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         3 non-null      int64 
 1   year          3 non-null      int64 
 2   title         3 non-null      object
 3   url           3 non-null      object
 4   text_content  3 non-null      object
 5   article       3 non-null      object
 6   tags          3 non-null      object
dtypes: int64(2), object(5)
memory usage: 300.0+ bytes

-------------- NER <-> URL ----------------


  0%|          | 0/12 [00:00<?, ?it/s]

3 entries


## Fusion

In [28]:
import json

fusion_list = []
# Walk through dict_ner and attach each NER dict to the matching df_joint row
print("\n-------------- Dict. NER to list ----------------")
missing_urls = []
for key, value in tqdm(dict_ner.items()):
    row = df_joint.loc[df_joint['url'] == key]
    local_dict = row.to_dict(orient='records')
    if not local_dict:
        # No df_joint row has this URL; indexing [0] would raise IndexError,
        # so record the key and move on.
        missing_urls.append(key)
        continue
    # When several rows share the URL, only the first one is kept
    local_dict[0]['NER'] = value
    fusion_list.append(local_dict[0])
print(len(fusion_list),"items")
if missing_urls:
    print(len(missing_urls), "NER entries skipped (URL not found in df_joint)")

# Serialize fusion_list to a JSON file
print("\n-------------- Write to JSON ----------------")
with open("fusion.json", "w") as fichier:
    # Fixed: the original call had an extra ")" which made the cell a SyntaxError
    json.dump(fusion_list, fichier)

print("\n-----------------------------------")
print("  Process complete !")


-------------- Dict. NER to list ----------------


  0%|          | 0/3 [00:00<?, ?it/s]

3 items

-------------- Write to JSON ----------------


0it [00:00, ?it/s]


-----------------------------------
  Process complete !
