In [1]:
import datetime
import json
import pandas as pd
import requests

from mcmetadata import extract
import mediacloud.api

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection
mc = mediacloud.api.DirectoryApi('56196a395ee77c33a296073fa08e72f541362a10')

CSV_FILE = 'query.csv'
JSON_FILE = 'data.json'

# Read the query.csv file to obtain information of each item to dump
df_query = pd.read_csv(CSV_FILE)

In [2]:
size_query = df_query.shape[0]
print(size_query, "URLs in the CSV file :", CSV_FILE)

6366 URLs in the CSV file : query.csv


In [3]:
# Create the result dataframe
df_result = pd.DataFrame(columns=['year', 'title', 'url', 'text_content'])

# Create the empty JSON file
with open(JSON_FILE, 'w') as f:
    f.write('[')

# Création du widget d'erreurs
errors_widget = widgets.IntText(description='Erreurs sur URL:', value=0)
display(errors_widget)

rec = 0

for counter in trange(0,size_query) :
    # Query the MediaCloud API with each item
    url_data = df_query.iloc[counter].url
    
    try: 
        metadata = extract(url=url_data)
        metadata['publication_date'] = metadata['publication_date'].year
    
        # Append data to the result dataframe
        list_metadata = []
        for field in ['publication_date', 'article_title', 'url', 'text_content'] :
            list_metadata.append(metadata[field])
        # Append data to the result dataframe
        df_result.loc[counter] = list_metadata
        # And write data in JSON file
        with open(JSON_FILE, 'a') as f:
            if rec == 1 :
                f.write(',')
            else:
                rec = 1
            json.dump(metadata, f)
            f.write('\n')
    except Exception as e:
        # print(f'Erreur -> "{e}", pour l\'url : {url_data}' )
        errors_widget.value += 1
        log_message = f"{datetime.datetime.now()} - {e} ; {url_data}\n"
        with open('log.txt', 'a') as log_file:
            log_file.write(log_message)

with open(JSON_FILE, 'a') as f:
    f.write(']')

with open('log.txt', 'a') as log_file:
    log_file.write("----------------------------------\n")
print(df_result.describe)

IntText(value=0, description='Erreurs sur URL:')

  0%|          | 0/6366 [00:00<?, ?it/s]

<bound method NDFrame.describe of       year                                              title  \
0     2022  US military creates space unit in South Korea ...   
1     2022  China using civilian ships to enhance navy cap...   
3     2022  Burn pit registry is failing to help track vet...   
4     2017  Warthog attack plane finds new life in Trump a...   
5     2022  Fewer troops in CENTCOM? No problem, says 3-st...   
...    ...                                                ...   
6361  2022  Media Defends Mother and Daughter Who Allegedl...   
6362  2022  Monkeypox: The Real ‘Don’t Say Gay’ - The Amer...   
6363  2022  The God of Plague Returns: Snail Fever, COVID-...   
6364  2022  Trump Is Right: Piers Morgan Didn’t Study the ...   
6365  2022  Voices of China: ‘Give Me Liberty or Give Me D...   

                                                    url  \
0     https://www.airforcetimes.com/battlefield-tech...   
1     https://www.airforcetimes.com/flashpoints/chin...   
3     ht

In [4]:
# Export dataframe to a feather format file
df_result.reset_index().to_feather('results.feather')

print("\n-----------------------------------")
print("  Process complete !")


-----------------------------------
  Process complete !
