# Libraries

In [55]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
# import mediacloud.api

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection (read the API key from the environment; never commit it)
# mc = mediacloud.api.DirectoryApi(os.environ["MEDIACLOUD_API_KEY"])

# Dataset

In [56]:
# Select the query export to work on. The other exports are kept here as
# ready-to-use alternatives.
# query = "Query_NYT_5y_surveillance"
# query = "Query_US_5y_Svalbard"
query = "Query_NYT_13y_surveillance"

CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

# Columns of the CSV export that are dropped from the working frame.
_unused_columns = [
    "ap_syndicated",
    "collect_date",
    "feeds",
    "guid",
    "media_id",
    "media_name",
    "media_url",
    "metadata",
    "language",
    "processed_stories_id",
    "publish_date",
    "story_tags",
    "word_count",
]

if not os.path.isfile(FEATHER_FILE):
    # No feather file yet: start from the CSV export, derive the year from the
    # first four characters of `publish_date`, and add the two columns that
    # the download step fills in.
    raw = pd.read_csv(CSV_FILE)
    df = raw.assign(
        year=raw["publish_date"].str[:4],
        text="",
        text_downloaded=False,
    ).drop(columns=_unused_columns)
else:
    # A feather file exists: resume from it instead of rebuilding from the CSV.
    df = pd.read_feather(FEATHER_FILE)

df.sample(1)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
2482,341414440,Plainclothes Officers in New York Find Danger ...,http://www.nytimes.com/2015/05/04/nyregion/pla...,2015,,False


In [60]:
# Download the text of every story that has not been fetched yet.
#
# For each row of `df` whose `text_downloaded` flag is not set, the Wayback
# Machine availability API is asked for the closest snapshot of the story URL.
# When a snapshot is reported, its text is extracted with `extract`, stored in
# the `text` column, and the frame is checkpointed to FEATHER_FILE.

# Progress / error counters shown as live widgets.
total = widgets.IntText(description="Total", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)
unavailable = widgets.IntText(description="Unavailable", value=0)
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(total, done, unavailable, yes, no)

API = "https://archive.org/wayback/available"
HEADERS = {"accept": "application/json"}

# (connect, read) timeouts in seconds for `requests.get`. Without them a
# stalled connection would block the loop indefinitely. The read timeout is
# set just above the "timeout" value sent in the query string, which is
# presumably a server-side limit -- TODO confirm.
REQUEST_TIMEOUT = (10, 310)

# Bounds, in seconds, of the random pause between two requests.
MIN_PAUSE, MAX_PAUSE = 1, 20

# One (row label, url, error repr) tuple per failed row, so that failures can
# be inspected after the run instead of being silently discarded.
download_errors = []


def find_closest_snapshot(url):
    """Ask the availability API for the closest archived snapshot of `url`.

    Returns the "closest" entry of the JSON response, or None when the
    response contains no such entry.

    Raises whatever `requests` raises on connection problems, timeouts or a
    non-success HTTP status, and whatever `response.json()` raises when the
    body is not valid JSON.
    """
    params = {
        "url": url,
        "timeout": "300",
        "closest": "either",
        "status_code": "200",
    }
    response = requests.get(
        API, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    # Treat HTTP errors as errors rather than as "no snapshot available".
    response.raise_for_status()
    return response.json().get("archived_snapshots", {}).get("closest")


for position in trange(0, df.shape[0]):
    # Translate the loop position into the row label once, so that every
    # read and write below addresses the same row.
    index = df.index[position]

    if df.at[index, "text_downloaded"]:
        done.value += 1
        continue

    url = df.at[index, "url"]

    try:
        snapshot = find_closest_snapshot(url)

        if snapshot:
            metadata = extract(url=snapshot.get("url"))
            df.at[index, "text"] = metadata["text_content"]
            df.at[index, "text_downloaded"] = True
            yes.value += 1

            # Checkpoint right away (before the pause below) so that an
            # interruption never loses a row that was already downloaded.
            df.to_feather(FEATHER_FILE)

        else:
            unavailable.value += 1

    except Exception as e:
        # Best effort: one failing story must not stop the whole run, but the
        # failure is recorded so that it can be reviewed or retried later.
        no.value += 1
        download_errors.append((index, url, repr(e)))

    # Random pause between requests.
    time.sleep(random.uniform(MIN_PAUSE, MAX_PAUSE))

# Final checkpoint, so that the feather file exists even when no row was
# downloaded during this run.
df.to_feather(FEATHER_FILE)

IntText(value=4245, description='Total')

IntText(value=0, description='Done')

IntText(value=0, description='Unavailable')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/4245 [00:00<?, ?it/s]

In [58]:
# Write the current state of the frame to the feather file, then preview five
# randomly chosen rows.
df.to_feather(path=FEATHER_FILE)
df.sample(n=5)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
1898,2642451984,Is Brazil Under Luiz Inácio Lula da Silva ‘Ant...,https://www.nytimes.com/2023/05/02/opinion/bra...,2023,,False
525,1392420174,"Fearing ‘Spy Trains,’ Congress May Ban a Chine...",https://www.nytimes.com/2019/09/14/business/ch...,2019,,False
2252,2787273777,Police Robot,https://www.nytimes.com/2023/10/03/learning/po...,2023,,False
1745,2597365347,Kehinde Wiley’s New Exhibition Is a Chapel of ...,https://www.nytimes.com/2023/03/16/arts/design...,2023,,False
955,1553672772,Your Friday Briefing,https://www.nytimes.com/2020/03/20/briefing/co...,2020,,False
