# Libraries

In [55]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
# import mediacloud.api

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection (read the API key from the environment; never commit it to the notebook)
# mc = mediacloud.api.DirectoryApi(os.environ["MEDIACLOUD_API_KEY"])

# Dataset

In [56]:
# Select the query export to work on: keep exactly one assignment uncommented.
# query = "Query_NYT_5y_surveillance"
query = "Query_NYT_13y_surveillance"
# query = "Query_US_5y_Svalbard"

# Both file names are derived from the query name: the CSV is the raw input,
# the feather file is the working copy this notebook reads back on later runs.
CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

if not os.path.isfile(FEATHER_FILE):
    # First run for this query: build the working frame from the raw CSV.
    df = (
        pd.read_csv(CSV_FILE)
        .assign(
            # The first four characters of `publish_date` are kept as the year.
            year=lambda frame: frame["publish_date"].str[:4],
            # Placeholders that are filled in by the download step.
            text="",
            text_downloaded=False,
        )
        # Discard the columns that are not needed further down.
        .drop(
            columns=[
                "ap_syndicated",
                "collect_date",
                "feeds",
                "guid",
                "media_id",
                "media_name",
                "media_url",
                "metadata",
                "language",
                "processed_stories_id",
                "publish_date",
                "story_tags",
                "word_count",
            ]
        )
    )
else:
    # A working copy already exists: resume from it instead of the raw CSV.
    df = pd.read_feather(FEATHER_FILE)

# Show one random row as a quick sanity check of the loaded frame.
df.sample(1)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
2482,341414440,Plainclothes Officers in New York Find Danger ...,http://www.nytimes.com/2015/05/04/nyregion/pla...,2015,,False


In [66]:
# Download the article text for every story that does not have it yet.
#
# For each row the availability endpoint below is asked for the closest
# archived snapshot of the story URL. When a snapshot is returned, `extract`
# is run on the snapshot URL and its "text_content" is stored in `df`.
# Rows already flagged `text_downloaded` are skipped, so this cell can be
# re-run to resume an interrupted session.

# Loop-invariant request settings, defined once instead of on every iteration.
API = "https://archive.org/wayback/available"
headers = {"accept": "application/json"}

# Client-side limit, in seconds, for the availability request. `requests` does
# not time out by default, so a single stalled connection would otherwise block
# the whole run. The "timeout" entry in `params` is only a query parameter and
# does not limit the HTTP call itself.
REQUEST_TIMEOUT = 300

# Live counters shown above the progress bar.
total = widgets.IntText(description="Total", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)  # rows skipped: already downloaded
unavailable = widgets.IntText(description="Unavailable", value=0)  # no snapshot returned
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(total, done, unavailable, yes, no)

# One record per failed row, so failures can be inspected after the run
# (for example with `pd.DataFrame(errors)`) instead of being discarded.
errors = []

for position in trange(0, df.shape[0]):
    # `.at` addresses rows by label, so translate the position into the row
    # label once and use that label for every read and write below.
    index = df.index[position]

    if df.at[index, "text_downloaded"]:
        done.value += 1
        continue

    url = df.at[index, "url"]
    params = {
        "url": url,
        "timeout": "300",
        "closest": "either",
        "status_code": "200",
    }

    try:
        response = requests.get(
            API, params=params, headers=headers, timeout=REQUEST_TIMEOUT
        )
        # Count HTTP failures as errors rather than as "no snapshot available".
        response.raise_for_status()
        snapshot = response.json().get("archived_snapshots", {}).get("closest")

        if snapshot:
            metadata = extract(url=snapshot.get("url"))
            text = metadata["text_content"]

    except Exception as e:
        # Best effort: a failing row must not stop the run, but keep the reason.
        no.value += 1
        errors.append({"index": index, "url": url, "error": repr(e)})
        continue

    if not snapshot:
        unavailable.value += 1
        continue

    df.at[index, "text"] = text
    df.at[index, "text_downloaded"] = True
    yes.value += 1

    # Checkpoint only when the frame actually changed; rewriting the whole file
    # for unavailable or failed rows would store exactly the same data again.
    df.to_feather(FEATHER_FILE)

    # Optional throttling between successful downloads:
    # time.sleep(random.uniform(1, 20))  # slow pace, suited to an overnight run
    # time.sleep(random.uniform(.1, 1))

IntText(value=4245, description='Total')

IntText(value=0, description='Done')

IntText(value=0, description='Unavailable')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/4245 [00:00<?, ?it/s]

In [75]:
# Spot-check ten random rows to see which stories have their text filled in.
df.sample(n=10)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
1681,2572965596,"Germany trains Ukrainians on tanks, which it s...",https://www.nytimes.com/live/2023/02/20/world/...,2023,"Amid Air Raid Sirens, Biden Makes Surprise Ukr...",True
966,1559728498,"Coronavirus Is Idling North Korea’s Ships, Ach...",https://www.nytimes.com/2020/03/26/video/coron...,2020,President Trump has called the coronavirus “th...,True
3773,76685475,Museum and Gallery Listings for March 23-29,http://feeds.nytimes.com/click.phdo?i=1a1a6dbe...,2012,,False
2118,2727960822,Trump Employee Released on Bond After Court Ap...,https://www.nytimes.com/2023/07/31/us/politics...,2023,Supported by\nTrump Employee Released on Bond ...,True
3939,149746684,Video: Obama on Surveillance Transparency,http://www.nytimes.com/video/2013/08/09/us/pol...,2013,,False
848,1514271982,"Africa, Intertwined with China, Fears Coronavi...",https://www.nytimes.com/2020/02/06/world/afric...,2020,"Supported by\nAfrica, Intertwined With China, ...",True
2241,2783401546,Friday Briefing: New Trouble for China Evergrande,https://www.nytimes.com/2023/09/28/briefing/ch...,2023,Morning Briefing: Asia Pacific Edition\nPlus t...,True
1234,2370226605,"Ukraine Live Updates: Isolated by West, Putin ...",https://www.nytimes.com/live/2022/07/19/world/...,2022,Russian Invasion of UkraineU.S. Warns That Rus...,True
2478,340546442,Congressional Leaders Suggest Earlier Snowden ...,http://www.nytimes.com/2014/01/20/us/politics/...,2014,WASHINGTON — The heads of the House and Senate...,True
1155,2351126428,The Work of Online Volunteers,https://www.nytimes.com/2022/06/28/technology/...,2022,On Tech: A.I.\nModerators’ work on Reddit and ...,True
