# Libraries

In [1]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
import mediacloud.api

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection.
# The API token is read from the environment instead of being hardcoded, so it
# is never stored in the notebook or its version history. Set it before
# starting Jupyter, e.g.  export MEDIACLOUD_API_KEY=<your token>
# NOTE(review): a token was previously committed in this cell — treat it as
# leaked and rotate it.
_mc_api_key = os.environ.get("MEDIACLOUD_API_KEY")
if not _mc_api_key:
    raise RuntimeError(
        "MEDIACLOUD_API_KEY is not set; export your Media Cloud API token "
        "before running this notebook."
    )
mc = mediacloud.api.DirectoryApi(_mc_api_key)

# Dataset

In [8]:
# Name of the query export to work on (pick one; the others are kept for reference).
# query = "Query_NYT_5y_surveillance"
# query = "Query_NYT_13y_surveillance"
query = "Query_US_5y_Svalbard"

# The raw export lives in the CSV; the feather file is the working copy that
# the download cell writes, and it takes precedence when it exists.
CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

if os.path.isfile(FEATHER_FILE):
    # Resume from the working copy.
    df = pd.read_feather(FEATHER_FILE)
else:
    # First run: build the working frame from the raw CSV export.
    df = (
        pd.read_csv(CSV_FILE)
        .assign(
            # First four characters of publish_date are kept as the year.
            year=lambda frame: frame["publish_date"].str[:4],
            # Placeholders to be filled in by the download step.
            text="",
            text_downloaded=False,
        )
        .drop(
            columns=[
                "ap_syndicated",
                "collect_date",
                "feeds",
                "guid",
                "media_id",
                "media_name",
                "media_url",
                "metadata",
                "language",
                "processed_stories_id",
                "publish_date",
                "story_tags",
                "word_count",
            ]
        )
    )

# Show one random row as a quick sanity check.
df.sample(n=1)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
841,2362997804,The hot game on TikTok: Guessing locations on ...,https://www.washingtonpost.com/video-games/202...,2022,,False


In [26]:
# Download the article text for every story that does not have it yet.
#
# For each row, the Wayback Machine availability endpoint is asked for the
# closest archived snapshot of the story URL. When a snapshot is returned,
# `extract` is run on the snapshot URL and its "text_content" is stored in the
# row. Rows already flagged `text_downloaded` are skipped, so the cell can be
# re-run to resume. Progress is written to FEATHER_FILE even when the loop is
# interrupted or fails.

# Progress / error counters
total = widgets.IntText(description="Total", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)
unavailable = widgets.IntText(description="Unavailable", value=0)
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(total, done, unavailable, yes, no)

# Loop-invariant request settings.
API = "https://archive.org/wayback/available"
headers = {"accept": "application/json"}
REQUEST_TIMEOUT = 30  # seconds; client-side limit so a stalled connection cannot hang the run

# (row label, url, error) for every failed row, so failures can be inspected
# after the run instead of being silently discarded.
download_errors = []

try:
    for index in trange(0, df.shape[0]):
        # Read and write through the same row label so both always address
        # the same row, whatever the frame's index looks like.
        row_label = df.index[index]

        if df.at[row_label, "text_downloaded"]:
            done.value += 1
            continue

        story_url = df.at[row_label, "url"]
        # These are sent as query parameters to the API; "timeout" here is
        # not a client-side timeout.
        params = {
            "url": story_url,
            "timeout": "30",
            "closest": "either",
            "status_code": "200",
        }

        try:
            response = requests.get(
                API, params=params, headers=headers, timeout=REQUEST_TIMEOUT
            )
            # Treat HTTP error statuses as failures rather than parsing them.
            response.raise_for_status()
            snapshot = response.json().get("archived_snapshots", {}).get("closest")

            if snapshot:
                metadata = extract(url=snapshot.get("url"))
                df.at[row_label, "text"] = metadata["text_content"]
                df.at[row_label, "text_downloaded"] = True
                yes.value += 1

            else:
                unavailable.value += 1

        except Exception as e:
            # Best effort: one bad story must not stop the run, but keep the
            # reason so it can be reviewed (see `download_errors`).
            no.value += 1
            download_errors.append((row_label, story_url, repr(e)))

        # Short random pause between requests.
        # time.sleep(random.uniform(5.0, 10.0))
        time.sleep(random.uniform(0.001, 0.01))

finally:
    # Persist whatever was downloaded, including after an interrupt or an
    # unexpected failure, so the next run resumes instead of starting over.
    df.to_feather(FEATHER_FILE)

IntText(value=1538, description='Total')

IntText(value=0, description='Done')

IntText(value=0, description='Unavailable')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/1538 [00:00<?, ?it/s]

In [15]:
# Spot-check 50 random rows to see which stories have their text filled in.
df.sample(n=50)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
137,1387078156,Modernism Reboots at the Museums,https://www.nytimes.com/2019/09/09/arts/design...,2019,Supported by\nFall Preview\nModernism Reboots ...,True
518,1809672398,"Three Supermoons, Two Blood Moons And A ‘Ring ...",https://www.forbes.com/sites/jamiecartereurope...,2020,Are you one of the millions of people who star...,True
1068,2501626030,Florida lawmaker who sponsored 'don't say gay'...,https://www.google.com/url?rct=j&sa=t&url=http...,2022,,False
710,2129669830,The Margin: ‘Every step we take toward this ca...,http://www.marketwatch.com/news/story.asp?guid...,2021,,False
424,1721255237,A British teenager staged a sit-in on an Arcti...,http://rss.cnn.com/~r/rss/cnn_latest/~3/22fzL0...,2020,,False
129,1381216112,Global heating: geese shift migration stop-off...,https://www.theguardian.com/environment/2019/s...,2019,Barnacle geese are shifting their migratory pa...,True
1063,2499801425,Top 10 polar photobooks,https://www.theguardian.com/books/2022/dec/07/...,2022,As a former biologist turned natural history f...,True
311,1661740761,I was trapped on a boat in the Arctic for almo...,https://www.businessinsider.com/life-trapped-o...,2020,- Gina Jozef embarked on the MOSAiC Expedition...,True
790,2341083655,Secret Polar Bear Population Is Found Living i...,https://www.scientificamerican.com/article/sec...,2022,A secret population of polar bears in Greenlan...,True
736,2307590965,Rays' Wander Franco: The first 100 games | MLB...,https://www.google.com/url?rct=j&sa=t&url=http...,2022,,False
