# Libraries

In [1]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
import mediacloud.api

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection
# The Media Cloud API key is a secret, so it is read from the environment
# instead of being stored in the notebook. Any key that was previously
# committed here should be treated as leaked and revoked.
_mc_api_key = os.environ.get("MEDIACLOUD_API_KEY")
if not _mc_api_key:
    raise RuntimeError(
        "Set the MEDIACLOUD_API_KEY environment variable before running this notebook."
    )
mc = mediacloud.api.DirectoryApi(_mc_api_key)

# Dataset

In [8]:
# Name of the exported story list to work on. Other exports used before:
#   "Query_NYT_5y_surveillance"
#   "Query_NYT_13y_surveillance"
query = "Query_US_5y_Svalbard"

CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

if not os.path.isfile(FEATHER_FILE):
    # No Feather file yet: build the working frame from the CSV export.
    #  - "year" is the first four characters of "publish_date"
    #  - "text" starts empty and "text_downloaded" starts False
    #  - the listed columns are dropped
    df = (
        pd.read_csv(CSV_FILE)
        .assign(
            year=lambda frame: frame["publish_date"].str[:4],
            text="",
            text_downloaded=False,
        )
        .drop(
            columns=[
                "ap_syndicated",
                "collect_date",
                "feeds",
                "guid",
                "media_id",
                "media_name",
                "media_url",
                "metadata",
                "language",
                "processed_stories_id",
                "publish_date",
                "story_tags",
                "word_count",
            ]
        )
    )
else:
    # A Feather file for this query already exists: load it as-is.
    df = pd.read_feather(FEATHER_FILE)

# Show one random row as a quick look at the frame.
df.sample(n=1)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
841,2362997804,The hot game on TikTok: Guessing locations on ...,https://www.washingtonpost.com/video-games/202...,2022,,False


In [13]:
# Progress counters, displayed as live widgets:
#   ToDo       - total number of rows in df
#   Done       - rows skipped because their text was already downloaded
#   Downloaded - rows whose text was fetched in this run
#   Errors     - rows whose snapshot lookup or text extraction failed
todo = widgets.IntText(description="ToDo", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(todo, done, yes, no)

# Loop-invariant request settings for the Wayback Machine availability API.
API = "https://archive.org/wayback/available"
headers = {"accept": "application/json"}
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever
SLEEP_RANGE = (.1, 1.0)  # pause between rows; a slower (5.0, 10.0) was used before

try:
    for position in trange(0, df.shape[0]):
        # Resolve the row label once so reads and writes address the same row
        # even if df does not carry a default 0..n-1 index.
        label = df.index[position]

        if df.at[label, "text_downloaded"]:
            done.value += 1
            continue

        params = {
            "url": df.at[label, "url"],
            "timeout": "30",
            "closest": "either",
            "status_code": "200",
        }

        # Look up the closest archived snapshot. A network failure or a
        # non-JSON reply counts as an error for this row instead of aborting
        # the whole run.
        try:
            response = requests.get(
                API, params=params, headers=headers, timeout=REQUEST_TIMEOUT
            )
            # Named `payload`, not `json`, so the imported json module is not shadowed.
            payload = response.json()
            snapshot = payload.get("archived_snapshots", {}).get("closest")
        except (requests.RequestException, ValueError):
            snapshot = None
            no.value += 1

        # Any snapshot returned is used; its "available" flag is not checked.
        if snapshot:
            try:
                metadata = extract(url=snapshot.get("url"))

                df.at[label, "text"] = metadata["text_content"]
                df.at[label, "text_downloaded"] = True

                yes.value += 1

            except Exception:
                # Deliberate best-effort: whatever goes wrong while fetching or
                # parsing the snapshot, count it and move on to the next row.
                no.value += 1

        time.sleep(random.uniform(*SLEEP_RANGE))

finally:
    # Persist progress even when the loop is interrupted or fails, so that
    # already-downloaded rows are skipped on the next run.
    df.to_feather(FEATHER_FILE)

IntText(value=1538, description='ToDo')

IntText(value=0, description='Done')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/1538 [00:00<?, ?it/s]

In [4]:
# Display 50 randomly chosen rows of df.
df.sample(n=50)

Unnamed: 0,stories_id,title,url,year,text,text_downloaded
399,1696567108,"Get naked, and other advice to fend off a pola...",https://nypost.com/2020/08/29/get-naked-how-to...,2020,"The other day, a polar bear killed a 38-year-o...",True
1526,2822743220,Linder adds three players for UW men's hoops |...,https://www.google.com/url?rct=j&sa=t&url=http...,2023,,False
797,2346347998,Russia and China eye NATO's 'Arctic Achilles h...,https://www.rawstory.com/russia-and-china-eye-...,2022,"Russian flags flap in the stiff polar breeze, ...",True
1131,2534360389,Walrus detectives and the Year of the Tiger: W...,https://www.theguardian.com/artanddesign/2023/...,2023,,False
1334,2646004025,Bill restricting Chinese from buying Florida l...,https://www.google.com/url?rct=j&sa=t&url=http...,2023,,False
1535,2832352708,"Precious water: As more of the world thirsts, ...",https://www.newsday.com/news/nation/water-luxu...,2023,"Precious water: As more of the world thirsts, ...",True
599,1911870758,TV tonight: Guy Martin takes to the skies in a...,https://www.theguardian.com/tv-and-radio/2021/...,2021,"Guy Martin’s Battle of Britain\n9pm, Channel 4...",True
1306,2633545662,Rays' Jeffrey Springs to have Tommy John surge...,https://www.google.com/url?rct=j&sa=t&url=http...,2023,,False
16,1142246399,"Parts of Austria, Southern Germany Sink Deeper...",https://www.usnews.com/news/world/articles/201...,2019,"Parts of Austria, Southern Germany Sink Deeper...",True
1280,2625880598,Rays pitcher Jeffrey Springs to miss at least ...,https://www.google.com/url?rct=j&sa=t&url=http...,2023,,False
