# Libraries

In [1]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
# pip install mediacloud-metadata

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

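# Init the connection (read the API key from the environment -- never commit it;
# any key that was previously pasted here should be revoked and regenerated)
# mc = mediacloud.api.DirectoryApi(os.environ["MEDIACLOUD_API_KEY"])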

# Dataset

In [2]:
# Base name of the dataset files to work on. The alternatives are kept here so
# switching dataset is a one-line change.
# query = "Query_NYT_5y_surveillance"
# query = "Query_NYT_13y_surveillance"
# query = "Query_US_5y_Svalbard"
query = "Query_AI-magazines-mediacloud-2021-24"

CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

if not os.path.isfile(FEATHER_FILE):
    # No feather file yet: start from the CSV and add the working columns.
    #   year            -> first four characters of publish_date
    #   text            -> empty placeholder for the article body
    #   text_downloaded -> flag marking rows whose text has been fetched
    df = pd.read_csv(CSV_FILE).assign(
        year=lambda frame: frame["publish_date"].str[:4],
        text="",
        text_downloaded=False,
    )

    # Optional clean-up, currently disabled: drop the columns that are not
    # needed downstream.
    # df = df.drop(
    #     ["ap_syndicated", "collect_date", "feeds", "guid", "media_id",
    #      "media_name", "media_url", "metadata", "language",
    #      "processed_stories_id", "publish_date", "story_tags", "word_count"],
    #     axis=1,
    # )

else:
    # A feather file exists: load it instead of rebuilding from the CSV.
    df = pd.read_feather(FEATHER_FILE)

# Show one random row as a quick sanity check of the loaded frame.
df.sample(1)

Unnamed: 0,id,indexed_date,language,media_name,media_url,publish_date,title,url,year,text,text_downloaded
2111,ac9b3ff1d609dbf76c995306a7ae8e00d0955c0ef32ee9...,2024-09-08 13:19:19.450227,en,technologyreview.com,technologyreview.com,2021-11-09,I Was There When: Facebook put profits over sa...,https://www.technologyreview.com/2021/11/09/10...,2021,,False


In [3]:
# Download the text of every article that does not have it yet.
#
# For each row, the Wayback Machine availability endpoint is asked for a
# snapshot of the article URL. When one is returned, `extract` is called on the
# snapshot URL and its "text_content" is stored in the frame. Rows already
# flagged `text_downloaded` are skipped, so the cell can be re-run to resume.

API = "https://archive.org/wayback/available"
headers = {"accept": "application/json"}

# Client-side timeout (seconds) for the availability request. Without it a
# stalled connection blocks the loop forever. The "timeout" entry in `params`
# below is only a query parameter sent to the API, not a requests timeout.
REQUEST_TIMEOUT = 300

# Number of newly downloaded rows between two checkpoints. Lower it to lose
# less work on a hard kernel crash, raise it to write the file less often.
SAVE_EVERY = 25


def save_checkpoint():
    """Persist `df` to FEATHER_FILE without ever leaving a half-written file.

    The frame is first written to a temporary sibling file and then moved over
    the real one with `os.replace`, so an interruption during the write cannot
    corrupt the existing checkpoint.
    """
    tmp_file = FEATHER_FILE + ".tmp"
    df.to_feather(tmp_file)
    os.replace(tmp_file, FEATHER_FILE)


# Counters shown live while the loop runs.
total = widgets.IntText(description="Total", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)
unavailable = widgets.IntText(description="Unavailable", value=0)
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(total, done, unavailable, yes, no)

# (url, error) pairs for every row that raised, so failures can be inspected
# afterwards instead of being reduced to a bare count.
failures = []

unsaved = 0  # rows downloaded since the last checkpoint

try:
    for index in trange(0, df.shape[0]):
        # Resolve the row label once and use it for both reads and writes, so
        # the two can never address different rows; `df.at` also avoids
        # building a full row Series the way `df.iloc[index]` does.
        label = df.index[index]

        if df.at[label, "text_downloaded"]:
            done.value += 1
            continue

        url = df.at[label, "url"]
        params = {
            "url": url,
            "timeout": "300",
            "closest": "either",
            "status_code": "200",
        }

        try:
            response = requests.get(
                API, params=params, headers=headers, timeout=REQUEST_TIMEOUT
            )
            # Turn HTTP errors into exceptions so they are recorded with a
            # meaningful message rather than as a JSON decoding failure.
            response.raise_for_status()
            snapshot = response.json().get("archived_snapshots", {}).get("closest")

            if snapshot:
                metadata = extract(url=snapshot.get("url"))
                df.at[label, "text"] = metadata["text_content"]
                df.at[label, "text_downloaded"] = True
                yes.value += 1
                unsaved += 1

            else:
                unavailable.value += 1

        except Exception as e:
            # Best effort: one bad article must not stop the whole run, but
            # keep the reason around for later inspection.
            no.value += 1
            failures.append((url, repr(e)))

        # time.sleep(random.uniform(1, 20)) # This is not bad to run in the night
        # time.sleep(random.uniform(.1, 1))

        if unsaved >= SAVE_EVERY:
            save_checkpoint()
            unsaved = 0

finally:
    # Also runs on KeyboardInterrupt, so stopping the cell by hand keeps
    # everything downloaded so far.
    if unsaved:
        save_checkpoint()

IntText(value=2633, description='Total')

IntText(value=0, description='Done')

IntText(value=0, description='Unavailable')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/2633 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Spot-check ten random rows to see which ones already carry downloaded text.
# df.to_feather(FEATHER_FILE)  # uncomment to force a manual save
df.sample(n=10)

Unnamed: 0,id,indexed_date,language,media_name,media_url,publish_date,title,url,year,text,text_downloaded
898,456d0f10b8419a8f9166162b8904f8515269fed57dd8af...,2024-02-19 10:52:53.991158,en,theverge.com,theverge.com,2023-10-11,Firefox tests a built-in checker for fake reviews,https://www.theverge.com/2023/10/11/23912457/f...,2023,,False
2276,47a086428051fe71042975f9ef963e42dfbd3597b21f20...,2024-08-26 18:38:21.134252,en,theverge.com,theverge.com,2021-08-19,What Twitter should copy from Tinder,https://www.theverge.com/22631996/what-twitter...,2021,,False
1784,07b653c3b5465ba11c9434527be736c80bb20d3f25412f...,2024-04-30 20:59:18.034074,en,theverge.com,theverge.com,2022-09-30,How to watch Tesla’s 2022 AI Day event,https://www.theverge.com/2022/9/30/23374542/te...,2022,,False
284,b5ec122a4c39a06c351351c5a4dc34fe6faf465573d621...,2024-06-10 15:25:50.997065,en,technologyreview.com,technologyreview.com,2024-06-10,Digital twins are helping scientists run the w...,https://www.technologyreview.com/2024/06/10/10...,2024,Digital twins are helping scientists run the w...,True
765,3d8cc8f1b112abe9aa058edcffa07ccfc8f533d93eb43b...,2024-02-16 09:25:02.106514,en,theatlantic.com,theatlantic.com,2023-11-20,Christopher Nolan on the Promise and Peril of ...,https://www.theatlantic.com/technology/archive...,2023,,False
776,71ccdf9858977062d838bcf2a829dedbdfd7a2e878b04d...,2024-02-16 13:33:27.883664,en,technologyreview.com,technologyreview.com,2023-11-17,This company is building AI for African languages,https://www.technologyreview.com/2023/11/17/10...,2023,This company is building AI for African langua...,True
308,af1ac2dea4406a7032f85e701163a763bf81a8d7edbee0...,2024-06-04 15:18:56.956510,en,technologyreview.com,technologyreview.com,2024-06-04,What I learned from the UN’s “AI for Good” summit,https://www.technologyreview.com/2024/06/04/10...,2024,,False
1735,4f532701126820902770b87d9e4cf96f208c415d6900ec...,2024-04-28 20:14:57.035418,en,technologyreview.com,technologyreview.com,2022-11-02,Iron batteries might provide the long-term sto...,https://www.technologyreview.com/2022/11/02/10...,2022,,False
1346,b7a56aaffeaadcf287708ea35ad626a63ef88f68582f82...,2024-02-28 17:14:37.169711,en,theverge.com,theverge.com,2023-05-05,Microsoft is reportedly helping AMD expand int...,https://www.theverge.com/2023/5/5/23712242/mic...,2023,,False
1361,6666894592da5c51f38833c143d6b5f91ebea6817ce77e...,2024-02-28 21:55:17.169842,en,theverge.com,theverge.com,2023-05-02,How Bluesky should capitalize on its viral suc...,https://www.theverge.com/2023/5/2/23707877/blu...,2023,,False
