# Libraries

In [1]:
import datetime
import json
import pandas as pd
import requests
import time
import random
import sys
import os
import os.path

from mcmetadata import extract
# pip install mediacloud-metadata

from tqdm.notebook import trange  # to display a progress bar
import ipywidgets as widgets
from IPython.display import display

# Init the connection (read the API key from the environment; never commit a key to the notebook)
# mc = mediacloud.api.DirectoryApi(os.environ["MEDIACLOUD_API_KEY"])

# Dataset

In [2]:
# Name of the query export to process. Other exports used previously:
#   "Query_NYT_5y_surveillance", "Query_NYT_13y_surveillance", "Query_US_5y_Svalbard"
query = "Query_AI-magazines-mediacloud-2021-24"

# The CSV is the raw export; the feather file is the working copy that the
# download cell keeps updating, so it takes precedence when it exists.
CSV_FILE = f"{query}.csv"
FEATHER_FILE = f"{query}.feather"

if not os.path.isfile(FEATHER_FILE):
    # First run: start from the CSV and add the bookkeeping columns.
    df = pd.read_csv(CSV_FILE)

    # The year is the first four characters of the publish_date string.
    df["year"] = df["publish_date"].str[:4]
    df["text"] = ""
    df["text_downloaded"] = False

    # Columns that could optionally be dropped here with df.drop([...], axis=1):
    #   ap_syndicated, collect_date, feeds, guid, media_id, media_name,
    #   media_url, metadata, language, processed_stories_id, publish_date,
    #   story_tags, word_count
else:
    # Resume from the previously saved working copy.
    df = pd.read_feather(FEATHER_FILE)

df.sample(1)

Unnamed: 0,id,indexed_date,language,media_name,media_url,publish_date,title,url,year,text,text_downloaded
1176,c55f4491295d5624bfec2a64ba4dc33f683dd0d2940af4...,2024-02-25 09:27:42.634827,en,theverge.com,theverge.com,2023-06-30,European companies slam the EU’s incoming AI r...,https://www.theverge.com/2023/6/30/23779611/eu...,2023,,False


In [8]:
# Download the article text for every row that does not have it yet.
#
# For each row the Wayback Machine availability endpoint is asked for the
# closest archived snapshot of the row's URL. When a snapshot is returned, its
# text is extracted with mcmetadata's `extract` and stored in the frame, and
# the frame is checkpointed to FEATHER_FILE so an interrupted run can resume.
# Rows without a snapshot and rows that raised an error are left untouched
# (text_downloaded stays False), so a later run retries them.

# Request settings that do not change between rows.
API = "https://archive.org/wayback/available"
headers = {"accept": "application/json"}
# Client-side timeout in seconds. The "timeout" entry in `params` below is only
# a query parameter sent to the endpoint; without this value requests.get can
# block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 60

# Live counters shown above the progress bar.
total = widgets.IntText(description="Total", value=df.shape[0])
done = widgets.IntText(description="Done", value=0)
unavailable = widgets.IntText(description="Unavailable", value=0)
yes = widgets.IntText(description="Downloaded", value=0)
no = widgets.IntText(description="Errors", value=0)
display(total, done, unavailable, yes, no)

# url -> repr(exception) for every row that failed, so failures can be
# inspected after the run instead of being silently discarded.
download_errors = {}

# Column positions, so every read and write below is positional and addresses
# the same row regardless of what the frame's index labels are.
url_col = df.columns.get_loc("url")
text_col = df.columns.get_loc("text")
downloaded_col = df.columns.get_loc("text_downloaded")

for index in trange(0, df.shape[0]):
    if df.iat[index, downloaded_col]:
        done.value += 1
        continue

    url = df.iat[index, url_col]
    params = {
        "url": url,
        "timeout": "300",
        "closest": "either",
        "status_code": "200",
    }

    try:
        response = requests.get(
            API, params=params, headers=headers, timeout=REQUEST_TIMEOUT
        )
        # Treat HTTP error responses as errors rather than as "no snapshot".
        response.raise_for_status()
        snapshot = response.json().get("archived_snapshots", {}).get("closest")
        text = extract(url=snapshot.get("url"))["text_content"] if snapshot else None

    except Exception as error:
        # Best effort: one bad row must not stop the whole run, but keep the
        # reason so it can be reviewed afterwards.
        no.value += 1
        download_errors[url] = repr(error)
        continue

    if not snapshot:
        unavailable.value += 1
        continue

    df.iat[index, text_col] = text
    df.iat[index, downloaded_col] = True
    yes.value += 1

    # Checkpoint only when the frame actually changed; rewriting the whole
    # file for skipped, unavailable or failed rows would store identical data.
    df.to_feather(FEATHER_FILE)

    # Optional throttling between requests:
    # time.sleep(random.uniform(1, 20)) # This is not bad to run in the night
    # time.sleep(random.uniform(.1, 1))

# Make sure the working copy exists even if no row was downloaded in this run.
df.to_feather(FEATHER_FILE)

IntText(value=2633, description='Total')

IntText(value=0, description='Done')

IntText(value=0, description='Unavailable')

IntText(value=0, description='Downloaded')

IntText(value=0, description='Errors')

  0%|          | 0/2633 [00:00<?, ?it/s]

In [9]:
# Show ten random rows of the working frame.
# (Manual checkpoint, if needed: df.to_feather(FEATHER_FILE))
df.sample(n=10)

Unnamed: 0,id,indexed_date,language,media_name,media_url,publish_date,title,url,year,text,text_downloaded
1487,87b7bbba6a4991a8c6a5c7d8c69a6ef80e6f755de02603...,2024-03-02 14:28:53.118418,en,wired.com,wired.com,2023-03-15,AI-Generated Voice Deepfakes Aren’t Scary Good...,https://www.wired.com/story/ai-voice-deep-fakes/,2023,Amid the generative-artificial-intelligence fr...,True
1081,03c88b832f1a1280a47951815409071c2c1a15cf912444...,2024-02-23 09:03:46.685707,en,theatlantic.com,theatlantic.com,2023-08-07,Here Comes the Second Year of AI College,https://www.theatlantic.com/ideas/archive/2023...,2023,Here Comes the Second Year of AI College\nUniv...,True
1521,c12759273a6c5f6d2c7f3a184b33643886e15f352280a8...,2024-03-07 09:26:06.922598,en,theatlantic.com,theatlantic.com,2023-03-03,What Isaac Asimov Can Teach Us About AI,https://www.theatlantic.com/books/archive/2023...,2023,What Isaac Asimov Can Teach Us About AI\nThe s...,True
70,c6312dee0ea4f2ea737a30fc275e268ecb0d8aad01a32d...,2024-08-17 08:18:38.497359,en,wired.com,wired.com,2024-08-16,Elon Musk Is No Climate Hero,https://www.wired.com/story/elon-musk-trump-cl...,2024,WIRED has been writing about Elon Musk—he of t...,True
712,5799e61424d469a1ea3549f9fa5770189e3a7f310da488...,2023-12-16 07:28:33,en,wired.com,wired.com,2023-12-14,My Surprisingly Unbiased Week With Elon Musk’s...,https://www.wired.com/story/fast-forward-elon-...,2023,Some Elon Musk enthusiasts have been alarmed t...,True
393,f7d2f1a34f054a029bcd58d4f9137f6df5c63ab96d6830...,2024-05-04 06:18:02.905615,en,theatlantic.com,theatlantic.com,2024-05-03,Trump’s VP Search Is Different This Time,https://www.theatlantic.com/newsletters/archiv...,2024,Trump’s VP Search Is Different This Time\nHis ...,True
1913,606da52d0c4e7e8f6c1a8d42a8e2d22d9a264235b5264d...,2024-05-08 15:19:30.052826,en,wired.com,wired.com,2022-06-08,China’s Jidu Robo-1 Looks Like It’s From the F...,https://www.wired.com/story/baidu-jidu-robo-1-...,2022,Application\nAutonomous driving\nHardware\nEnd...,True
519,1292aec5d039811d8110a19de2ea0e200a931d7e572cda...,2024-03-14 04:49:21.107591,en,theverge.com,theverge.com,2024-03-12,DHS wants $101 million to upgrade its border s...,https://www.theverge.com/2024/3/12/24098881/dh...,2024,If the Department of Homeland Security (DHS) g...,True
735,9f1298bcaa7abfbbb037a7c1da3109d7873650035abbfe...,2024-02-15 11:08:52.882223,en,technologyreview.com,technologyreview.com,2023-12-01,A high school’s deepfake porn scandal is pushi...,https://www.technologyreview.com/2023/12/01/10...,2023,A high school’s deepfake porn scandal is pushi...,True
1311,b6035f42fc4da16f3a568dadd675a3b04e0f01844880fe...,2024-02-28 01:13:41.957806,en,theverge.com,theverge.com,2023-05-16,"CNET staff are unionizing, citing editorial in...",https://www.theverge.com/2023/5/16/23723959/cn...,2023,Months after CNET came under fire for quietly ...,True


In [6]:
# Dimensions of the working frame as (rows, columns).
df.shape

(2633, 11)

In [10]:
# Export the working frame to CSV without writing the index as a column.
df.to_csv(
    "bd_AI_magazines.csv",
    index=False,
)