In [1]:
import json
import os
from os.path import exists
import pandas as pd
import requests
from tqdm import tqdm

# Set up API access for HTTPS requests to Wikipedia/Wikidata

In [2]:
# Location of the credentials file, resolved against the current working directory.
CONFIG_PATH = os.path.join(os.path.abspath(""), "config.json")
# Second name bound to the very same path string as CONFIG_PATH.
SCRAPED_DATA_PATH = CONFIG_PATH

# TMDB endpoint prefixes (not referenced by the cells visible in this notebook).
BASE_URL = "https://api.themoviedb.org/3/movie/"
BASE_URL_TOP_RATED = f"{BASE_URL}top_rated?"

In [3]:
# Echo the resolved config path as a quick sanity check.
CONFIG_PATH

'/home/aleksandar-lukic/school/02805_Final_project/code/config.json'

In [4]:
# Load the config file with API credentials.
# Keys read below: config["WIKI"] with WIKI_APPLICATION_NAME, WIKI_EMAIL,
# WIKI_BEARER_KEY, WIKI_CLIENT_ID and WIKI_CLIENT_SECRET.
if exists(CONFIG_PATH):
    # Explicit encoding so parsing does not depend on the platform's default locale.
    with open(CONFIG_PATH, encoding="utf-8") as config_file:
        config = json.load(config_file)

    wiki_config = config["WIKI"]
    APP_NAME = wiki_config["WIKI_APPLICATION_NAME"]
    AUTH_USER = wiki_config["WIKI_EMAIL"]
    AUTH_TOKEN = wiki_config["WIKI_BEARER_KEY"]
    AUTH_CLIENT_ID = wiki_config["WIKI_CLIENT_ID"]
    AUTH_CLIENT_SECRET = wiki_config["WIKI_CLIENT_SECRET"]
    # The original cell exposed this value under a misspelled name; keep that
    # name bound so any code relying on it continues to work.
    AUTH_CLIENT_SECRECT = AUTH_CLIENT_SECRET

    # Define the headers to include the authentication token
    HEADERS = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "User-Agent": f"{APP_NAME} ({AUTH_USER})",
    }
    print(f"Config found - HEADER generated from:\n{CONFIG_PATH}")

else:
    # NOTE: HEADERS stays undefined on this path, so cells that use it will
    # raise NameError until a config file is provided.
    print("Config not found!")

Config found - HEADER generated from:
/home/aleksandar-lukic/school/02805_Final_project/code/config.json


# Read features dataset from .csv

In [5]:
# Absolute path of the TMDB features CSV, resolved against the current working directory.
SCRAPED_FEATURES_PATH = os.path.join(
    os.path.abspath(""),
    "TMDB_scraped_features.csv",
)

In [6]:
# Echo the resolved features CSV path as a quick sanity check.
SCRAPED_FEATURES_PATH

'/home/aleksandar-lukic/school/02805_Final_project/code/TMDB_scraped_features.csv'

In [7]:
# Load the scraped TMDB features table that drives the Wikipedia scrape below.
features_df = pd.read_csv(filepath_or_buffer=SCRAPED_FEATURES_PATH)

In [8]:
# Peek at the Wikidata id of the first feature row.
features_df["wikidata_id"].iloc[0]

'Q172241'

# Scrape Wikipedia based on each feature's wikidata\_id

In [58]:
def get_wikipedia_page(wikidata_id, timeout=30):
    """Fetch the English Wikipedia URL and plain-text content for a Wikidata entity.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity id, e.g. "Q172241".
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    tuple
        ``(wikipedia_url, page_content)``. Both are ``None`` when the entity has
        no English Wikipedia sitelink; ``page_content`` alone is ``None`` when
        the Wikipedia API response carries no extract for the page.

    Raises
    ------
    requests.HTTPError
        If either request comes back with an error status.
    requests.Timeout
        If a request exceeds ``timeout``.
    """
    ### Step 1: Get the English Wikipedia page URL from Wikidata
    wikidata_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    # HEADERS must go in as the `headers` keyword: the second positional argument
    # of requests.get is `params`, which would put the bearer token in the URL
    # query string and send no User-Agent/Authorization header at all.
    response = requests.get(wikidata_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    entities = response.json()["entities"]

    # Check if the wikidata_id is valid.
    # Otherwise, take the id the response is actually keyed by
    # (presumably the target of a redirected/merged entity).
    if wikidata_id not in entities:
        old_wikidata_id = wikidata_id
        wikidata_id = list(entities)[0]
        print(f"Wikidata_id({old_wikidata_id}) was substituted with new wikidata_id({wikidata_id})")

    # Extract the english wikipedia page for the feature.
    # Otherwise, return None
    enwiki = entities[wikidata_id].get("sitelinks", {}).get("enwiki")
    if enwiki is None:
        return None, None

    # Get the title and wikipedia url of the feature
    title = enwiki["title"]
    wikipedia_url = enwiki["url"]

    ### Step 2: Fetch the content from the Wikipedia page
    wikipedia_api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": title,
    }
    # Headers are HTTP headers, not query parameters, so they are passed
    # alongside `params` rather than inside it.
    page_response = requests.get(
        wikipedia_api_url, params=params, headers=HEADERS, timeout=timeout
    )
    page_response.raise_for_status()
    pages = page_response.json()["query"]["pages"]
    # A page entry without an "extract" key yields None instead of a KeyError,
    # so one bad page does not abort a whole scraping batch.
    page_content = next(iter(pages.values())).get("extract")

    return wikipedia_url, page_content

In [52]:
# wiki_df = pd.DataFrame()

# # Create the tqdm progress bar
# progress_bar = tqdm(features_df.iterrows(), total=len(features_df), desc="Scraping Wikipedia")

# for idx, row in progress_bar:

#     url, content = get_wikipedia_page(row["wikidata_id"])
    
#     current_df = pd.DataFrame({
#         "feature_id": [row["feature_id"]],
#         "title": [row["title"]],
#         "url": [url],
#         "page_content": [content],
#         "content_size": [len(content)],
#         "wikidata_id": [row["wikidata_id"]]
#     })

#     wiki_df = pd.concat([wiki_df, current_df])

In [53]:
# Scrape Wikipedia for rows 0-999 of features_df.
batch_df = features_df.iloc[0:1000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_1000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

Scraping Wikipedia:   4%|█▋                                    | 43/1000 [00:38<12:07,  1.32it/s]

wikidata_id not in entities!
New wikidata_id found: Q115686450


Scraping Wikipedia: 100%|████████████████████████████████████| 1000/1000 [14:41<00:00,  1.13it/s]


In [56]:
# Write the rows 0-999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_1000.csv",
)
wiki_df_1000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 1000-1999 of features_df.
batch_df = features_df.iloc[1000:2000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_2000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

Scraping Wikipedia:  10%|███▊                                 | 103/1000 [01:23<13:03,  1.15it/s]

In [None]:
# Write the rows 1000-1999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_2000.csv",
)
wiki_df_2000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 2000-2999 of features_df.
batch_df = features_df.iloc[2000:3000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_3000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 2000-2999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_3000.csv",
)
wiki_df_3000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 3000-3999 of features_df.
batch_df = features_df.iloc[3000:4000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_4000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 3000-3999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_4000.csv",
)
wiki_df_4000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 4000-4999 of features_df.
batch_df = features_df.iloc[4000:5000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_5000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 4000-4999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_5000.csv",
)
wiki_df_5000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 5000-5999 of features_df.
batch_df = features_df.iloc[5000:6000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_6000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 5000-5999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_6000.csv",
)
wiki_df_6000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 6000-6999 of features_df.
batch_df = features_df.iloc[6000:7000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_7000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 6000-6999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_7000.csv",
)
wiki_df_7000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 7000-7999 of features_df.
batch_df = features_df.iloc[7000:8000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_8000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 7000-7999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_8000.csv",
)
wiki_df_8000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for rows 8000-8999 of features_df.
batch_df = features_df.iloc[8000:9000]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_9000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the rows 8000-8999 batch to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_9000.csv",
)
wiki_df_9000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Scrape Wikipedia for every remaining row of features_df from position 9000 onward.
batch_df = features_df.iloc[9000:]

# Create the tqdm progress bar
progress_bar = tqdm(batch_df.iterrows(), total=len(batch_df), desc="Scraping Wikipedia")

# Collect plain records and build the frame once after the loop: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
records = []
for idx, row in progress_bar:
    url, content = get_wikipedia_page(row["wikidata_id"])
    records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        # No size is recorded when the page is missing or empty.
        "content_size": len(content) if content else None,
        "wikidata_id": row["wikidata_id"],
    })

# Nullable Int64 keeps sizes integral (not 123.0) when some rows have no content.
wiki_df_10000 = pd.DataFrame(
    records,
    columns=["feature_id", "title", "url", "page_content", "content_size", "wikidata_id"],
).astype({"content_size": "Int64"})

In [None]:
# Write the final batch (rows 9000 onward) to its own CSV in the current working directory (row index omitted).
SCRAPED_WIKIPEDIA_PATH = os.path.join(
    os.path.abspath(""),
    "WIKI_scraped_pages_10000.csv",
)
wiki_df_10000.to_csv(path_or_buf=SCRAPED_WIKIPEDIA_PATH, index=False)

In [None]:
# Stack the ten batch frames into one table.
# pd.concat is a function that takes a list of frames; subscripting it
# (pd.concat[...]) raises TypeError before anything is combined.
wiki_df = pd.concat(
    [
        wiki_df_1000,
        wiki_df_2000,
        wiki_df_3000,
        wiki_df_4000,
        wiki_df_5000,
        wiki_df_6000,
        wiki_df_7000,
        wiki_df_8000,
        wiki_df_9000,
        wiki_df_10000,
    ],
    # Give the combined frame a fresh 0..n-1 index instead of repeating
    # each batch's own index labels.
    ignore_index=True,
)

In [None]:
# Order by feature, drop repeated rows, and renumber the index.
# The original subset contained an empty column name (""), which raises
# KeyError; "wikidata_id" is used as the second key here.
# NOTE(review): confirm this is the intended de-duplication key.
wiki_clean_df = (
    wiki_df
    # Stable sort so that keep="first" deterministically keeps the row that
    # appeared first in wiki_df among equal feature_ids.
    .sort_values(by="feature_id", ascending=True, kind="stable")
    .drop_duplicates(subset=["feature_id", "wikidata_id"], keep="first")
    # drop=True discards the old index directly instead of materialising it
    # as an "index" column that then has to be dropped again.
    .reset_index(drop=True)
)

In [None]:
# Destination of the combined Wikipedia scrape, in the current working directory.
SCRAPED_WIKIPEDIA_PATH = os.path.join(os.path.abspath(""), "WIKI_scraped_pages.csv")
# Echo the path that was just built (the original echoed the features input path instead).
SCRAPED_WIKIPEDIA_PATH

In [None]:
# Persist the cleaned (sorted, de-duplicated) table; the original wrote the raw
# wiki_df, leaving wiki_clean_df computed but never saved.
wiki_clean_df.to_csv(SCRAPED_WIKIPEDIA_PATH, index=False)