In [1]:
import json
import os
from os.path import exists
import pandas as pd
import requests
from tqdm import tqdm

# Setup API for HTTPS requests through Wikipedia/Wikidata

In [2]:
# Absolute location of the JSON config file, resolved against the current
# working directory.
CONFIG_PATH = os.path.abspath("config.json")
# Second name bound to the very same path string as CONFIG_PATH.
SCRAPED_DATA_PATH = CONFIG_PATH

# TMDB movie endpoint and the "top rated" listing derived from it.
BASE_URL = "https://api.themoviedb.org/3/movie/"
BASE_URL_TOP_RATED = f"{BASE_URL}top_rated?"

In [3]:
# Echo the resolved config path so the reader can see which file will be read.
CONFIG_PATH

'/home/aleksandar-lukic/school/02805_Final_project/code/config.json'

In [4]:
# Load the config file with API credentials
if not exists(CONFIG_PATH):
    # Fail fast: every request made later in the notebook reads HEADERS, so
    # carrying on without the config would only surface as a NameError in the
    # middle of the scraping run.
    raise FileNotFoundError(f"Config not found! Expected it at:\n{CONFIG_PATH}")

with open(CONFIG_PATH) as config_file:
    config = json.load(config_file)

# All credentials live under the "WIKI" section of the config.
wiki_config = config["WIKI"]
APP_NAME = wiki_config["WIKI_APPLICATION_NAME"]
AUTH_USER = wiki_config["WIKI_EMAIL"]
AUTH_TOKEN = wiki_config["WIKI_BEARER_KEY"]
AUTH_CLIENT_ID = wiki_config["WIKI_CLIENT_ID"]
# NOTE: the variable name is misspelled ("SECRECT"); it is kept as-is so that
# any code referring to it keeps working.
AUTH_CLIENT_SECRECT = wiki_config["WIKI_CLIENT_SECRET"]

# Define the headers to include the authentication token
HEADERS = {
    "Authorization": f"Bearer {AUTH_TOKEN}",
    "User-Agent": f"{APP_NAME} ({AUTH_USER})",
}
print(f"Config found - HEADER generated from:\n{CONFIG_PATH}")

Config found - HEADER generated from:
/home/aleksandar-lukic/school/02805_Final_project/code/config.json


# Read features dataset from .csv

In [5]:
# Absolute path of the features CSV, resolved against the current working directory.
SCRAPED_FEATURES_PATH = os.path.abspath("TMDB_scraped_features.csv")

In [6]:
# Echo the resolved CSV path so the reader can see which file will be loaded.
SCRAPED_FEATURES_PATH

'/home/aleksandar-lukic/school/02805_Final_project/code/TMDB_scraped_features.csv'

In [7]:
# Read the features CSV at SCRAPED_FEATURES_PATH into a DataFrame; later cells
# read its "wikidata_id", "feature_id" and "title" columns.
features_df = pd.read_csv(SCRAPED_FEATURES_PATH)

In [8]:
# Peek at the Wikidata id stored in the first row of the features table.
features_df["wikidata_id"].iloc[0]

'Q172241'

# Scrape Wikipedia based on each feature's wikidata\_id

In [9]:
def get_wikipedia_page(wikidata_id, timeout=30):
    """Resolve a Wikidata id to its English Wikipedia page and fetch the text.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity id, e.g. ``"Q172241"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    tuple
        ``(wikipedia_url, page_content)``. Both are ``None`` when the entity
        has no ``enwiki`` sitelink. ``page_content`` is ``None`` when the
        Wikipedia API response carries no ``extract`` for the page.

    Raises
    ------
    requests.HTTPError
        If either request returns an error status (via ``raise_for_status``).
    requests.Timeout
        If a request does not complete within ``timeout`` seconds.
    """
    ### Step 1: Get the English Wikipedia page URL from Wikidata
    wikidata_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    # HEADERS must go in through the `headers` keyword: the second positional
    # argument of requests.get is `params`, which would put the bearer token
    # into the URL query string instead of the HTTP headers.
    response = requests.get(wikidata_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Check if the wikidata_id is valid.
    # Otherwise, take alternative id
    if wikidata_id not in data["entities"]:
        old_wikidata_id = wikidata_id
        wikidata_id = list(data["entities"])[0]
        print(f"Wikidata_id({old_wikidata_id}) was substituted with new wikidata_id({wikidata_id})")

    # Extract the english wikipedia page for the feature.
    # Otherwise, return None
    sitelinks = data["entities"][wikidata_id].get("sitelinks", {})
    if "enwiki" not in sitelinks:
        return None, None

    # Get the title and wikipedia url of the feature
    title = sitelinks["enwiki"]["title"]
    wikipedia_url = sitelinks["enwiki"]["url"]

    ### Step 2: Fetch the content from the Wikipedia page
    wikipedia_api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": title,
    }
    # Headers are passed separately from the query parameters so that they are
    # sent as real HTTP headers rather than serialized into the URL.
    page_response = requests.get(
        wikipedia_api_url, params=params, headers=HEADERS, timeout=timeout
    )
    page_response.raise_for_status()
    pages = page_response.json()["query"]["pages"]
    # A page entry without an "extract" key yields None instead of a KeyError.
    page_content = next(iter(pages.values())).get("extract")

    return wikipedia_url, page_content

In [10]:
# One dict per feature is collected here and turned into a DataFrame once at
# the end; concatenating inside the loop would copy the whole frame on every
# iteration (quadratic in the number of features).
wiki_records = []

# Create the tqdm progress bar
progress_bar = tqdm(features_df.iterrows(), total=len(features_df), desc="Scraping Wikipedia")

for idx, row in progress_bar:

    url, content = get_wikipedia_page(row["wikidata_id"])

    # Size is only defined when some page text came back (None or "" -> no size).
    content_size = None
    if content:
        content_size = len(content)

    wiki_records.append({
        "feature_id": row["feature_id"],
        "title": row["title"],
        "url": url,
        "page_content": content,
        "content_size": content_size,
        "wikidata_id": row["wikidata_id"],
    })

# Explicit column list keeps the schema stable even if no rows were scraped.
wiki_df = pd.DataFrame(wiki_records, columns=[
    "feature_id",
    "title",
    "url",
    "page_content",
    "content_size",
    "wikidata_id",
])

# Nullable integer dtype: missing sizes stay missing (still caught by
# isnull()) while present sizes remain integers instead of becoming floats.
wiki_df["content_size"] = wiki_df["content_size"].astype("Int64")

Scraping Wikipedia:   0%|▏                                   | 42/9557 [00:35<2:01:14,  1.31it/s]

Wikidata_id(Q117705982) was substituted with new wikidata_id(Q115686450)


Scraping Wikipedia:  14%|████▊                             | 1348/9557 [17:07<1:26:17,  1.59it/s]

Wikidata_id(Q123023029) was substituted with new wikidata_id(Q119260191)


Scraping Wikipedia:  43%|███████████████▋                    | 4154/9557 [53:37<59:39,  1.51it/s]

Wikidata_id(Q20023116) was substituted with new wikidata_id(Q15929448)


Scraping Wikipedia: 100%|██████████████████████████████████| 9557/9557 [1:56:29<00:00,  1.37it/s]


### Extract rows without English Wikipedia pages

Some of the scraped results are empty. This happens when the API recognizes the "wikidata\_id" and finds its entry on Wikidata, but that entry has no English Wikipedia page; in that case the scraper simply returns None.

To handle this, a mask is created that selects every feature row in which at least one cell is empty, e.g. the "page\_content".

In [11]:
# Boolean mask: True for every row that has at least one missing cell.
mask = wiki_df.isnull().any(axis=1)

# Select the flagged rows, order them by feature id and renumber them from
# zero (the old index is turned into an "index" column and then discarded).
rows_to_remove = (
    wiki_df[mask]
    .sort_values(by="feature_id", ascending=True)
    .reset_index()
    .drop(columns="index")
)

rows_to_remove

Unnamed: 0,feature_id,title,url,page_content,content_size,wikidata_id
0,8880,Che: Part Two,,,,Q1068674
1,8881,Che: Part One,,,,Q860425
2,12622,Violent Cop,,,,Q1156893
3,13285,Barbie Fairytopia: Mermaidia,https://en.wikipedia.org/wiki/Barbie:_Mermaidia,,,Q2545667
4,13741,2 Alone in Paris,,,,Q3480756
...,...,...,...,...,...,...
114,1147400,"Miraculous World: Paris, Tales of Shadybug and...",,,,Q123158267
115,1172674,The Rat Catcher,,,,Q122581287
116,1172675,The Swan,,,,Q122580387
117,1172676,Poison,,,,Q122889419


The dropped rows are stored on the side for easier access if needed.

In [12]:
# Absolute path (relative to the current working directory) of the CSV that
# keeps the rows removed from the scraped data.
DROPPED_WIKIPEDIA_PATH = os.path.abspath("WIKI_dropped_pages.csv")

# Write the dropped rows without the DataFrame index column.
rows_to_remove.to_csv(DROPPED_WIKIPEDIA_PATH, index=False)

### Cleaning the scraped Wikipedia pages

With the mask in hand, the wiki\_df can be cleaned and reindexed.

In [13]:
# Feature ids of all rows that were flagged for removal.
rows = rows_to_remove["feature_id"]

# Keep only features whose id is not flagged, order by feature id, discard
# exact duplicate rows (first occurrence wins) and renumber from zero.
wiki_clean_df = (
    wiki_df.loc[~wiki_df["feature_id"].isin(rows)]
    .sort_values(by="feature_id", ascending=True)
    .drop_duplicates(
        subset=[
            "feature_id",
            "title",
            "url",
            "page_content",
            "content_size",
            "wikidata_id",
        ],
        keep="first",
    )
    .reset_index()
    .drop(columns="index")
)

In [14]:
# Display the cleaned frame as a visual sanity check before saving it.
wiki_clean_df

Unnamed: 0,feature_id,title,url,page_content,content_size,wikidata_id
0,2,Ariel,https://en.wikipedia.org/wiki/Ariel_(film),Ariel is a 1988 Finnish drama film directed an...,3346,Q658627
1,3,Shadows in Paradise,https://en.wikipedia.org/wiki/Shadows_in_Parad...,Shadows in Paradise (Finnish: Varjoja paratiis...,1073,Q2778460
2,5,Four Rooms,https://en.wikipedia.org/wiki/Four_Rooms,Four Rooms is a 1995 American anthology \nfarc...,9614,Q1137372
3,6,Judgment Night,https://en.wikipedia.org/wiki/Judgment_Night_(...,Judgment Night is a 1993 American action film ...,10087,Q1710973
4,11,Star Wars,https://en.wikipedia.org/wiki/Star_Wars_(film),Star Wars (later retitled Star Wars: Episode I...,76468,Q17738
...,...,...,...,...,...,...
9433,1155089,Justice League: Crisis on Infinite Earths Part...,https://en.wikipedia.org/wiki/Justice_League:_...,Justice League: Crisis on Infinite Earths is a...,17544,Q123268896
9434,1160164,TAYLOR SWIFT | THE ERAS TOUR,https://en.wikipedia.org/wiki/Taylor_Swift:_Th...,Taylor Swift: The Eras Tour is a 2023 American...,34856,Q122160859
9435,1161048,The Conference,https://en.wikipedia.org/wiki/The_Conference_(...,The Conference (Swedish: Konferensen) is a 202...,6860,Q116634285
9436,1184918,The Wild Robot,https://en.wikipedia.org/wiki/The_Wild_Robot,The Wild Robot is a 2024 American animated sci...,17428,Q124378349


### Save the scraped Wikipedia pages

In [15]:
# Absolute path (relative to the current working directory) of the output CSV.
SCRAPED_WIKIPEDIA_PATH = os.path.abspath("WIKI_scraped_pages.csv")
SCRAPED_WIKIPEDIA_PATH

'/home/aleksandar-lukic/school/02805_Final_project/code/WIKI_scraped_pages.csv'

In [16]:
# Write the cleaned pages to SCRAPED_WIKIPEDIA_PATH without the DataFrame index column.
wiki_clean_df.to_csv(SCRAPED_WIKIPEDIA_PATH, index = False)