In [1]:
%%capture
# tqdm is imported by the notebook, so install it alongside the other packages.
%pip install scrapy requests pandas tqdm

In [2]:
import json
import os
from os.path import exists
import pandas as pd
import requests
# import scrapy
from tqdm import tqdm

# Setup API for HTTPS requests

### Authentication to use TMDB API

TMDB requires users to create an account to access its API. After creating an account, you can request a bearer token, which is then used to authenticate HTTPS requests. To use this code, the provided _config.json_ file contains the necessary credentials used in the request header. Finally, the header is passed with each request.

In [3]:
# Credentials file, expected in the directory the notebook is run from
# (os.path.abspath("") resolves to the current working directory).
# SCRAPED_DATA_PATH is intentionally not defined here: it names the CSV output
# file and must never point at the credentials file.
CONFIG_PATH = os.path.join(os.path.abspath(""), "config.json")

# TMDB "top rated movies" endpoint. The string is kept unchanged so every
# existing use of BASE_URL behaves the same.
BASE_URL = "https://api.themoviedb.org/3/movie/top_rated?"

In [4]:
# Load the config file with API credentials.
# Fail fast with a clear error: later cells need HEADERS and AUTH_TOKEN, so
# carrying on without the config would only surface as a NameError further down.
if not exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found! Expected it at: {CONFIG_PATH}")

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(CONFIG_PATH, encoding="utf-8") as config_file:
    config = json.load(config_file)

# A missing entry raises KeyError naming the absent field.
APP_NAME = config["TMDB_APPLICATION_NAME"]
AUTH_USER = config["TMDB_EMAIL"]
AUTH_TOKEN = config["TMDB_BEARER_KEY"]

# Define the headers to include the authentication token
HEADERS = {
    "accept": "application/json",
    "Authorization": f"Bearer {AUTH_TOKEN}",
}

In order to check if authentication is valid, the response should return 200:

In [5]:
# Validate the bearer token against TMDB's dedicated authentication endpoint.
# BASE_URL + "authentication" would produce ".../movie/top_rated?authentication",
# i.e. the top-rated listing with a stray query string, not the auth check.
response = requests.get(
    "https://api.themoviedb.org/3/authentication",
    headers=HEADERS,
    timeout=30,  # do not hang indefinitely if the API is unreachable
)
print(response)

<Response [200]>


# Scraping TMDB

The TMDB web structure is page-based, meaning any search performed on their database returns results one page at a time. It is the user's responsibility to specify which page to request. Therefore, to retrieve all search results, it is necessary to determine the total number of pages.

## Finding the number of pages

The total number of pages can be found by requesting the default base URL and reading the _"total\_pages"_ field of the response.

In [6]:
# Request the default (first) page of the top-rated listing.
response = requests.get(BASE_URL, headers=HEADERS, timeout=30)
# Surface HTTP errors here rather than as a confusing error when the payload
# is parsed later.
response.raise_for_status()

In [7]:
# Parse the JSON payload once, then read the page count it reports.
first_page_payload = response.json()
TOTAL_PAGES = first_page_payload["total_pages"]
print(f"{TOTAL_PAGES=}")

TOTAL_PAGES=490


## Extracting features from pages

Using the total number of pages, all features can be extracted incrementally for each page and appended to a pandas DataFrame. The DataFrame can then be saved as a .csv file. 
Initially, the DataFrame is set up as follows:

In [8]:
# Empty DataFrame that will hold the scraped results.
df = pd.DataFrame()

Then, using a for-loop, the incremental page number is passed together with the other parameters to the HTTPS request sent via the TMDB API. 
The parameters used in the code return all the highest rated movies from TMDB and sort them by their average rating.
These are then stored as rows in the DataFrame.

In [9]:
# Create the tqdm progress bar
progress_bar = tqdm(range(1, TOTAL_PAGES + 1), desc="Scraping TMDB")

# Collect one DataFrame per page and concatenate once after the loop.
# Calling pd.concat inside the loop re-copies all earlier rows on every
# iteration, and building from a fresh list makes the cell safe to re-run.
page_frames = []

for page in progress_bar:

    params = {
        "language": "en-US",
        "page": page,
        "sort_by": "vote_average.desc",
    }

    response = requests.get(
        BASE_URL,
        headers=HEADERS,
        params=params,
        timeout=30,  # do not hang indefinitely on a stalled connection
    )
    # Stop on an HTTP error instead of failing later on a payload that has
    # no "results" entry.
    response.raise_for_status()

    response_json = response.json()

    # One row per entry of "results"; the top-level "page" value is attached
    # to each of those rows.
    current_df = pd.json_normalize(
        response_json,
        record_path="results",
        meta=["page"],
    )
    page_frames.append(current_df)

# Per-page row labels are kept as-is (not renumbered here).
# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(page_frames) if page_frames else pd.DataFrame()

Scraping TMDB: 100%|██████████████████████████████████| 490/490 [02:13<00:00,  3.67it/s]


### Clean the data and reindex the rows

In [10]:
# Keep the first occurrence of every movie id, then renumber the rows from 0
# without keeping the old row labels as a column.
clean_df = (
    df
    .drop_duplicates(subset="id", keep="first")
    .reset_index(drop=True)
)

In [11]:
# Show the cleaned table as the cell's output.
clean_df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,page
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,201.361,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.708,27239,1
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",170.359,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.689,20689,1
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,85.193,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.600,12482,1
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,84.546,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.566,15892,1
4,False,/qqHQsStV6exghCM7zbObuYBiYxw.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,55.792,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.547,8691,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9775,False,/lcLyZzhB1ctfdH0hGBsTFrbflqP.jpg,"[28, 14, 27]",12142,en,Alone in the Dark,Edward Carnby is a private investigator specia...,13.378,/bSxrbVCyWW077zhtpuYlo3zgyug.jpg,2005-01-28,Alone in the Dark,False,3.246,606,490
9776,False,/5V6jAFS0Q49SI07qjyFRMYlbfR9.jpg,[35],13805,en,Disaster Movie,"Over the course of one evening, an unsuspectin...",21.967,/3J8XKUfhJiNzwobUZVtizXYPe8b.jpg,2008-08-29,Disaster Movie,False,3.200,1024,490
9777,False,/aNUEHLNsNMprLZt6fjf5nqDq6er.jpg,"[27, 28, 53]",11059,en,House of the Dead,"Set on an island off the coast, a techno rave ...",9.978,/z2mDGbV4pLtsvSMNnmnSgoVZSWK.jpg,2003-04-11,House of the Dead,False,3.100,386,490
9778,False,/oHrrgAPEKpz0S1ofQntiZNrmGrM.jpg,"[28, 12, 14, 878, 53]",14164,en,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...",16.594,/sunS9xhPnFNP5wlOWrvbpBteAB.jpg,2009-03-12,Dragonball Evolution,False,2.891,2037,490


## Saving the scraped data to a .csv file

In [12]:
# Write the CSV into the directory the notebook is run from.
_csv_output_dir = os.path.abspath("")
SCRAPED_DATA_PATH = os.path.join(_csv_output_dir, "TMDB_scraped_data.csv")

In [13]:
# Echo the resolved output path as the cell's output.
SCRAPED_DATA_PATH

'C:\\Users\\Aleksandar\\School\\02805_Social_Graphs_and_Interactions\\FINAL_PROJECT\\TMDB_scraped_data.csv'

In [14]:
# Persist the cleaned table; the DataFrame's row labels are not written.
clean_df.to_csv(path_or_buf=SCRAPED_DATA_PATH, index=False)

# Downloading feature posters from TMDB 

### Create folder for images

To automate the process of fetching and storing the feature images from TMDB, a folder _"images"_ is created if it doesn't already exist.

In [15]:
# The "images" folder lives in the directory the notebook is run from.
_images_parent_dir = os.path.abspath("")
IMAGE_FOLDER_PATH = os.path.join(_images_parent_dir, "images")

In [17]:
# Create the image folder on first use and report which case applied.
if not os.path.exists(IMAGE_FOLDER_PATH):
    os.makedirs(IMAGE_FOLDER_PATH)
    print(f"Created folder:\n{IMAGE_FOLDER_PATH}")
else:
    print(f"Found folder:\n{IMAGE_FOLDER_PATH}")

Created folder:
C:\Users\Aleksandar\School\02805_Social_Graphs_and_Interactions\FINAL_PROJECT\images


### Iterate over the DataFrame to request the images
 
With the DataFrame complete, the _"poster\_path"_ column contains the path of each feature's poster image.

By appending these paths to the image base URL, the corresponding .jpg files can be retrieved and stored locally.

In [18]:
def save_feature_image(img, folder_path, feature_id):
    """Write a downloaded image to ``<folder_path>/<feature_id>_poster.jpg``.

    Parameters
    ----------
    img
        HTTP response holding the image. It must expose ``content`` (the raw
        bytes to write) and ``raise_for_status()``, as ``requests.Response``
        does.
    folder_path
        Directory the file is written into; it must already exist.
    feature_id
        Identifier used as the file name prefix.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    Exception
        Whatever ``img.raise_for_status()`` raises for a failed request
        (``requests.HTTPError`` for a ``requests.Response``). Nothing is
        written in that case.
    """
    # Refuse to store the body of a failed request under a .jpg name.
    img.raise_for_status()

    path = os.path.join(folder_path, f"{feature_id}_poster.jpg")
    with open(path, "wb") as f:
        f.write(img.content)
    return path

In [19]:
# Base URL that image paths are appended to when requesting image files.
BASE_URL_IMAGE = "https://image.tmdb.org/t/p/original"
HEADERS_IMG = {
    # "image/jpeg" is the registered media type for JPEG files;
    # "application/jpg" is not a registered type.
    "accept": "image/jpeg",
    # NOTE(review): the image host may not require the API bearer token;
    # confirm, and drop this header if it is unnecessary.
    "Authorization": f"Bearer {AUTH_TOKEN}",
}

In [20]:
# Demo only: work on the first five rows to show the download works.
# TODO: should be updated later!
sample_df = clean_df.head(5)
sample_df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,page
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,201.361,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.708,27239,1
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",170.359,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.689,20689,1
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,85.193,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.6,12482,1
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,84.546,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.566,15892,1
4,False,/qqHQsStV6exghCM7zbObuYBiYxw.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,55.792,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.547,8691,1


In [21]:
# Create the tqdm progress bar
progress_bar = tqdm(sample_df.iterrows(), total=len(sample_df), desc="Saving posters")

for idx, row in progress_bar:

    # Get the feature id and the path of its poster image
    feature_id = row["id"]
    feature_poster_path = row["poster_path"]

    # Update progress bar with current id
    progress_bar.set_postfix(current_id=feature_id)

    # A row may lack a usable poster path (e.g. None/NaN); skip it instead of
    # failing on the string concatenation below.
    if not isinstance(feature_poster_path, str) or not feature_poster_path:
        continue

    # Send HTTPS GET request to retrieve the image
    img = requests.get(
        BASE_URL_IMAGE + feature_poster_path,
        headers=HEADERS_IMG,
        timeout=30,  # do not hang indefinitely on a stalled download
    )

    # Only store successful downloads; report failures without breaking the
    # progress bar and carry on with the next row.
    if not img.ok:
        tqdm.write(f"Skipping {feature_id}: HTTP {img.status_code}")
        continue

    save_feature_image(img, IMAGE_FOLDER_PATH, feature_id)

Saving posters: 100%|█████████████████████| 5/5 [00:01<00:00,  2.97it/s, current_id=389]
