In [None]:
import os
import requests
import pandas as pd
import numpy as np
import time
import random
from bs4 import BeautifulSoup
from pprint import pprint as pp

## Data import

In [None]:
# Load the MovieLens -> IMDb/TMDB id mapping. The external ids are read as
# strings so they are not coerced to numbers on load.
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv',
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str},
    usecols=['movieId', 'imdbId', 'tmdbId'],
)
# Preview the first row as the cell output.
links_df.head(1)

# Scrape metadata 
## Implement rotating proxies
We will choose an IP address at random from a list of free proxies sourced from https://free-proxy-list.net/ and https://proxyspace.pro/

We will also add a random fake user-agent header to the requests to make it look like the request is coming from a browser.

In [3]:
# Pool of desktop-browser User-Agent strings; one is picked at random when a
# scraping session is created.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.212 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/88.0.4324.96 Safari/537.36"
    ),
]

class TmdbWebScraper:
    """Scrape cast, crew and poster links from TMDB movie web pages.

    Requests are routed through a randomly chosen proxy from ``proxy_list``.
    Proxies that raise a request error are dropped from the list; once the
    list is empty scraping stops until ``update_proxy_list`` is called.

    Results and diagnostics are accumulated on the instance and exposed as
    DataFrames through the ``get_*_df`` methods.
    """

    BASE_URL = "https://www.themoviedb.org/movie/"

    def __init__(self,
                 proxy_list=None,
                 movie_dir_path='../data/ml-latest-small/',
                 movie_people_filename='movie_cast_and_crew.csv',
                 poster_links_filename='poster_links.csv',
                 max_retries=3):
        """
        Args:
            proxy_list: Iterable of proxy URLs; copied so the caller's
                collection is never mutated.
            movie_dir_path: Directory the CSV outputs are written to.
            movie_people_filename: File name for the cast & crew CSV.
            poster_links_filename: File name for the poster links CSV.
            max_retries: Maximum number of request attempts per movie.
        """
        self.proxy_list = list(proxy_list) if proxy_list else []
        self.movie_dir_path = movie_dir_path
        self.movie_people_filename = movie_people_filename
        self.poster_links_filename = poster_links_filename
        self.max_retries = max_retries

        self._movie_people = []
        self._poster_links = []
        self._failed_requests = []
        self._crew_not_found = []
        self._cast_not_found = []
        self._poster_link_not_found = []
        self._current_iteration = 0
        # A set gives O(1) "already scraped?" checks across thousands of movies.
        self._movieIds_already_checked = set()
        self.last_reported_percent = -1
        # Random per-request delay range, in seconds.
        self.delay_min = 0.25
        self.delay_max = 0.75
        self.num_links_to_scrape = 0
        self._scraping_stopped = False

    def update_proxy_list(self, new_proxy_list):
        """Updates the proxy list only if it is a non-empty list."""
        if not isinstance(new_proxy_list, list):
            print("Error: Provided proxy list is not a list.")
            return

        if not new_proxy_list:
            print("Error: Proxy list is empty. Please provide a list of proxies.")
            return

        self.proxy_list = new_proxy_list.copy()
        print(f"Proxy list updated successfully with {len(new_proxy_list)} proxies.")

        if self._scraping_stopped:
            print("You can now rerun `TmdbWebScraper.aggregate_movie_casts_and_crews()` to resume scraping.")
            self._scraping_stopped = False

    def _get_random_proxy(self):
        """Return a random proxy, or None (and flag scraping as stopped) if none remain."""
        if not self.proxy_list:
            self._scraping_stopped = True
            return None

        return random.choice(self.proxy_list)

    def _record_failure(self, movie_id, tmdb_id, reason):
        """Remember that a movie could not be fetched and why."""
        self._failed_requests.append((movie_id, tmdb_id, reason))

    def _fetch_movie_data(self, movie_id, tmdb_id, session):
        """Fetch the TMDB page HTML for one movie through a random proxy.

        Apparently TMDB has a rate limit of ~40 requests per 10 seconds, so a
        random delay between ``delay_min`` and ``delay_max`` seconds is slept
        before every attempt to make the scraper appear more human-like.

        Args:
            movie_id: MovieLens id, used for logging and failure tracking.
            tmdb_id: TMDB id appended to ``BASE_URL``.
            session: Object exposing ``get(url, proxies=..., timeout=...)``
                (a ``requests.Session``).

        Returns:
            The page HTML on success, otherwise None. Every failed movie is
            recorded exactly once in the failed-requests list, with the last
            failure reason seen (status code, "Soft Block", "Proxy Error" or
            "No Proxy Available").
        """
        if self._scraping_stopped:
            print(f"Skipping Movie {movie_id} due to lack of proxies.")
            self._record_failure(movie_id, tmdb_id, "No Proxy Available")
            return None

        url = f"{self.BASE_URL}{tmdb_id}/"
        last_failure = None

        for attempt in range(self.max_retries):
            time.sleep(random.uniform(self.delay_min, self.delay_max))

            proxy = self._get_random_proxy()
            if proxy is None:
                print(f"Skipping Movie {movie_id} due to lack of proxies.")
                self._record_failure(movie_id, tmdb_id, "No Proxy Available")
                return None

            try:
                response = session.get(url, proxies={"https": proxy}, timeout=5)
            except requests.exceptions.RequestException:
                print(f"Proxy failed: {proxy}, switching to a new proxy...")
                try:
                    self.proxy_list.remove(proxy)
                    print(f"Removed proxy: {proxy}")
                except ValueError:
                    print(f"Proxy {proxy} was not found in the list. It might have already been removed.")
                last_failure = "Proxy Error"
                continue

            # Compare the status code explicitly: a requests.Response is falsy
            # for every 4xx/5xx, so `if response:` would hide the real code.
            status_code = response.status_code

            if status_code == 200:
                if "captcha" in response.text.lower() or len(response.text) < 500:
                    # TMDB might block with a bot detector or short HTML
                    print(f"Soft Block Detected for Movie {movie_id} - Response Length: {len(response.text)}")
                    self._record_failure(movie_id, tmdb_id, "Soft Block")
                    return None
                return response.text

            if status_code == 429:
                # If we still hit the rate limit then we use an exponential backoff up to 10 sec
                # and then increase our random delay range
                wait_time = min(2 ** attempt, 10)
                print(f"Rate limit hit for Movie {movie_id}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                self.delay_max *= 1.2
            else:
                print(f"Request failed: MovieId {movie_id} - Status Code {status_code}")

            last_failure = status_code

        # All attempts used up: record the movie once so it can be retried
        # later instead of being silently dropped or duplicated.
        if last_failure is not None:
            self._record_failure(movie_id, tmdb_id, last_failure)
        return None

    def _scrape_movie_metadata_from_tmdb(self, movie_row, session):
        """Scrape people and poster link for a single movie.

        Args:
            movie_row: Row-like object with ``movieId`` and ``tmdbId`` attributes.
            session: Session passed through to ``_fetch_movie_data``.

        Returns:
            ``(people, poster_link)`` where ``people`` is a list of
            ``(movieId, name, role)`` tuples and ``poster_link`` is a
            ``(movieId, src)`` tuple or None. Both are empty/None when the
            movie was already scraped or the page could not be fetched.
        """
        movie_id = movie_row.movieId
        tmdb_id = movie_row.tmdbId
        people = []
        poster_link = None

        if movie_id in self._movieIds_already_checked:
            return people, poster_link

        html = self._fetch_movie_data(movie_id, tmdb_id, session)
        if not html:
            return people, poster_link

        soup = BeautifulSoup(html, 'html.parser')

        crew_list = soup.find('ol', class_='people no_image')
        if crew_list:
            for person in crew_list.find_all('li', class_='profile'):
                name_tag = person.find('a')
                jobs_tag = person.find('p', class_='character')
                # Skip malformed entries rather than aborting the whole run.
                if name_tag is None or jobs_tag is None:
                    continue
                name = name_tag.text
                for job in jobs_tag.text.split(','):
                    people.append((movie_id, name, job.strip()))
        else:
            print(f"Crew not found in html: MovieId {movie_id}")
            self._crew_not_found.append((movie_id, tmdb_id))

        cast_list = soup.find('ol', class_='people scroller')
        if cast_list:
            for actor in cast_list.find_all('li', class_='card'):
                # The actor's name is read from the alt text of the profile image.
                name_tag = actor.find('img', alt=True)
                actor_name = name_tag['alt'] if name_tag else "Unknown"
                people.append((movie_id, actor_name, 'Actor'))
        else:
            print(f"Cast not found in html: MovieId {movie_id}")
            self._cast_not_found.append((movie_id, tmdb_id))

        poster_img_tag = soup.find('img', class_='poster w-full')
        if poster_img_tag:
            poster_link = (movie_id, poster_img_tag.get('src'))
        else:
            print(f"Poster link not found in html: MovieId {movie_id}")
            self._poster_link_not_found.append((movie_id, tmdb_id))

        self._movieIds_already_checked.add(movie_id)

        return people, poster_link

    def _track_percentage_completed(self):
        """Print progress each time a new even percentage is reached."""
        if not self.num_links_to_scrape:
            # Nothing to scrape; avoid dividing by zero.
            return

        percent_completed = round(self._current_iteration / self.num_links_to_scrape * 100)
        if percent_completed % 2 == 0 and percent_completed > self.last_reported_percent:
            print(f"{percent_completed}% completed")
            self.last_reported_percent = percent_completed

    def aggregate_movie_casts_and_crews(self, links_df):
        """Scrape every movie in ``links_df`` and accumulate the results.

        Args:
            links_df: DataFrame with ``movieId`` and ``tmdbId`` columns.
                Movies already scraped successfully are skipped, so the
                method can be rerun to resume after a proxy refresh.
        """
        print("Beginning movie metadata data aggregation.")

        self.num_links_to_scrape = links_df.shape[0]
        # Progress is tracked per call so a resumed run does not exceed 100%.
        self._current_iteration = 0
        self.last_reported_percent = -1

        with requests.Session() as session:
            # Add a random user-agent header to make it look like the request is
            # coming from a browser to stop the request being blocked.
            session.headers.update({"User-Agent": random.choice(USER_AGENTS)})

            for _, movie_row in links_df.iterrows():
                if self._scraping_stopped:
                    print("Scraping stopped due to lack of proxies. ")
                    print("Please update the proxy list with the `TmdbWebScraper.update_proxy_list(proxy_list:List)` method.")
                    print("Then rerun `TmdbWebScraper.aggregate_movie_casts_and_crews()` to continue scraping.")
                    print("Exiting...")
                    # Do not report completion for an aborted run.
                    return

                movie_cast_and_crew, poster_link = self._scrape_movie_metadata_from_tmdb(movie_row, session)

                if movie_cast_and_crew:
                    self._movie_people.extend(movie_cast_and_crew)
                if poster_link:
                    self._poster_links.append(poster_link)

                self._current_iteration += 1
                self._track_percentage_completed()

        print("Data aggregation complete.")

    def rerun_aggregation_for_failed_requests(self):
        """Retry every movie currently recorded as a failed request.

        Each movie is retried once even if it was recorded several times.
        Movies that could not be attempted (for example because the proxies
        ran out mid-run) are kept in the failed list with their previous
        reason so they are not lost.
        """
        # Reset the counters
        self._current_iteration = 0
        self.last_reported_percent = -1

        failed_requests_df = self.get_failed_requests_df().drop_duplicates(
            subset='movieId', keep='last'
        )

        # Reset the list for tracking failed requests
        self._failed_requests = []
        self.aggregate_movie_casts_and_crews(failed_requests_df)

        # Re-queue rows that were neither scraped nor re-recorded as failed.
        recorded_again = {failure[0] for failure in self._failed_requests}
        for row in failed_requests_df.itertuples(index=False):
            if row.movieId in self._movieIds_already_checked or row.movieId in recorded_again:
                continue
            self._record_failure(row.movieId, row.tmdbId, row.statusCode)

    def get_movie_cast_and_crew_df(self):
        """Return scraped people as a DataFrame (movieId, name, role)."""
        return pd.DataFrame(self._movie_people, columns=['movieId', 'name', 'role'])

    def get_poster_links_df(self):
        """Return scraped poster links as a DataFrame (movieId, poster_link)."""
        return pd.DataFrame(self._poster_links, columns=['movieId', 'poster_link'])

    def get_failed_requests_df(self):
        """Return failed requests as a DataFrame (movieId, tmdbId, statusCode)."""
        return pd.DataFrame(self._failed_requests, columns=['movieId', 'tmdbId', 'statusCode'])

    def get_crew_not_found_df(self):
        """Return movies whose page had no crew list (movieId, tmdbId)."""
        return pd.DataFrame(self._crew_not_found, columns=['movieId', 'tmdbId'])

    def get_cast_not_found_df(self):
        """Return movies whose page had no cast list (movieId, tmdbId)."""
        return pd.DataFrame(self._cast_not_found, columns=['movieId', 'tmdbId'])

    def get_poster_link_not_found_df(self):
        """Return movies whose page had no poster image (movieId, tmdbId)."""
        return pd.DataFrame(self._poster_link_not_found, columns=['movieId', 'tmdbId'])

    def save_movie_cast_and_crew_data(self):
        """Write the cast & crew DataFrame to ``movie_dir_path`` as CSV."""
        output_path = os.path.join(self.movie_dir_path, self.movie_people_filename)
        self.get_movie_cast_and_crew_df().to_csv(output_path, index=False)
        print(f"Movie cast & crew saved to {output_path}")

    def save_poster_links_data(self):
        """Write the poster links DataFrame to ``movie_dir_path`` as CSV."""
        output_path = os.path.join(self.movie_dir_path, self.poster_links_filename)
        self.get_poster_links_df().to_csv(output_path, index=False)
        print(f"Poster links saved to {output_path}")

In [None]:
# Free proxy endpoints, written as bare IPs and prefixed with the scheme below.
proxy_list = [
    f"https://{ip_address}"
    for ip_address in (
        "8.219.97.248",
        "67.43.228.250",
        "13.245.75.139",
        "222.252.194.204",
        "51.210.54.186",
        "204.236.137.68",
        "63.35.64.177",
        "113.160.133.32",
        "3.90.100.12",
        "13.59.156.167",
        "54.152.3.36",
        "3.130.65.162",
        "52.67.10.183",
        "18.228.198.164",
        "54.233.119.172",
        "46.51.249.135",
        "3.123.150.192",
        "18.228.149.161",
        "3.124.133.93",
        "3.127.62.252",
        "35.79.120.242",
        "35.72.118.126",
        "3.126.147.182",
        "2.59.181.125",
        "3.136.29.104",
        "13.56.192.187",
    )
]

In [None]:
# Reload the link table so this cell can be run on its own.
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv',
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str},
    usecols=['movieId', 'imdbId', 'tmdbId'],
)

# Movies without a TMDB id cannot be looked up, so they are set aside and
# every row sharing one of their movieIds is excluded from the scrape.
movies_missing_tmdbIds = links_df.loc[links_df['tmdbId'].isna()]
links_df = links_df.loc[~links_df['movieId'].isin(movies_missing_tmdbIds['movieId'])]

# Run the scrape through the rotating proxies.
scraper = TmdbWebScraper(proxy_list)
scraper.aggregate_movie_casts_and_crews(links_df)

## Result: Failure - The website is identifying the proxies immediately and blocking the requests