In [2]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pprint import pprint as pp

# Scrape metadata

In [2]:
""" 
themoviedb.org turned out to be the best option. 
It has a list of Directors, Screenwriters, etc as well as well as a short list of top-billed actors, rather than an extensive list of every single actor in the movie.
"""

tmdbId = '862'

url = f"https://www.themoviedb.org/movie/{tmdbId}/"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="no-js" lang="de">
<head>
<title>Toy Story (1995) ‚Äî The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="Als Andys Lieblingspuppe hat Woody im Kinderzimmer das Sagen. Kaum ist der Junge nicht da, erwacht die Cowboy-Figur zum Leben, und mit ihm auch all das andere Spielzeug um ihn herum. Aufgeregt debattiert man √ºber Andys bevorstehenden Geburtstag. Es wird bef√ºrchtet, da√ü ihr Besitzer ein neues Geschenk bevorzugen wird. Und tats√§chlich: Der Neuank√∂mmling Buzz Lightyear, ein stolzer Space Ranger, avanciert zu Andys Favoriten.

## web-scraping from themoviedb.org
To improve our recommendations it would be beneficial to train a similarity model on the people associated with the movies. 
<br>
The themoviedb.org page for each movie contains information on the people involved in the production of the movies under tags such as "Screenplay", "Story", "Director" and "Characters", as well as a list of top-billed actors.
<br>
The tags are not consistent across all movies. For example, "Characters" usually has a value such as "Stan Lee", who created the characters but was not involved in producing the movies. Many movies do not have this tag at all.
<br>
Stan Lee is obviously a very relevant name to associate with a movie though, as people who like one Marvel movie will tend to like other Marvel movies.

It is may not be necessary to save the roles of the people. Clint Eastwood is both an actor and a director and his fans will probably be consistent across both roles. 
<br>
Then again, Jordan Peele was a comedy actor and the movies he has directed are most definitely not comedies (Get Out).
<br>
We are going to be using this data for Matrix Factorization which is a technique for reducing dimensionality and finding latent features. In layman's terms this means finding groups of similar things which often go together, such as Quentin Tarantino or Wes Anderson often using the same actors. 

Movies with the same actor will probably have a weaker connection than movies with the same director. Brad Pitt has been in two Tarantino movies but he was also Thelma & Louise. All Tarantino movies have a similar style but actors might star in movies from totally different genres.

So let's leave in the jobs (feature engineering). It will make our dataset bigger and sparser but its not a huge dataset anyway so we should be fine.



In [3]:
""" 
To figure out where the relevant information is in the big mess of html that is the soup, we inspect the actual webpage in the browser using DevTools, and find the elements we are after. 

For example, we can find John Lasseter's name and job in the html code below.

<li class="profile">
<p><a href="/person/7879-john-lasseter">John Lasseter</a></p>
<p class="character">Director, Story</p>
</li>

"""

creator_list = soup.find('ol', class_='people no_image')

if creator_list:
    profiles = creator_list.find_all('li', class_='profile')
    for person in profiles:
        name = person.find('a').text
        jobs = person.find('p', class_='character').text
        
        jobs = jobs.split(',') if ',' in jobs else [jobs]
        jobs = [r.strip() for r in jobs] 
    
        for job in jobs:
            print(f"{name} ({job})")
else:
    print(f"Creator list not found for movie {movieId}")

actor_list = soup.find('ol', class_='people scroller')

if actor_list:
    actors = actor_list.find_all('li', class_='card') 
    for actor in actors:
        name_tag = actor.find('img', alt=True)
        actor_name = name_tag['alt'] if name_tag else "Unknown"
        
        print(f"{actor_name} (Actor)")
else:
    print(f"Actor list not found for movie {movieId}")
    

John Lasseter (Director)
John Lasseter (Story)
Andrew Stanton (Screenplay)
Andrew Stanton (Story)
Alec Sokolow (Screenplay)
Joel Cohen (Screenplay)
Joss Whedon (Screenplay)
Pete Docter (Story)
Joe Ranft (Story)
Tom Hanks (Actor)
Tim Allen (Actor)
Don Rickles (Actor)
Jim Varney (Actor)
Wallace Shawn (Actor)
John Ratzenberger (Actor)
Annie Potts (Actor)
John Morris (Actor)
Erik von Detten (Actor)


In [4]:
# Locate the <img> tag with the class 'poster'
poster_img_tag = soup.find('img', class_='poster w-full')

# Extract the 'src' attribute, which contains the poster link
if poster_img_tag:
    poster_link = poster_img_tag.get('src')
    print(f"Poster Link: {poster_link}")
else:
    print("Poster not found.")

Poster Link: https://media.themoviedb.org/t/p/w300_and_h450_bestv2/om4fMx3e4xkx27sAsacoFP4WiEd.jpg


### Scrape the data

In [41]:
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import pandas.api.types as ptypes


class TmdbWebScraper:
    BASE_URL = "https://www.themoviedb.org/movie/"
    
    def __init__(self,
                 tmdb_data_path='../data/tmdb/',
                 movie_people_filename='movie_cast_and_crew.csv',
                 poster_links_filename='poster_links.csv',
                 max_retries=3):

        self.tmdb_data_path = tmdb_data_path
        self.movie_people_filename = movie_people_filename
        self.poster_links_filename = poster_links_filename
        self.max_retries = max_retries
        
        self._movie_people = []
        self._poster_links = []
        self._failed_requests = 0
        self._crew_not_found = []
        self._cast_not_found = []
        self._poster_link_not_found = []
        self._pages_not_found = []
        self._current_iteration = 0
        self._last_reported_percent = -1
        self._num_links_to_scrape = 0
        self._prior_movie_people_data_df = pd.DataFrame(columns=['movieId', 'name', 'role'])
        self._prior_poster_links_data_df = pd.DataFrame(columns=['movieId', 'poster_link'])

        self.driver = self._initialize_driver()

    def _initialize_driver(self):
        """Sets up a Selenium undetected Chrome driver."""
        options = uc.ChromeOptions()
        options.add_argument("--headless=new")  # Run in headless mode
        options.add_argument("--disable-blink-features=AutomationControlled")  # Bypass bot detection
        options.add_argument("--window-size=1920x1080")  # Standard window size
        options.add_argument("--no-sandbox")  # Run in environments without GUI
        options.add_argument("--disable-gpu")  # Avoid GPU rendering issues
        options.add_argument("--log-level=3")  # Reduce console logs

        try:
            return uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        except Exception as e:
            print(f"Error initializing Selenium: {e}")
            return None

    def _fetch_movie_data(self, movie_id, tmdb_id):
        """ 
        Apparently TMDB has a rate limit of ~40 requests per 10 seconds so we want
        to introduce a random time delay to make our scraper appear more human-like.
        We will start at 200ms and work from there.
        """
        url = f"{self.BASE_URL}{tmdb_id}/"
        
        for attempt in range(self.max_retries):
            try:
                print(f"Fetching: {url}")
                self.driver.get(url)  # Open the page in Selenium

                time.sleep(random.uniform(5, 10))  # Add delay to mimic human behavior
                
                html = self.driver.page_source
                
                # Check for CAPTCHAs or blocks
                if "captcha" in html.lower() or len(html) < 500:
                    wait_time = min(2 ** attempt, 10)  # Exponential backoff of max 10 sec
                    print(f"CAPTCHA detected for Movie {movie_id}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue

                if "<title>" in html:
                    print(f"Success: Page loaded for Movie {movie_id}")
                    
                else:
                    print(f"Warning: Page for Movie {movie_id} might not have loaded correctly.")
                
                return html
            
            except Exception as e:
                print(f"Error fetching Movie {movie_id}: {e}")
        
        print(f"Failed to fetch Movie {movie_id} after {self.max_retries} attempts.")
        return None
        
    def _scrape_movie_metadata_from_tmdb(self, movie_row):
        movie_id = movie_row.movieId
        tmdb_id = movie_row.tmdbId
        people = []
        poster_link = None
        
        html = self._fetch_movie_data(movie_id, tmdb_id)
        if not html:
            return [], None
            
        soup = BeautifulSoup(html, 'html.parser')

        title = soup.find('title')
        page_not_found = True if 'Page Not Found' in title.text else False
        if page_not_found:
            self._pages_not_found.append(movie_id)
            print(f"Page not found for movieId: {movie_id}, tmdbId: {tmdb_id}")
            return [], None
    
        crew_list = soup.find('ol', class_='people no_image')
        if crew_list:
            crew = crew_list.find_all('li', class_='profile')
            for person in crew:
                name = person.find('a').text
                jobs = person.find('p', class_='character').text
                jobs = [job.strip() for job in jobs.split(',')]
                for job in jobs:
                    people.append((movie_id, name, job))
            self._failed_requests = 0
        else:
            print(f"Crew not found in html: MovieId {movie_id}")
            self._crew_not_found.append((movie_id, tmdb_id))
            self._failed_requests += 1
    
        cast_list = soup.find('ol', class_='people scroller')
        if cast_list:
            cast = cast_list.find_all('li', class_='card')
            for actor in cast:
                name_tag = actor.find('img', alt=True)
                actor_name = name_tag['alt'] if name_tag else "Unknown"
                people.append((movie_id, actor_name, 'Actor'))
        else:
            print(f"Cast not found in html: MovieId {movie_id}")
            self._cast_not_found.append((movie_id, tmdb_id))
        
        poster_img_tag = soup.find('img', class_='poster w-full')
        if poster_img_tag:
            poster_link = (movie_id, poster_img_tag.get('src'))
        else:
            print(f"Poster link not found in html: MovieId {movie_id}")
            self._poster_link_not_found.append((movie_id, tmdb_id))
    
        return people, poster_link

    def _track_percentage_completed(self):
        percent_completed = round(self._current_iteration/self._num_links_to_scrape*100)
        if percent_completed % 2 == 0 and percent_completed > self._last_reported_percent:
            print(f"{percent_completed}% completed")
            self._last_reported_percent = percent_completed

    def _check_if_data_has_already_been_aggregated(self):  
        try:      
            self._prior_movie_people_data_df = pd.read_csv(self.tmdb_data_path+self.movie_people_filename)
        except FileNotFoundError:
            self._prior_movie_people_data_df = pd.DataFrame(columns=['movieId', 'name', 'role'])
    
        try:
            self._prior_poster_links_data_df = pd.read_csv(self.tmdb_data_path+self.poster_links_filename)
        except FileNotFoundError:
            self._prior_poster_links_data_df = pd.DataFrame(columns=['movieId', 'poster_link'])

    def _validate_ids_df(self, ids_df):
        required_columns = {"movieId", "tmdbId"}
    
        if not required_columns.issubset(ids_df.columns):
            missing_columns = required_columns - set(ids_df.columns)
            raise ValueError(f"Missing required columns: {missing_columns}")
    
        if not ptypes.is_integer_dtype(ids_df["movieId"]):
            raise TypeError("Column 'movieId' must contain integer values.")
    
        if not ptypes.is_string_dtype(ids_df["tmdbId"]):
            raise TypeError("Column 'tmdbId' must contain string values.")
    
    def _check_input_data(self, ids_to_aggregate_df, ids_df):
        self._validate_ids_df(ids_df)
        
        if ids_df.empty:
            """ Prevent possible division by zero error when calculating prior_percent_aggregated """
            print("The dataframe of IDs you supplied is empty.")
            
        elif ids_to_aggregate_df.empty:
            print("All of the data has already been aggregated.")
            
        else:
            prior_percent_aggregated = (self._prior_movie_people_data_df.movieId.nunique() / ids_df.shape[0]) * 100
            print(f"{prior_percent_aggregated:.2f}% of the data has already been aggregated")
    
    def aggregate_movie_metadata(self, ids_df):
    
        print("Beginning movie metadata data aggregation.")

        self._check_if_data_has_already_been_aggregated()

        ids_to_aggregate_df = ids_df[~ids_df.movieId.isin(self._prior_movie_people_data_df['movieId'])]

        self._check_input_data(ids_to_aggregate_df, ids_df)
        
        self._num_links_to_scrape = ids_to_aggregate_df.shape[0]

        for index, movie_row in ids_to_aggregate_df.iterrows():
            if self._failed_requests > 10:
                print("The website has begun blocking this IP address. Try again later.")
                break
                
            movie_cast_and_crew, poster_link = self._scrape_movie_metadata_from_tmdb(movie_row)

            if movie_cast_and_crew:
                self._movie_people.extend(movie_cast_and_crew)
            if poster_link:
                self._poster_links.append(poster_link)

            self._current_iteration += 1
            self._track_percentage_completed()
                
        print("Data aggregation complete.")

    def get_crew_not_found_df(self):
        return pd.DataFrame(self._crew_not_found, columns=['movieId', 'tmdbId'])
        
    def get_cast_not_found_df(self):
        return pd.DataFrame(self._cast_not_found, columns=['movieId', 'tmdbId'])

    def get_poster_link_not_found_df(self):
        return pd.DataFrame(self._poster_link_not_found, columns=['movieId', 'tmdbId'])
        
    def get_movie_cast_and_crew_df(self):
        new_data = pd.DataFrame(self._movie_people, columns=['movieId', 'name', 'role'])
        if self._prior_movie_people_data_df.empty:
            return new_data
        return pd.concat([self._prior_movie_people_data_df, new_data], ignore_index=True)
        
    def get_poster_links_df(self):
        new_data = pd.DataFrame(self._poster_links, columns=['movieId', 'poster_link'])
        if self._prior_poster_links_data_df.empty:
            return new_data
        return pd.concat([self._prior_poster_links_data_df, new_data], ignore_index=True)
        
    def save_movie_cast_and_crew_data(self):
        self.get_movie_cast_and_crew_df().to_csv(self.tmdb_data_path+self.movie_people_filename, index=False)
        print(f"Movie cast & crew saved to {self.tmdb_data_path}{self.movie_people_filename}")

    def save_poster_links_data(self):
        self.get_poster_links_df().to_csv(self.tmdb_data_path+self.poster_links_filename, index=False)
        print(f"Poster links saved to {self.tmdb_data_path}{self.poster_links_filename}")

    def close(self):
        """Closes Selenium driver."""
        self.driver.quit()
        print("Selenium session closed.")

    def __del__(self):
        """Ensure Selenium driver closes when object is deleted."""
        self.close()


In [11]:
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv', 
    usecols=['movieId', 'imdbId', 'tmdbId'],
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str}
)
movies_missing_tmdbIds = links_df[links_df.tmdbId.isna()]
links_df = links_df[~links_df.movieId.isin(movies_missing_tmdbIds.movieId)]
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [42]:
scraper = TmdbWebScraper()
print("Scraper object instantiated.")

Selenium session closed.
Scraper object instantiated.


In [13]:
scraper.aggregate_movie_metadata(links_df)

Beginning movie metadata data aggregation.
91.03% of the data has already been aggregated
Fetching: https://www.themoviedb.org/movie/12773/
Success: Page loaded for Movie 4207
Page not found for movieId: 4207, tmdbId: 12773
0% completed
Fetching: https://www.themoviedb.org/movie/17882/
Success: Page loaded for Movie 4568
Page not found for movieId: 4568, tmdbId: 17882
Fetching: https://www.themoviedb.org/movie/68149/
Success: Page loaded for Movie 5069
Page not found for movieId: 5069, tmdbId: 68149
Fetching: https://www.themoviedb.org/movie/24549/
Success: Page loaded for Movie 5209
Page not found for movieId: 5209, tmdbId: 24549
Fetching: https://www.themoviedb.org/movie/14980/
Success: Page loaded for Movie 7646
Page not found for movieId: 7646, tmdbId: 14980
Fetching: https://www.themoviedb.org/movie/164721/
Success: Page loaded for Movie 7669
Page not found for movieId: 7669, tmdbId: 164721
Fetching: https://www.themoviedb.org/movie/140207/
Success: Page loaded for Movie 7762
Page

In [44]:
scraper.save_movie_cast_and_crew_data()

Movie cast & crew saved to ../data/tmdb/movie_cast_and_crew.csv


In [45]:
scraper.save_poster_links_data()

Poster links saved to ../data/tmdb/poster_links.csv


In [30]:
cast_and_crew_df = scraper.get_movie_cast_and_crew_df()
cast_and_crew_df.head()

Unnamed: 0,movieId,name,role
0,1,John Lasseter,Director
1,1,John Lasseter,Story
2,1,Andrew Stanton,Screenplay
3,1,Andrew Stanton,Story
4,1,Joel Cohen,Screenplay


In [31]:
cast_and_crew_df.isna().sum()

movieId    0
name       0
role       0
dtype: int64

In [32]:
cast_and_crew_df.shape

(116302, 3)

In [33]:
cast_and_crew_df.movieId.nunique()

9625

In [28]:
''' Let's check some of the movieIds where we got the 'Page not found' response '''
cast_and_crew_df[cast_and_crew_df.movieId==4568]

Unnamed: 0,movieId,name,role


In [34]:
poster_links_df = scraper.get_poster_links_df()
poster_links_df.head()

Unnamed: 0,movieId,poster_link
0,1,https://media.themoviedb.org/t/p/w300_and_h450...
1,2,https://media.themoviedb.org/t/p/w300_and_h450...
2,3,https://media.themoviedb.org/t/p/w300_and_h450...
3,4,https://media.themoviedb.org/t/p/w300_and_h450...
4,5,https://media.themoviedb.org/t/p/w300_and_h450...


In [35]:
poster_links_df.shape

(9621, 2)

In [36]:
poster_links_df.isna().sum()

movieId        0
poster_link    0
dtype: int64

In [37]:
poster_links_df.nunique()

movieId        9621
poster_link    9620
dtype: int64

In [51]:
scraper.close()

Selenium session closed.


<div class="alert alert-info">
    <strong>
        I checked the tmdbIds which failed to return results on themoviedb.org and the requests are failing because their IDs have changed. 
    </strong>
    <br>
    <strong>
        There is nothing we can do about this. We will have to remove them from the data.
    </strong>
    
</div>

# Check the saved data

In [74]:
cast_and_crew_df = pd.read_csv('../data/tmdb/movie_cast_and_crew.csv')
print("cast_and_crew_df.isna().sum():\n", cast_and_crew_df.isna().sum())
print("\ncast_and_crew_df.shape: ", cast_and_crew_df.shape)
print("\ncast_and_crew_df.movieId.nunique(): ", cast_and_crew_df.movieId.nunique())

cast_and_crew_df.head()

cast_and_crew_df.isna().sum():
 movieId    0
name       0
role       0
dtype: int64

cast_and_crew_df.shape:  (116302, 3)

cast_and_crew_df.movieId.nunique():  9625


Unnamed: 0,movieId,name,role
0,1,John Lasseter,Director
1,1,John Lasseter,Story
2,1,Andrew Stanton,Screenplay
3,1,Andrew Stanton,Story
4,1,Joel Cohen,Screenplay


In [76]:
# Check for empty strings
empty_strings_df = cast_and_crew_df[(cast_and_crew_df['name'].str.strip() == '') | (cast_and_crew_df['role'].str.strip() == '')]
empty_strings_df

Unnamed: 0,movieId,name,role


In [3]:
poster_links_df = pd.read_csv('../data/tmdb/poster_links.csv')
print("poster_links_df.isna().sum():\n", poster_links_df.isna().sum())
print("\nposter_links_df.shape: ", poster_links_df.shape)
print("\nposter_links_df.movieId.nunique(): ", poster_links_df.movieId.nunique())

poster_links_df.head()

poster_links_df.isna().sum():
 movieId        0
poster_link    0
dtype: int64

poster_links_df.shape:  (9621, 2)

poster_links_df.movieId.nunique():  9621


Unnamed: 0,movieId,poster_link
0,1,https://media.themoviedb.org/t/p/w300_and_h450...
1,2,https://media.themoviedb.org/t/p/w300_and_h450...
2,3,https://media.themoviedb.org/t/p/w300_and_h450...
3,4,https://media.themoviedb.org/t/p/w300_and_h450...
4,5,https://media.themoviedb.org/t/p/w300_and_h450...


In [7]:
# Check for empty strings
empty_strings_df = poster_links_df[poster_links_df['poster_link'].str.strip() == '']
empty_strings_df

Unnamed: 0,movieId,poster_link


In [10]:
# Check for bad links
bad_links_df = poster_links_df[~poster_links_df['poster_link'].str.contains('https')]
bad_links_df

Unnamed: 0,movieId,poster_link


In [67]:
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
print("movies_df.isna().sum():\n", movies_df.isna().sum())
print("\nmovies_df.shape: ", movies_df.shape)
print("\nmovies_df.movieId.nunique(): ", movies_df.movieId.nunique())

movies_df.head()

movies_df.isna().sum():
 movieId    0
title      0
genres     0
dtype: int64

movies_df.shape:  (9734, 3)

movies_df.movieId.nunique():  9734


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Compare poster links with cast & crew

In [68]:
poster_movie_ids = set(poster_links_df.movieId.unique())
crew_movie_ids = set(cast_and_crew_df.movieId.unique())

# Find movieIds missing in each DataFrame
crew_ids_missing_in_posters = crew_movie_ids - poster_movie_ids
poster_ids_missing_in_crew = poster_movie_ids - crew_movie_ids

crew_ids_missing_in_posters_df = pd.DataFrame({'movieId': list(crew_ids_missing_in_posters)})
poster_ids_missing_in_crew_df = pd.DataFrame({'movieId': list(poster_ids_missing_in_crew)})

print("Movies in cast_and_crew_df but missing in poster_links_df:")
print(crew_ids_missing_in_posters_df)

print("\nMovies in poster_links_df but missing in cast_and_crew_df:")
print(poster_ids_missing_in_crew_df)

Movies in cast_and_crew_df but missing in poster_links_df:
   movieId
0   152711
1   178129
2   193579
3   180777

Movies in poster_links_df but missing in cast_and_crew_df:
Empty DataFrame
Columns: [movieId]
Index: []


### Movies missing posters

In [69]:
movies_df[movies_df.movieId.isin(crew_ids_missing_in_posters_df.movieId)]

Unnamed: 0,movieId,title,genres
9226,152711,Who Killed Chea Vichea? (2010),Documentary
9617,178129,Adventures in Plymptoons! (2011),Documentary
9647,180777,Die Frauen von Ravensbr√ºck (2005),Documentary
9728,193579,Jon Stewart Has Left the Building (2015),Documentary


## Compare poster links with movielens data

In [70]:
poster_movie_ids = set(poster_links_df.movieId.unique())
movies = set(movies_df.movieId.unique())

# Find movieIds missing in each DataFrame
movie_ids_missing_in_posters = movies - poster_movie_ids
poster_ids_missing_in_movies = poster_movie_ids - movies

movie_ids_missing_in_posters_df = pd.DataFrame({'movieId': list(movie_ids_missing_in_posters)})
poster_ids_missing_in_movies_df = pd.DataFrame({'movieId': list(poster_ids_missing_in_movies)})

print("Movies in movies_df but missing in poster_links_df:")
print(movie_ids_missing_in_posters_df)

print("\nMovies in poster_links_df but missing in movies_df:")
print(poster_ids_missing_in_movies_df)

Movies in movies_df but missing in poster_links_df:
     movieId
0     171011
1     150548
2      90647
3      26649
4     180263
..       ...
108    66544
109     7669
110    26614
111   141816
112    62970

[113 rows x 1 columns]

Movies in poster_links_df but missing in movies_df:
Empty DataFrame
Columns: [movieId]
Index: []


### Movies missing metadata

In [71]:
movies_df[movies_df.movieId.isin(movie_ids_missing_in_posters_df.movieId)]

Unnamed: 0,movieId,title,genres
3123,4207,Navy Seals (1990),Action|Adventure|War
3358,4568,Best of the Best (1989),Action
3676,5069,Escaflowne: The Movie (Escaflowne) (2000),Action|Adventure|Animation|Drama|Fantasy
3737,5209,Ffolkes (1979),Action|Adventure|Thriller
4977,7646,Rose Red (2002),Horror|Mystery|Thriller
...,...,...,...
9643,180263,The Shining (1997),Drama|Horror|Thriller
9647,180777,Die Frauen von Ravensbr√ºck (2005),Documentary
9682,184257,Making a Murderer (2015),Crime|Documentary
9694,185135,Sherlock - A Study in Pink (2010),Crime


In [73]:
print(f"Percentage of movielens movies missing metadata: {movie_ids_missing_in_posters_df.shape[0]/movies_df.shape[0]*100:.2f}")

Percentage of movielens movies missing metadata: 1.16


In [23]:
def test_selenium():
    try:
        print("Initializing Selenium driver...")
        options = uc.ChromeOptions()
        options.add_argument("--headless=new")
        driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        print("Selenium driver initialized successfully!")
        driver.quit()
    except Exception as e:
        print(f"Selenium Error: {e}")

test_selenium()

Initializing Selenium driver...
Selenium driver initialized successfully!
