In [2]:
import os
import time
import zipfile
from pprint import pprint as pp

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Data import

In [3]:
# Load only the three id columns from links.csv. movieId is read as an
# integer; the IMDb and TMDB ids are read as strings.
links_csv_path = '../data/ml-latest-small/links.csv'
links_dtypes = {'movieId': int, 'imdbId': str, 'tmdbId': str}

links_df = pd.read_csv(links_csv_path, usecols=list(links_dtypes), dtype=links_dtypes)
links_df.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862


# Scrape metadata
## Check links
Check the different websites we have movie IDs for

### IMDB

In [8]:
""" IMDB doesn't allow webscraping. """

# IMDb title id used for this single probe request.
imdbid = '0114709'

# Plain GET against the IMDb title page: no custom headers, no timeout.
url = f"https://www.imdb.com/title/tt{imdbid}/"
response = requests.get(url)

# Parse whatever body came back (the status code is not checked here);
# `soup_imdb` is inspected in the next cell.
soup_imdb = BeautifulSoup(response.content, 'html.parser')

In [9]:
# NOTE(review): `prettify` is referenced but never called, so this prints the
# bound-method repr (which happens to embed the parsed document) rather than
# indented markup. `print(soup_imdb.prettify())` would pretty-print the page.
print(soup_imdb.prettify)

<bound method Tag.prettify of <html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
</body>
</html>
>


<div class="alert alert-danger">
    <strong>IMDB doesn't allow web-scraping</strong>
</div>

### movielens.org

In [10]:
# MovieLens id used for this probe; `movieId` stays defined as a notebook
# global and is read again by later cells.
movieId = 1
url = f"https://movielens.org/movies/{str(movieId)}/"
# Plain GET with no timeout and no status check.
response = requests.get(url)

soup_movie_lens = BeautifulSoup(response.content, 'html.parser')
# NOTE(review): `prettify` is referenced but never called, so this prints the
# bound-method repr instead of indented markup; call `.prettify()` to format.
print(soup_movie_lens.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" xmlns="http://www.w3.org/1999/html"> <!--<![endif]-->
<head>
<base href="/"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title>MovieLens</title>
<meta content="" name="description"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!-- do not cache this page, please, to make upgrades easier -->
<meta content="no-store, must-revalidate" http-equiv="Cache-control"/>
<!-- favicon. thanks to realfavicongenerator.net -->
<link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
<link href="/apple-touch-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>
<link href="/apple-touch-icon-72x72.png" rel="a

<div class="alert alert-info">
    <strong>This works but the information on the page is not actually as useful as that shown on themoviedb.org</strong>
</div>

### themoviedb.org

In [4]:
# Page not found for movieId: 26887, tmdbId: 152426
# NOTE(review): the comment above names tmdbId 152426, but the request below
# uses '12773' - confirm which id this cell is meant to probe.

tmdbId = '12773'

# Plain GET with no timeout and no status check.
url = f"https://www.themoviedb.org/movie/{tmdbId}/"
response = requests.get(url)

# `soup` is the parsed page that the following cells search for the title,
# cast/crew lists and poster image.
soup = BeautifulSoup(response.content, 'html.parser')
# NOTE(review): `prettify` is referenced but never called, so this prints the
# bound-method repr instead of indented markup; call `.prettify()` to format.
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="no-js" lang="de">
<head>
<title>Page Not Found — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b8

In [9]:
# The page counts as "not found" when its <title> text contains the marker
# phrase 'Page Not Found'.
title = soup.find('title')
page_not_found = 'Page Not Found' in title.text
if page_not_found:
    print('true')

true


<div class="alert alert-success">
    <strong>
        themoviedb.org turned out to be the best option. 
    </strong>
    <br>
    It has a list of Directors, Screenwriters, etc. as well as a short list of top-billed actors, rather than an extensive list of every single actor in the movie.
    
</div>



## web-scraping from themoviedb.org

To improve our recommendations it would be beneficial to train a similarity model on the people associated with the movies.
The themoviedb.org page for each movie contains information on the people involved in the production of the movies under tags such as "Screenplay", "Story", "Director" and "Characters", as well as a list of top-billed actors.
The tags are not consistent across all movies. For example, "Characters" usually has a value such as "Stan Lee", who created the characters but was not involved in producing the movies. Many movies do not have this tag at all.
Stan Lee is obviously a very relevant name to associate with a movie though, as people who like one Marvel movie will tend to like other Marvel movies.

It may not be necessary to save the roles of the people. Clint Eastwood is both an actor and a director and his fans will probably be consistent across both roles.
Then again, Jordan Peele was a comedy actor and the movies he has directed are most definitely not comedies (Get Out).
We are going to be using this data for Matrix Factorization which is a technique for reducing dimensionality and finding latent features. In layman's terms this means finding groups of similar things which often go together, such as Quentin Tarantino or Wes Anderson often using the same actors.

Movies with the same actor will probably have a weaker connection than movies with the same director. Brad Pitt has been in two Tarantino movies but he was also in Thelma & Louise. All Tarantino movies have a similar style but actors might star in movies from totally different genres.

So let's leave in the jobs (feature engineering). It will make our dataset bigger and sparser but it's not a huge dataset anyway so we should be fine.

### Find cast & crew in HTML

In [13]:
""" 
To figure out where the relevant information is in the big mess of html that is the soup, we inspect the actual webpage in the browser using DevTools, and find the elements we are after. 

For example, we can find John Lasseter's name and job in the html code below.

<li class="profile">
<p><a href="/person/7879-john-lasseter">John Lasseter</a></p>
<p class="character">Director, Story</p>
</li>

"""

# Crew: an <ol class="people no_image"> holding one <li class="profile"> per
# person, with the name in an <a> and the comma-separated jobs in
# <p class="character">.
creator_list = soup.find('ol', class_='people no_image')

if creator_list:
    profiles = creator_list.find_all('li', class_='profile')
    for person in profiles:
        link_tag = person.find('a')
        jobs_tag = person.find('p', class_='character')
        if link_tag is None or jobs_tag is None:
            # Skip entries missing the name link or the job paragraph
            # instead of failing on `None.text`.
            continue

        name = link_tag.text
        # str.split returns a one-element list when there is no comma, so no
        # separate single-job branch is needed.
        jobs = [job.strip() for job in jobs_tag.text.split(',')]

        for job in jobs:
            print(f"{name} ({job})")
else:
    # Report the id `soup` was fetched for; `movieId` belongs to the
    # movielens.org cell and is unrelated to this page.
    print(f"Creator list not found for tmdbId {tmdbId}")

# Cast: an <ol class="people scroller"> of <li class="card"> items; the
# actor's name is read from the `alt` text of the card image.
actor_list = soup.find('ol', class_='people scroller')

if actor_list:
    actors = actor_list.find_all('li', class_='card')
    for actor in actors:
        name_tag = actor.find('img', alt=True)
        actor_name = name_tag['alt'] if name_tag else "Unknown"

        print(f"{actor_name} (Actor)")
else:
    print(f"Actor list not found for tmdbId {tmdbId}")

John Lasseter (Director)
John Lasseter (Story)
Andrew Stanton (Screenplay)
Andrew Stanton (Story)
Joss Whedon (Screenplay)
Joel Cohen (Screenplay)
Alec Sokolow (Screenplay)
Pete Docter (Story)
Joe Ranft (Story)
Tom Hanks (Actor)
Tim Allen (Actor)
Don Rickles (Actor)
Jim Varney (Actor)
Wallace Shawn (Actor)
John Ratzenberger (Actor)
Annie Potts (Actor)
John Morris (Actor)
Erik von Detten (Actor)


### Find poster link in HTML

In [14]:
# Find the poster <img> element by its CSS classes.
poster_img_tag = soup.find('img', class_='poster w-full')

# Report the image URL stored in `src`, or say so when no poster element exists.
if not poster_img_tag:
    print("Poster not found.")
else:
    poster_link = poster_img_tag.get('src')
    print(f"Poster Link: {poster_link}")

Poster Link: https://media.themoviedb.org/t/p/w300_and_h450_bestv2/om4fMx3e4xkx27sAsacoFP4WiEd.jpg


### Scrape the data

In [None]:
class tmdbWebScraper:
    """Unfinished placeholder for a class-based themoviedb.org scraper.

    The class holds no state and has no behaviour yet.
    """

    def __init__(self):
        # TODO: decide what state belongs here. The original body was a
        # dangling `self.`, a SyntaxError that stopped the whole cell (and the
        # functions defined after the class) from running.
        pass

def _scrape_movie_metadata_from_tmdb(movie_row):
    movie_id = movie_row.movieId
    tmdb_id = movie_row.tmdbId
    
    url = f"https://www.themoviedb.org/movie/{tmdb_id}/"
    response = requests.get(url)    
    soup = BeautifulSoup(response.content, 'html.parser')

    people = []

    crew_list = soup.find('ol', class_='people no_image')
    if crew_list:
        crew = crew_list.find_all('li', class_='profile')
        for person in crew:
            name = person.find('a').text
            jobs = person.find('p', class_='character').text
            jobs = [job.strip() for job in jobs.split(',')]
            for job in jobs:
                people.append((movie_id, name, job))

    cast_list = soup.find('ol', class_='people scroller')
    if cast_list:
        cast = cast_list.find_all('li', class_='card')
        for actor in cast:
            name_tag = actor.find('img', alt=True)
            actor_name = name_tag['alt'] if name_tag else "Unknown"
            people.append((movie_id, actor_name, 'Actor'))
    
    poster_img_tag = soup.find('img', class_='poster w-full')
    if poster_img_tag:
        poster_link = (movie_id, poster_img_tag.get('src'))
    else:
        print(f"Poster not found for movieId: {movie_id}.")
        poster_link = None

    return people, poster_link

def aggregate_movie_cast_and_crew(links_df, scraper=None, request_delay=0.0):
    """Scrape cast, crew and poster links for every row of ``links_df``.

    Parameters
    ----------
    links_df : pandas.DataFrame
        One row per movie; each row is passed unchanged to ``scraper``.
    scraper : callable, optional
        Called with one row; must return ``(people, poster_link)`` where
        ``people`` is a list of ``(movieId, name, role)`` tuples and
        ``poster_link`` is ``(movieId, url)`` or ``None``. Defaults to
        ``_scrape_movie_metadata_from_tmdb``.
    request_delay : float, optional
        Seconds to sleep between consecutive rows. The default of 0 keeps the
        original unthrottled behaviour; a positive value spaces out requests
        (the unthrottled run got the IP blocked after roughly 300 requests).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(movie_people_df, poster_links_df)`` with columns
        ``['movieId', 'name', 'role']`` and ``['movieId', 'poster_link']``.
    """
    if scraper is None:
        scraper = _scrape_movie_metadata_from_tmdb

    print("Beginning movie cast & crew data aggregation.")

    num_links = links_df.shape[0]
    progress_step = 5  # report roughly every 5 percent
    next_milestone = progress_step

    movie_people = []
    poster_links = []

    for num_done, (_, movie_row) in enumerate(links_df.iterrows(), start=1):

        movie_cast_and_crew, poster_link = scraper(movie_row)

        movie_people.extend(movie_cast_and_crew)
        if poster_link:
            poster_links.append(poster_link)

        # Integer arithmetic with a moving threshold: an exact float test
        # (`percent % 5 == 0`) only fires when the percentage lands exactly on
        # a multiple of 5. Counting completed rows also lets 100% be reported.
        percent_completed = num_done * 100 // num_links
        if percent_completed >= next_milestone:
            print(f"{percent_completed}% completed")
            next_milestone = (percent_completed // progress_step + 1) * progress_step

        # Throttle between requests, but not after the last one.
        if request_delay > 0 and num_done < num_links:
            time.sleep(request_delay)

    print("Data aggregation complete.")

    movie_people_df = pd.DataFrame(movie_people, columns=['movieId', 'name', 'role'])
    poster_links_df = pd.DataFrame(poster_links, columns=['movieId', 'poster_link'])

    return movie_people_df, poster_links_df

def save_aggregated_data(df, movie_dir_path, filename):
    """Write ``df`` as a CSV file named ``filename`` inside ``movie_dir_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save; the index is not written.
    movie_dir_path : str or os.PathLike
        Target directory, with or without a trailing separator.
    filename : str
        Name of the CSV file to create in that directory.
    """
    # os.path.join inserts the separator when needed; plain concatenation wrote
    # to "<dir><filename>" whenever the directory lacked a trailing slash.
    output_path = os.path.join(movie_dir_path, filename)
    df.to_csv(output_path, index=False)
    print(f"Data saved to {movie_dir_path} dir")

In [None]:
# Directory that receives both output CSV files.
movie_dir_path = '../data/ml-latest-small/'

# Scrape every movie in links_df, then write each resulting frame to its file.
movie_people_df, poster_links_df = aggregate_movie_cast_and_crew(links_df)

for frame, csv_name in (
    (movie_people_df, 'movie_cast_and_crew.csv'),
    (poster_links_df, 'poster_links.csv'),
):
    save_aggregated_data(frame, movie_dir_path, filename=csv_name)

## Result: Failure - The IP gets blocked after ~300 requests