In [1]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pprint import pprint as pp

# Import data from MovieLens

In [2]:
url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Create the 'data' directory if it doesn't exist
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

# Save the ZIP file inside the 'data' directory
zip_path = os.path.join(data_dir, 'ml-latest-small.zip')

# Get the data
response = requests.get(url, stream=True)

# Save the data
with open(zip_path, 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            file.write(chunk)

# Extract the ZIP file within the 'data' directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # zip_ref.extractall()
    for member in zip_ref.namelist():
        target_path = os.path.join(data_dir, member)
        zip_ref.extract(member, path=data_dir)

print(f"Dataset downloaded and extracted to: {data_dir}")

Dataset downloaded and extracted to: ../data


## Check the links data

In [3]:
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv', 
    usecols=['movieId', 'imdbId', 'tmdbId'],
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str}
)
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,0114709,862
1,2,0113497,8844
2,3,0113228,15602
3,4,0114885,31357
4,5,0113041,11862
...,...,...,...
9737,193581,5476944,432131
9738,193583,5914996,445030
9739,193585,6397426,479308
9740,193587,8391976,483455


In [4]:
"""
When I initially loaded the imdbId values as ints, some had only 6 digits and were missing
a preceding 0 which caused them to no correspond to the correct IMDB url. When loaded as 
strings, they appear to have the preceding 0s but we need to check this.
"""

# Check for imdbIds of the wrong length.
links_df[links_df.imdbId.apply(lambda x:len(x)!=7)]

Unnamed: 0,movieId,imdbId,tmdbId


In [5]:
# Check for NaN vals
print(links_df.isna().sum()) 

movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [6]:
''' 
We need to make sure these movieIds are filtered out of the other datasets because we are training
the Matrix Factorization model on the tmdb data (see web-scraping section below) and we need all the models to be working on the same data.
'''

links_df[links_df.isna().tmdbId==True]

Unnamed: 0,movieId,imdbId,tmdbId
624,791,113610,
843,1107,102336,
2141,2851,81454,
3027,4051,56600,
5532,26587,92337,
5854,32600,377059,
6059,40697,105946,
7382,79299,874957,


## Filter out movies with missing tmdbIds

### movies.csv

In [7]:
movies_missing_tmdbIds = links_df[links_df.isna().tmdbId==True]

movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies_df.shape

(9742, 3)

In [9]:
""" These don't look like popular movies anyway. """
movies_df[movies_df.index.isin(movies_missing_tmdbIds.index)]

Unnamed: 0,movieId,title,genres
624,791,"Last Klezmer: Leopold Kozlowski, His Life and ...",Documentary
843,1107,Loser (1991),Comedy
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
3027,4051,Horrors of Spider Island (Ein Toter Hing im Ne...,Horror|Sci-Fi
5532,26587,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance
5854,32600,Eros (2004),Drama
6059,40697,Babylon 5,Sci-Fi
7382,79299,"No. 1 Ladies' Detective Agency, The (2008)",Comedy|Crime|Mystery


In [10]:
""" The eight movies have been removed. """
movies_df = movies_df[~movies_df.index.isin(movies_missing_tmdbIds.index)]
movies_df.shape

(9734, 3)

### ratings.csv

In [11]:
ratings_df = pd.read_csv('../data/ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
ratings_df.shape

(100836, 4)

In [13]:
"""
Let's just double check how much data we're dropping.
-> No big deal. These 8 movies have only been rated once or twice anyway.
"""
ratings_df[ratings_df.movieId.isin(movies_missing_tmdbIds.movieId)]

Unnamed: 0,userId,movieId,rating,timestamp
282,3,2851,5.0,1306463925
14095,90,791,4.0,856354451
16539,105,26587,5.0,1446573090
23344,160,1107,3.5,1077749409
30090,210,40697,4.0,1527266191
31391,217,2851,3.0,955942393
42665,288,2851,2.0,1020369199
44588,298,1107,0.5,1479065495
69291,448,4051,0.5,1263237066
72810,469,2851,3.0,965335989


In [14]:
""" Remove the movies. """
ratings_df = ratings_df[~ratings_df.movieId.isin(movies_missing_tmdbIds.movieId)]
ratings_df.shape

(100823, 4)

### tags.csv

In [15]:
tags_df = pd.read_csv('../data/ml-latest-small/tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [16]:
tags_df.shape

(3683, 4)

In [17]:
""" Remove the movies. """
tags_df = tags_df[~tags_df.movieId.isin(movies_missing_tmdbIds.movieId)]
tags_df.shape

(3683, 4)

### links.csv

In [18]:
print(f"Num links before: {links_df.shape[0]}")
links_df = links_df[~links_df.movieId.isin(movies_missing_tmdbIds.movieId)]
print(f"Num links after: {links_df.shape[0]}")

Num links before: 9742
Num links after: 9734


## Overwrite the CSVs

In [19]:
movies_df.to_csv('../data/ml-latest-small/movies.csv', index=False)
ratings_df.to_csv('../data/ml-latest-small/ratings.csv', index=False)
tags_df.to_csv('../data/ml-latest-small/tags.csv', index=False)
links_df.to_csv('../data/ml-latest-small/links.csv', index=False)

## Reload the data and check the size

In [20]:
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags_df = pd.read_csv('../data/ml-latest-small/tags.csv')
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv', 
    usecols=['movieId', 'imdbId', 'tmdbId'],
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str}
)

print(f"movies_df shape: {movies_df.shape}")
print(f"ratings_df shape: {ratings_df.shape}")
print(f"tags_df shape: {tags_df.shape}")
print(f"links_df shape: {links_df.shape}")

links_df.head()

movies_df shape: (9734, 3)
ratings_df shape: (100823, 4)
tags_df shape: (3683, 4)
links_df shape: (9734, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


# Scrape metadata

## Check links

In [24]:
""" IMDB doesn't allow webscraping. """

imdbid = '0114709'

url = f"https://www.imdb.com/title/tt{imdbid}/"
response = requests.get(url)

soup_imdb = BeautifulSoup(response.content, 'html.parser')

In [61]:
print(soup_imdb.prettify)

<bound method Tag.prettify of <html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
</body>
</html>
>


In [62]:
movieId = 1
url = f"https://movielens.org/movies/{str(movieId)}/"
response = requests.get(url)

soup_movie_lens = BeautifulSoup(response.content, 'html.parser')
print(soup_movie_lens.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" xmlns="http://www.w3.org/1999/html"> <!--<![endif]-->
<head>
<base href="/"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title>MovieLens</title>
<meta content="" name="description"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!-- do not cache this page, please, to make upgrades easier -->
<meta content="no-store, must-revalidate" http-equiv="Cache-control"/>
<!-- favicon. thanks to realfavicongenerator.net -->
<link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
<link href="/apple-touch-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>
<link href="/apple-touch-icon-72x72.png" rel="a

In [6]:
""" 
themoviedb.org turned out to be the best option. 
It has a list of Directors, Screenwriters, etc as well as well as a short list of top-billed actors, rather than an extensive list of every single actor in the movie.
"""

tmdbId = '862'

url = f"https://www.themoviedb.org/movie/{tmdbId}/"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="no-js" lang="de">
<head>
<title>Toy Story (1995) — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="Als Andys Lieblingspuppe hat Woody im Kinderzimmer das Sagen. Kaum ist der Junge nicht da, erwacht die Cowboy-Figur zum Leben, und mit ihm auch all das andere Spielzeug um ihn herum. Aufgeregt debattiert man über Andys bevorstehenden Geburtstag. Es wird befürchtet, daß ihr Besitzer ein neues Geschenk bevorzugen wird. Und tatsächlich: Der Neuankömmling Buzz Lightyear, ein stolzer Space Ranger, avanciert zu Andys Favoriten. Diese 

## web-scraping from themoviedb.org
To improve our recommendations it would be beneficial to train a similarity model on the people associated with the movies. 
<br>
The themoviedb.org page for each movie contains information on the people involved in the production of the movies under tags such as "Screenplay", "Story", "Director" and "Characters", as well as a list of top-billed actors.
<br>
The tags are not consistent across all movies. For example, "Characters" usually has a value such as "Stan Lee", who created the characters but was not involved in producing the movies. Many movies do not have this tag at all.
<br>
Stan Lee is obviously a very relevant name to associate with a movie though, as people who like one Marvel movie will tend to like other Marvel movies.

It is may not be necessary to save the roles of the people. Clint Eastwood is both an actor and a director and his fans will probably be consistent across both roles. 
<br>
Then again, Jordan Peele was a comedy actor and the movies he has directed are most definitely not comedies (Get Out).
<br>
We are going to be using this data for Matrix Factorization which is a technique for reducing dimensionality and finding latent features. In layman's terms this means finding groups of similar things which often go together, such as Quentin Tarantino or Wes Anderson often using the same actors. 

Movies with the same actor will probably have a weaker connection than movies with the same director. Brad Pitt has been in two Tarantino movies but he was also Thelma & Louise. All Tarantino movies have a similar style but actors might star in movies from totally different genres.

So let's leave in the jobs (feature engineering). It will make our dataset bigger and sparser but its not a huge dataset anyway so we should be fine.



In [23]:
""" 
To figure out where the relevant information is in the big mess of html that is the soup, we inspect the actual webpage in the browser using DevTools, and find the elements we are after. 

For example, we can find John Lasseter's name and job in the html code below.

<li class="profile">
<p><a href="/person/7879-john-lasseter">John Lasseter</a></p>
<p class="character">Director, Story</p>
</li>

"""

creator_list = soup.find('ol', class_='people no_image')

if creator_list:
    profiles = creator_list.find_all('li', class_='profile')
    for person in profiles:
        name = person.find('a').text
        jobs = person.find('p', class_='character').text
        
        jobs = jobs.split(',') if ',' in jobs else [jobs]
        jobs = [r.strip() for r in jobs] 
    
        for job in jobs:
            print(f"{name} ({job})")
else:
    print(f"Creator list not found for movie {movieId}")

actor_list = soup.find('ol', class_='people scroller')

if actor_list:
    actors = actor_list.find_all('li', class_='card') 
    for actor in actors:
        name_tag = actor.find('img', alt=True)
        actor_name = name_tag['alt'] if name_tag else "Unknown"
        
        print(f"{actor_name} (Actor)")
else:
    print(f"Actor list not found for movie {movieId}")
    

John Lasseter (Director)
John Lasseter (Story)
Andrew Stanton (Screenplay)
Andrew Stanton (Story)
Alec Sokolow (Screenplay)
Joss Whedon (Screenplay)
Joel Cohen (Screenplay)
Joe Ranft (Story)
Pete Docter (Story)
Tom Hanks (Actor)
Tim Allen (Actor)
Don Rickles (Actor)
Jim Varney (Actor)
Wallace Shawn (Actor)
John Ratzenberger (Actor)
Annie Potts (Actor)
John Morris (Actor)
Erik von Detten (Actor)


In [64]:
# Locate the <img> tag with the class 'poster'
poster_img_tag = soup.find('img', class_='poster w-full')

# Extract the 'src' attribute, which contains the poster link
if poster_img_tag:
    poster_link = poster_img_tag.get('src')
    print(f"Poster Link: {poster_link}")
else:
    print("Poster not found.")

Poster Link: https://media.themoviedb.org/t/p/w300_and_h450_bestv2/om4fMx3e4xkx27sAsacoFP4WiEd.jpg


### Scrape the data

In [24]:
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
]

class TmdbWebScraper:
    BASE_URL = "https://www.themoviedb.org/movie/"
    
    def __init__(self,
                 proxy_list=None,
                 movie_dir_path='../data/ml-latest-small/',
                 movie_people_filename='movie_cast_and_crew.csv',
                 poster_links_filename='poster_links.csv',
                 max_retries=3):

        self.proxy_list = proxy_list.copy() if proxy_list else []
        self.movie_dir_path = movie_dir_path
        self.movie_people_filename = movie_people_filename
        self.poster_links_filename = poster_links_filename
        self.max_retries = max_retries
        
        self._movie_people = []
        self._poster_links = []
        self._failed_requests = []
        self._crew_not_found = []
        self._cast_not_found = []
        self._poster_link_not_found = []
        self._current_iteration = 0
        self._movieIds_already_checked = []
        self.last_reported_percent = -1
        self.delay_min = 0.25
        self.delay_max = 0.75
        self.num_links_to_scrape = 0
        self._scraping_stopped = False


    def update_proxy_list(self, new_proxy_list):
        """Updates the proxy list only if it is a non-empty list."""
    
        if not isinstance(new_proxy_list, list):
            print("Error: Provided proxy list is not a list.")
            return 
    
        if not new_proxy_list:
            print("Error: Proxy list is empty. Please provide a list of proxies.")
            return 
    
        self.proxy_list = new_proxy_list.copy()
        print(f"Proxy list updated successfully with {len(new_proxy_list)} proxies.")

        if self._scraping_stopped:
            print("You can now rerun `TmdbWebScraper.aggregate_movie_casts_and_crews()` to resume scraping.")
            self._scraping_stopped = False
        
    def _get_random_proxy(self):
        if not self.proxy_list: 
            self._scraping_stopped = True
            return None
            
        return random.choice(self.proxy_list)

    def _fetch_movie_data(self, movie_id, tmdb_id, session):
        """ 
        Apparently TMDB has a rate limit of ~40 requests per 10 seconds so we want
        to introduce a random time delay to make our scraper appear more human-like.
        We will start at 200ms and work from there.
        """
        if self._scraping_stopped:
            print(f"Skipping Movie {movie_id} due to lack of proxies.")
            self._failed_requests.append((movie_id, tmdb_id, "No Proxy Available"))
            return None  
        
        url = f"{self.BASE_URL}{tmdb_id}/"
        
        for attempt in range(self.max_retries):
            time.sleep(random.uniform(self.delay_min, self.delay_max))

            proxy = self._get_random_proxy()
            
            if proxy is None:
                print(f"Skipping Movie {movie_id} due to lack of proxies.")
                self._failed_requests.append((movie_id, tmdb_id, "No Proxy Available"))
                return None
            
            proxies = {"https": proxy}

            try:
                response = session.get(url, proxies=proxies, timeout=5) 
                
                if response.status_code == 200:
                    if "captcha" in response.text.lower() or len(response.text) < 500:  
                        # TMDB might block with a bot detector or short HTML
                        print(f"Soft Block Detected for Movie {movie_id} - Response Length: {len(response.text)}")
                        self._failed_requests.append((movie_id, tmdb_id, "Soft Block"))
                        return None
                    return response.text
    
                elif response.status_code == 429:
                    # If we still hit the rate limit then we use an exponential backoff up to 10 sec
                    # and then increase our random delay range
                    wait_time = min(2 ** attempt, 10)
                    print(f"Rate limit hit for Movie {movie_id}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    self.delay_max *= 1.2
                else:
                    if response:
                        print(f"Request failed: MovieId {movie_id} - Status Code {response.status_code}")
                        self._failed_requests.append((movie_id, tmdb_id, response.status_code))
                    else:
                        self._failed_requests.append((movie_id, tmdb_id, "No Response"))

            except requests.exceptions.RequestException:
                print(f"Proxy failed: {proxy}, switching to a new proxy...")
                try:
                    self.proxy_list.remove(proxy)
                    print(f"Removed proxy: {proxy}")
                except ValueError:
                    print(f"Proxy {proxy} was not found in the list. It might have already been removed.")
                continue 
                
        
        return None 
        
    def _scrape_movie_metadata_from_tmdb(self, movie_row, session):
        movie_id = movie_row.movieId
        tmdb_id = movie_row.tmdbId
        people = []
        poster_link = None

        if movie_id not in self._movieIds_already_checked:
        
            response = self._fetch_movie_data(movie_id, tmdb_id, session)   
    
            if response:
                soup = BeautifulSoup(response, 'html.parser')
            
                crew_list = soup.find('ol', class_='people no_image')
                if crew_list:
                    crew = crew_list.find_all('li', class_='profile')
                    for person in crew:
                        name = person.find('a').text
                        jobs = person.find('p', class_='character').text
                        jobs = [job.strip() for job in jobs.split(',')]
                        for job in jobs:
                            people.append((movie_id, name, job))
                else:
                    print(f"Crew not found in html: MovieId {movie_id}")
                    self._crew_not_found.append((movie_id, tmdb_id))
            
                cast_list = soup.find('ol', class_='people scroller')
                if cast_list:
                    cast = cast_list.find_all('li', class_='card')
                    for actor in cast:
                        name_tag = actor.find('img', alt=True)
                        actor_name = name_tag['alt'] if name_tag else "Unknown"
                        people.append((movie_id, actor_name, 'Actor'))
                else:
                    print(f"Cast not found in html: MovieId {movie_id}")
                    self._cast_not_found.append((movie_id, tmdb_id))
                
                poster_img_tag = soup.find('img', class_='poster w-full')
                if poster_img_tag:
                    poster_link = (movie_id, poster_img_tag.get('src'))
                else:
                    print(f"Poster link not found in html: MovieId {movie_id}")
                    self._poster_link_not_found.append((movie_id, tmdb_id))

                self._movieIds_already_checked.append(movie_id)
    
        return people, poster_link

    def _track_percentage_completed(self):
        percent_completed = round(self._current_iteration/self.num_links_to_scrape*100)
        if percent_completed % 2 == 0 and percent_completed > self.last_reported_percent:
            print(f"{percent_completed}% completed")
            self.last_reported_percent = percent_completed

    def aggregate_movie_casts_and_crews(self, links_df):
    
        print("Beginning movie metadata data aggregation.")
        
        self.num_links_to_scrape = links_df.shape[0]

        with requests.Session() as session:
            # Add a random user-agent header to make it look like the request is  
            # coming from a browser to stop the request being blocked.
            session.headers.update({"User-Agent": random.choice(USER_AGENTS) })

            for index, movie_row in links_df.iterrows():
                if self._scraping_stopped:
                    print("Scraping stopped due to lack of proxies. ")
                    print("Please update the proxy list with the `TmdbWebScraper.update_proxy_list(proxy_list:List)` method.")
                    print("Then rerun `TmdbWebScraper.aggregate_movie_casts_and_crews()` to continue scraping.")
                    print("Exiting...")
                    break 
                movie_cast_and_crew, poster_link = self._scrape_movie_metadata_from_tmdb(movie_row, session)

                if movie_cast_and_crew:
                    self._movie_people.extend(movie_cast_and_crew)
                if poster_link:
                    self._poster_links.append(poster_link)

                self._current_iteration += 1
                self._track_percentage_completed()
                
        print("Data aggregation complete.")

    def rerun_aggregation_for_failed_requests(self):
        # Reset the counters
        self._current_iteration = 0
        self.last_reported_percent = -1
        
        failed_requests_df = self.get_failed_requests_df()
        
        # Reset the list for tracking failed requests
        self._failed_requests = []
        self.aggregate_movie_casts_and_crews(failed_requests_df)
        
    def get_movie_cast_and_crew_df(self):
        return pd.DataFrame(self._movie_people, columns=['movieId', 'name', 'role'])

    def get_poster_links_df(self):
        return pd.DataFrame(self._poster_links, columns=['movieId', 'poster_link'])

    def get_failed_requests_df(self):
        return pd.DataFrame(self._failed_requests, columns=['movieId', 'tmdbId', 'statusCode'])

    def get_crew_not_found_df(self):
        return pd.DataFrame(self._crew_not_found, columns=['movieId', 'tmdbId'])
        
    def get_cast_not_found_df(self):
        return pd.DataFrame(self._cast_not_found, columns=['movieId', 'tmdbId'])

    def get_poster_link_not_found_df(self):
        return pd.DataFrame(self._poster_link_not_found, columns=['movieId', 'tmdbId'])
        
    def save_movie_cast_and_crew_data(self):
        self.get_movie_cast_and_crew_df.to_csv(self.movie_dir_path+self.movie_people_filename, index=False)
        print(f"Movie cast & crew saved to {self.movie_dir_path}{self.movie_people_filename}")

    def save_poster_links_data(self):
        self.get_poster_links_df.to_csv(self.movie_dir_path+self.poster_links_filename, index=False)
        print(f"Poster links saved to {self.movie_dir_path}{self.poster_links_filename}")


In [27]:
proxy_list = [
    "https://8.219.97.248",
    "https://67.43.228.250",
    "https://13.245.75.139",
    "https://222.252.194.204",
    "https://51.210.54.186",
    "https://204.236.137.68",
    "https://63.35.64.177",
    "https://113.160.133.32",
    "https://3.90.100.12",
    "https://13.59.156.167",
    "https://54.152.3.36",
    "https://3.130.65.162",
    "https://52.67.10.183",
    "https://18.228.198.164",
    "https://54.233.119.172",
    "https://46.51.249.135",
    "https://3.123.150.192",
    "https://18.228.149.161",
    "https://3.124.133.93",
    "https://3.127.62.252",
    "https://35.79.120.242",
    "https://35.72.118.126",
    "https://3.126.147.182",
    "https://2.59.181.125",
    "https://3.136.29.104",
    "https://13.56.192.187",
]


In [28]:
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv', 
    usecols=['movieId', 'imdbId', 'tmdbId'],
    dtype={'movieId': int, 'imdbId': str, 'tmdbId': str}
)
movies_missing_tmdbIds = links_df[links_df.tmdbId.isna()]
links_df = links_df[~links_df.movieId.isin(movies_missing_tmdbIds.movieId)]

scraper = TmdbWebScraper(proxy_list)
scraper.aggregate_movie_casts_and_crews(links_df)

Beginning movie metadata data aggregation.
Proxy failed: https://3.124.133.93, switching to a new proxy...
Removed proxy: https://3.124.133.93
Proxy failed: https://13.56.192.187, switching to a new proxy...
Removed proxy: https://13.56.192.187
Proxy failed: https://13.59.156.167, switching to a new proxy...
Removed proxy: https://13.59.156.167
0% completed
Proxy failed: https://2.59.181.125, switching to a new proxy...
Removed proxy: https://2.59.181.125
Proxy failed: https://54.233.119.172, switching to a new proxy...
Removed proxy: https://54.233.119.172
Proxy failed: https://51.210.54.186, switching to a new proxy...
Removed proxy: https://51.210.54.186
Proxy failed: https://46.51.249.135, switching to a new proxy...
Removed proxy: https://46.51.249.135
Proxy failed: https://18.228.149.161, switching to a new proxy...
Removed proxy: https://18.228.149.161
Proxy failed: https://222.252.194.204, switching to a new proxy...
Removed proxy: https://222.252.194.204
Proxy failed: https://

In [130]:
people, poster_link = _scrape_movie_metadata_from_tmdb(movie_row)
people

[(269    310
  Name: movieId, dtype: int64,
  'Jean-Luc Godard',
  'Director'),
 (269    310
  Name: movieId, dtype: int64,
  'Jean-Luc Godard',
  'Screenplay'),
 (269    310
  Name: movieId, dtype: int64,
  'Claude Chabrol',
  'Story'),
 (269    310
  Name: movieId, dtype: int64,
  'François Truffaut',
  'Story'),
 (269    310
  Name: movieId, dtype: int64,
  'Jean-Paul Belmondo',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Jean Seberg',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Daniel Boulanger',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Henri-Jacques Huet',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Roger Hanin',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Van Doude',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Claude Mansard',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Liliane Dreyfus',
  'Actor'),
 (269    310
  Name: movieId, dtype: int64,
  'Unknown',
  'Actor')]

In [131]:
poster_link

(269    310
 Name: movieId, dtype: int64,
 'https://media.themoviedb.org/t/p/w300_and_h450_bestv2/D1YIS8ljrSxCZvru88OVTgIjid.jpg')

In [121]:
poster_links_df.shape

(2893, 2)

In [124]:
movie_people_df.movieId.nunique()

2893

In [125]:
movies_df.movieId.nunique()

9734

In [132]:
movie_people_df[movie_people_df.movieId==310]

Unnamed: 0,movieId,name,role


In [None]:
# Collect all data
movie_people = []
for index, movie_row in links_df.iterrows():
    movie_people.extend(extract_movie_people(movie_row))

# Convert to a DataFrame
movie_people_df = pd.DataFrame(movie_people, columns=['movieId', 'name', 'role'])
# Save to csv
movie_people_df.to_csv('../data/ml-latest-small/movie_people.csv', index=False)

print(movie_people_df.shape)