1. Data Scraping

We collect the top *n* movies sorted by year with the following criteria:
- Have at least 1000 reviews (IMDb user votes)
- Is a Movie (Not a series)
- Is in English

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from imdb import Cinemagoer
import calendar
import re
import IPython

import socket
socket.setdefaulttimeout(120)

In [3]:
base_url = 'https://www.imdb.com/search/title/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Scrape window (both years inclusive) and how many times a single movie's
# detail lookup is attempted before the movie is skipped.
START_YEAR = 2020
END_YEAR = 2021
RETRY_ATTEMPTS = 3

cg = Cinemagoer()

# Initialize lists and session
titles = []
plots = []
ratings = []
budgets = []
years = []
session = requests.Session()  # Crucial for maintaining cookies

# New Lists
cast = []
mpa_ratings = []
directors = []
distributors = []
producers = []
genres = []
runtime = []


def _join_names(values):
    """Convert every item of ``values`` with ``str`` and join them with commas.

    Raises ``TypeError`` when ``values`` is ``None`` (field missing on the
    movie); the caller treats that as a failed attempt.
    """
    return ",".join(str(value) for value in values)


# Loop through each year and month from START_YEAR to END_YEAR
for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        IPython.display.clear_output()
        print(f"{year}-{month:02d}")

        # Calculate start and end dates for the month
        # Generated by DeepSeek
        last_day = calendar.monthrange(year, month)[1]
        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day:02d}"

        # Set parameters with the current month's date range
        # Generated by DeepSeek
        params = {
            'title_type': 'feature',
            'num_votes': 1000,
            'primary_language': 'en',
            'sort': 'year,asc',
            'release_date': f"{start_date},{end_date}"
        }

        # Make request for the current month
        # Generated by ChatGPT
        # NOTE(review): a single request is made per month, so only the items
        # present in that one response are processed - confirm whether further
        # result pages need to be fetched.
        response = session.get(base_url, params=params, headers=headers, timeout=70)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}-{month:02d}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        movies = soup.find_all('div', class_='ipc-metadata-list-summary-item__tc')

        for item in movies:
            # Read title and IMDb id from the search-result tag once, outside
            # the retry loop: this part is pure HTML parsing and retrying it
            # cannot change the outcome.
            title_tag = item.find('h3', class_='ipc-title__text')
            link = None
            if title_tag is not None and title_tag.parent is not None:
                link = title_tag.parent.get("href")
            id_match = re.search(r"/title/tt([0-9]+)/", link) if link else None
            if id_match is None:
                print(f"Skipping a search result without a title link in {year}-{month:02d}")
                continue
            title = title_tag.text.strip()
            imdb_id = id_match.group(1)

            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    # The Cinemagoer result is kept under its own name so the
                    # search-result tag (`item`) is never overwritten and
                    # every retry starts from the same inputs.
                    details = cg.get_movie(imdb_id)

                    # Actors
                    actors = _join_names(details.get('cast'))

                    # Rating
                    rating = details['rating']

                    # Budget
                    bo = details.get('box office')
                    budget = bo.get('Budget') if bo else 'N/A'

                    # Plot
                    plot = details.get('plot')
                    plot = " ".join(plot) if plot else 'N/A'

                    # MPA Ratings
                    # The trailing space guarantees a terminator after the last
                    # certificate. Inside a character class `\b` means a
                    # backspace character, not a word boundary; the pattern is
                    # kept as-is so the extracted values do not change.
                    certs = ",".join(details.get('certificates')) + " "
                    cert_match = re.search(r"United States:(.*?)[\b,\: ]", certs)
                    mpa_rating = cert_match.group(1) if cert_match else 'N/A'

                    # Directors
                    director = _join_names(details.get('director'))

                    # Producers
                    prod_list = _join_names(details.get('production companies'))

                    # Distributors
                    dist_list = _join_names(details.get('distributors'))

                    # Genre
                    genre = _join_names(details.get('genres'))

                    # Runtime
                    run = ",".join(details.get('runtimes'))
                except Exception as exc:
                    # `Exception` rather than a bare `except` so that
                    # KeyboardInterrupt can still stop the scrape.
                    print(f"{imdb_id}: attempt {attempt}/{RETRY_ATTEMPTS} failed ({exc!r})")
                    sleep(1)
                else:
                    # Append to every list together, only after all fields were
                    # extracted, so the columns always stay the same length.
                    ratings.append(rating)
                    titles.append(title)
                    genres.append(genre)
                    distributors.append(dist_list)
                    directors.append(director)
                    mpa_ratings.append(mpa_rating)
                    plots.append(plot)
                    producers.append(prod_list)
                    budgets.append(budget)
                    cast.append(actors)
                    runtime.append(run)
                    years.append(year)
                    break
            else:
                # Reached only when no attempt succeeded.
                print(f"{imdb_id}: giving up after {RETRY_ATTEMPTS} attempts; movie skipped")

2021-12


In [60]:
# Count how many entries have been collected in `mpa_ratings`.
len(mpa_ratings)

25

In [4]:
# Assemble the per-movie lists into one table: each (name, list) pair below
# becomes a column, in the order listed.
df = pd.DataFrame(dict([
    ('Title', titles),
    ('Year', years),
    ('Plot', plots),
    ('Rating', ratings),
    ('Budget', budgets),
    ('Cast', cast),
    ('MPA Rating', mpa_ratings),
    ('Directors', directors),
    ('Distributors', distributors),
    ('Producers', producers),
    ('Genre', genres),
    ('Runtime in Minutes', runtime),
]))

# Report the number of rows, then leave the frame as the cell's displayed value.
print(f"Total movies scraped: {len(df)}")
df

Total movies scraped: 589


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Promising Young Woman,2020,Nothing in Cassie's life is what it appears to...,7.5,,"Adam Brody,Ray Nicholson,Sam Richardson,Carey ...",R,Emerald Fennell,"Focus Features,Parco Co. Ltd.,Roadshow Films,S...","FilmNation Entertainment,Focus Features,LuckyChap","Crime,Drama,Mystery,Thriller",113
1,2. Sonic the Hedgehog,2020,"Taking refuge on Earth, when Sonic uses his in...",6.5,"$90,000,000 (estimated)","Ben Schwartz,James Marsden,Jim Carrey,Tika Sum...",TV-G,Jeff Fowler,"Paramount Pictures,Paramount+,Andes Films,B&H ...","Paramount Pictures,Sega Sammy Group,Original F...","Action,Adventure,Comedy,Family,Fantasy,Sci-Fi",99
2,3. Palm Springs,2020,Nyles and Sarah find themselves stuck in a tim...,7.4,,"Andy Samberg,Cristin Milioti,J.K. Simmons,Pete...",R,Max Barbakow,"Cinema Mondo,Con Un Pack Distribución,Husky Fi...","Limelight,Sun Entertainment Culture,The Lonely...","Comedy,Fantasy,Mystery,Romance",90
3,4. The Father,2020,A man refuses all assistance from his daughter...,8.2,"$6,000,000 (estimated)","Olivia Colman,Anthony Hopkins,Mark Gatiss,Oliv...",PG-13,Florian Zeller,"Lionsgate,Elevation Pictures,Entract Films,Son...","Les Films du Cru,Film4,Orange Studio,Canal+,Ci...","Drama,Mystery",97
4,5. The Night House,2020,A widow begins to uncover her recently decease...,6.4,,"Rebecca Hall,Sarah Goldberg,Vondie Curtis-Hall...",R,David Bruckner,"Searchlight Pictures,Walt Disney Studios Motio...","Searchlight Pictures,TSG Entertainment,Anton,P...","Horror,Mystery,Thriller",107
...,...,...,...,...,...,...,...,...,...,...,...,...
584,21. National Champions,2021,Follows star quarterback who ignites a players...,5.7,"$8,000,000 (estimated)","Stephan James,J.K. Simmons,Alexander Ludwig,Li...",R,Ric Roman Waugh,"STX Entertainment,CAA Media Finance,Elevation ...","Game1,Thunder Road Pictures,Amet Entertainment...","Drama,Sport",116
585,22. American Sicario,2021,American gangster Erik Vasquez is scheming to ...,4.3,,"Philippe A. Haddad,Maurice Compte,Maya Stojan,...",R,RJ Collins,"Saban Films,Eagle Entertainment,Lionsgate Home...","Rumble Riot Pictures,GFG Entertainment,GotFilms","Action,Crime,Drama,Thriller",101
586,23. Zoey's Extraordinary Christmas,2021,"On her first holiday without her father, Zoey ...",7.1,,"Jane Levy,Skylar Astin,Alex Newell,John Claren...",TV-14,Richard Shepard,"Roku,The Roku Channel,The Roku Channel","Feigco Entertainment,Lionsgate Television,Lion...","Comedy,Drama,Fantasy,Musical",99
587,24. Mixtape,2021,When a young girl accidentally destroys the mi...,6.6,,"Gemma Brooke Allen,Julie Bowen,Audrey Hsieh,Ol...",TV-PG,Valerie Weiss,"Netflix,Netflix","Netter Productions,Reunion Pacific Entertainment","Comedy,Drama,Family,Romance",93


In [5]:
# Write the scraped table to "raw4.csv" (relative to the working directory),
# then print a confirmation once the write has returned.
df.to_csv(path_or_buf="raw4.csv")
print("DONE")

DONE
