1. Data Scraping

We collect the top *n* movies sorted by year with the following criteria:
- Have atleast 1000 reviews
- Is a Movie (Not a series)
- Is in English

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from imdb import Cinemagoer
import calendar
import re
import IPython

import socket
socket.setdefaulttimeout(120)

In [3]:
base_url = 'https://www.imdb.com/search/title/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

cg = Cinemagoer()

# Initialize lists and session
titles = []
plots = []
ratings = []
budgets = []
years = []
session = requests.Session()  # Crucial for maintaining cookies

# New Lists
cast = []
mpa_ratings = []
directors = []
distributors = []
producers = []
genres = []
runtime = []

# Loop through each year and month from 2019 to 2024
for year in range(2022, 2024+1):
    for month in range(1, 13):
        IPython.display.clear_output()
        print(f"{year}-{month:02d}")

        # Calculate start and end dates for the month
        # Generated by DeepSeek
        last_day = calendar.monthrange(int(year), int(month))[1]
        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day:02d}"

        # Set parameters with the current month's date range
        # Generated by DeepSeek
        params = {
            'title_type': 'feature',
            'num_votes': 1000,
            'primary_language': 'en',
            'sort': 'year,asc',
            'release_date': f"{start_date},{end_date}"
        }

        # Make request for the current month
        # Generated by ChatGPT
        response = session.get(base_url, params=params, headers=headers, timeout=70)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}-{month:02d}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        movies = soup.find_all('div', class_='ipc-metadata-list-summary-item__tc')

        for movie in movies:
            RETRY_ATTEMPTS = 3
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    # Title
                    title_tag = movie.find('h3', class_='ipc-title__text')
                    title = title_tag.text.strip() if title_tag else 'N/A'
                    id = re.search(r"/title/tt([0-9]+)/",title_tag.parent.get("href")).group(1)
                    
                    # Actors
                    movie = cg.get_movie(id)
                    actors = movie.get('cast')
                    actors = [str(actor) for actor in actors]
                    actors = ",".join(actors)

                    #Rating
                    rating = movie['rating']
                    
                    #Budget
                    bo = movie.get('box office')
                    budget = bo.get('Budget') if bo else 'N/A'
                    
                    #Plot
                    plot = movie.get('plot')
                    plot = " ".join(plot) if plot else 'N/A'
                    
                    # MPA Ratings
                    certs = movie.get('certificates')
                    certs = ",".join(certs) + " "
                    certs = re.search(r"United States:(.*?)[\b,\: ]",certs)
                    certs = certs.group(1) if certs else 'N/A'

                    # Directors
                    director = movie.get('director')
                    director = [str(direct) for direct in director]
                    director = ",".join(director)

                    # Producers
                    prod_list = movie.get('production companies')
                    prod_list = [str(prod) for prod in prod_list]
                    prod_list = ",".join(prod_list)
                    

                    # Distributors
                    dist_list = movie.get('distributors')
                    dist_list = [str(dist) for dist in dist_list]
                    dist_list = ",".join(dist_list)
                    
                    # Genre
                    genre = movie.get('genres')
                    genre = [str(gen) for gen in genre]
                    genre = ",".join(genre)

                    # Runtime
                    run = movie.get('runtimes')
                    run = ",".join(run)

                    #appends
                    ratings.append(rating)
                    titles.append(title)
                    genres.append(genre)
                    distributors.append(dist_list)
                    directors.append(director)
                    mpa_ratings.append(certs)
                    plots.append(plot)
                    producers.append(prod_list)
                    budgets.append(budget)
                    cast.append(actors)
                    runtime.append(run)
                    years.append(year)
                    break
                except:
                    print(id)
                    sleep(1)

2024-12
18224682
18224682
18224682


In [4]:
len(mpa_ratings)

877

In [5]:
# Create DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Year' : years,
    'Plot': plots,
    'Rating': ratings,
    'Budget': budgets,
    'Cast': cast,
    'MPA Rating': mpa_ratings,
    'Directors': directors,
    'Distributors': distributors,
    'Producers': producers,
    'Genre': genres,
    'Runtime in Minutes': runtime
})

print(f"Total movies scraped: {len(df)}")
df

Total movies scraped: 877


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Fresh,2022,"After quitting dating apps, a woman meets the ...",6.7,,"Daisy Edgar-Jones,Sebastian Stan,Jojo T. Gibbs...",R,Mimi Cave,"Hulu,Disney+,Disney+,RTL Zwei,RTL2,SRF zwei,Sc...","Searchlight Pictures,Legendary Entertainment,H...","Horror,Thriller",114
1,2. Scream,2022,25 years after a streak of brutal murders shoc...,6.3,"$24,000,000 (estimated)","Neve Campbell,Courteney Cox,David Arquette,Mel...",R,"Matt Bettinelli-Olpin,Tyler Gillett","Paramount Pictures,Paramount Pictures UK,Param...","Paramount Pictures,Spyglass Media Group,Projec...","Horror,Mystery,Thriller",114
2,3. Speak No Evil,2022,A Danish family visits a Dutch family they met...,6.6,"$3,200,000 (estimated)","Morten Burian,Sidsel Siem Koch,Fedja van Huêt,...",Not,Christian Tafdrup,"Capella Film,Night Edge Pictures,Nordisk Film ...","Profile Pictures,OAK Motion Pictures,Det Dansk...","Drama,Horror,Mystery,Thriller",97
3,4. Living,2022,"In 1950s London, a humorless bureaucrat decide...",7.2,,"Alex Sharp,Adrian Rawlins,Hubert Burton,Oliver...",PG-13,Oliver Hermanus,"Lionsgate,Sony Pictures Classics,Cine Canibal,...","Film4,County Hall,Lipsync Productions,Rocket S...",Drama,102
4,5. Watcher,2022,A young American woman moves with her husband ...,6.3,,"Maika Monroe,Karl Glusman,Burn Gorman,Tudor Pe...",R,Chloe Okuno,"IFC Midnight,Cine Canibal,Universal Pictures I...","Animal Casting Time,Imagenation Abu Dhabi FZ,L...","Drama,Horror,Thriller",96
...,...,...,...,...,...,...,...,...,...,...,...,...
872,10. F*** Marry Kill,2024,As a serial killer targets women on dating app...,5.3,,"Lucy Hale,Virginia Gardner,Brooke Nevin,Samer ...",R,Laura Murphy,"Lionsgate,Amazon Prime Video,Apple TV+,SPI Int...","Buzzfeed Studios,BondIt Media Capital,CR8IV DNA","Comedy,Mystery,Romance,Thriller",97
873,11. Dirty Angels,2024,It centers on a group of female soldiers who d...,4.3,,"Eva Green,Maria Bakalova,Ruby Rose,Reza Brojer...",R,Martin Campbell,"Gravel Road Distribution Group,Pioneer Films,T...","I Road Productions,Millennium Media,Nu Boyana ...","Action,Drama,Thriller,War",104
874,12. Mary,2024,"In this timeless coming-of-age story, Mary is ...",5.2,,"Noa Cohen,Ido Tako,Ori Pfeffer,Hilla Vidor,Dud...",TV-14,D.J. Caruso,"Netflix,Arna Media,Atmosfera Kino,Svoe Kino,Ne...","Aloe Entertainment,Creativity Media,FitzHenry ...",Drama,112
875,14. Jamie Foxx: What Had Happened Was...,2024,A stand-up special with Jamie Foxx performing ...,5.8,,"Jamie Foxx,Craig Brockman,Nisan Stewart,Bennet...",TV-MA,Hamish Hamilton,"Netflix,Netflix","Done and Dusted Productions,Foxxhole Productio...","Documentary,Comedy",68


In [6]:
df.to_csv("raw5.csv")
print("DONE")

DONE
