1. Data Scraping

We collect the top *n* movies sorted by year with the following criteria:
- Have atleast 1000 reviews
- Is a Movie (Not a series)
- Is in English

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from imdb import Cinemagoer
import calendar
import re
import IPython

import socket
socket.setdefaulttimeout(120)

In [3]:
base_url = 'https://www.imdb.com/search/title/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

cg = Cinemagoer()

# Initialize lists and session
titles = []
plots = []
ratings = []
budgets = []
years = []
session = requests.Session()  # Crucial for maintaining cookies

# New Lists
cast = []
mpa_ratings = []
directors = []
distributors = []
producers = []
genres = []
runtime = []

# Loop through each year and month from 2019 to 2024
for year in range(2018, 2019+1):
    for month in range(1, 13):
        IPython.display.clear_output()
        print(f"{year}-{month:02d}")

        # Calculate start and end dates for the month
        # Generated by DeepSeek
        last_day = calendar.monthrange(int(year), int(month))[1]
        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day:02d}"

        # Set parameters with the current month's date range
        # Generated by DeepSeek
        params = {
            'title_type': 'feature',
            'num_votes': 1000,
            'primary_language': 'en',
            'sort': 'year,asc',
            'release_date': f"{start_date},{end_date}"
        }

        # Make request for the current month
        # Generated by ChatGPT
        response = session.get(base_url, params=params, headers=headers, timeout=70)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}-{month:02d}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        movies = soup.find_all('div', class_='ipc-metadata-list-summary-item__tc')

        for movie in movies:
            RETRY_ATTEMPTS = 3
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    # Title
                    title_tag = movie.find('h3', class_='ipc-title__text')
                    title = title_tag.text.strip() if title_tag else 'N/A'
                    id = re.search(r"/title/tt([0-9]+)/",title_tag.parent.get("href")).group(1)
                    
                    # Actors
                    movie = cg.get_movie(id)
                    actors = movie.get('cast')
                    actors = [str(actor) for actor in actors]
                    actors = ",".join(actors)

                    #Rating
                    rating = movie['rating']
                    
                    #Budget
                    bo = movie.get('box office')
                    budget = bo.get('Budget') if bo else 'N/A'
                    
                    #Plot
                    plot = movie.get('plot')
                    plot = " ".join(plot) if plot else 'N/A'
                    
                    # MPA Ratings
                    certs = movie.get('certificates')
                    certs = ",".join(certs) + " "
                    certs = re.search(r"United States:(.*?)[\b,\: ]",certs)
                    certs = certs.group(1) if certs else 'N/A'

                    # Directors
                    director = movie.get('director')
                    director = [str(direct) for direct in director]
                    director = ",".join(director)

                    # Producers
                    prod_list = movie.get('production companies')
                    prod_list = [str(prod) for prod in prod_list]
                    prod_list = ",".join(prod_list)
                    

                    # Distributors
                    dist_list = movie.get('distributors')
                    dist_list = [str(dist) for dist in dist_list]
                    dist_list = ",".join(dist_list)
                    
                    # Genre
                    genre = movie.get('genres')
                    genre = [str(gen) for gen in genre]
                    genre = ",".join(genre)

                    # Runtime
                    run = movie.get('runtimes')
                    run = ",".join(run)

                    #appends
                    ratings.append(rating)
                    titles.append(title)
                    genres.append(genre)
                    distributors.append(dist_list)
                    directors.append(director)
                    mpa_ratings.append(certs)
                    plots.append(plot)
                    budgets.append(budget)
                    cast.append(actors)
                    runtime.append(run)
                    years.append(year)
                    producers.append(prod_list)
                    break
                except:
                    print(id)
                    sleep(1)

2019-12
4130484
4130484
4130484


In [4]:
len(mpa_ratings)

591

In [5]:
# Create DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Year' : years,
    'Plot': plots,
    'Rating': ratings,
    'Budget': budgets,
    'Cast': cast,
    'MPA Rating': mpa_ratings,
    'Directors': directors,
    'Distributors': distributors,
    'Producers': producers,
    'Genre': genres,
    'Runtime in Minutes': runtime
})

print(f"Total movies scraped: {len(df)}")
df

Total movies scraped: 591


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Den of Thieves,2018,An elite unit of the LA County Sheriff's Dept....,7.0,"$30,000,000 (estimated)","Gerard Butler,Pablo Schreiber,O'Shea Jackson J...",TV-14,Christian Gudegast,"STX Entertainment,Ascot Elite Entertainment Gr...","STX Films,Diamond Film Productions,Tucker Tool...","Action,Crime,Drama,Thriller",140
1,2. Hereditary,2018,A grieving family is haunted by tragic and dis...,7.3,"$10,000,000 (estimated)","Alex Wolff,Gabriel Byrne,Toni Collette,Milly S...",TV-14,Ari Aster,"A24,BestFilm.eu,Blitz,DeAPlaneta,Diamond Films...","PalmStar Media,Finch Entertainment,Windy Hill ...","Drama,Horror,Mystery,Thriller",127
2,3. Black Panther,2018,"T'Challa, heir to the hidden but advanced king...",7.3,"$200,000,000 (estimated)","Chadwick Boseman,Michael B. Jordan,Lupita Nyon...",PG-13,Ryan Coogler,"Walt Disney Studios Motion Pictures,Walt Disne...",Marvel Studios,"Action,Adventure,Sci-Fi",134
3,4. 12 Strong,2018,12 Strong tells the story of the first Special...,6.5,"$35,000,000 (estimated)","Chris Hemsworth,Michael Shannon,Michael Peña,N...",R,Nicolai Fuglsig,"01 Distribution,ACME,ACME,ACME,Aurum Films,Bel...","Alcon Entertainment,Black Label Media,Jerry Br...","Action,Drama,History,War",130
4,5. Eighth Grade,2018,An introverted teenage girl tries to survive t...,7.3,"$2,000,000 (estimated)","Elsie Fisher,Josh Hamilton,Emily Robinson,Jake...",TV-PG,Bo Burnham,"A24,Sony Pictures Worldwide Acquisitions (SPWA...","A24,IAC Films","Comedy,Drama",93
...,...,...,...,...,...,...,...,...,...,...,...,...
586,21. Hammer,2019,A father faces a personal crisis when he disco...,5.7,,"Will Patton,Mark O'Brien,Ben Cotton,Vickie Pap...",,Christian Sparkes,"Vertical Entertainment,A71 Entertainment,Eagle...","Sara Fost Pictures,Away Films,JoBro Production...","Crime,Drama,Thriller",82
587,22. The Murder of Nicole Brown Simpson,2019,"Inspired by true events, the film follows OJ S...",2.6,,"Mena Suvari,Nick Stahl,Taryn Manning,Agnes Bru...",R,Daniel Farrands,"Showtime Networks,GEM Entertainment,Cinemundo,...","Skyline Entertainment,Bundy Brown Island,1428 ...","Crime,Horror,Mystery,Thriller",82
588,23. A Christmas Prince: The Royal Baby,2019,"It's Christmastime in Aldovia, and a royal bab...",5.4,,"Rose McIver,Ben Lamb,Alice Krige,Honor Kneafse...",TV-PG,John Schultz,"Netflix,Netflix,Netflix","Netflix,Motion Picture Corporation of America ...","Family,Romance",84
589,24. The Madness Within,2019,Russ Washington is a successful businessman on...,4.6,"$1,000,000 (estimated)","Hunter G. Williams,Edin Gali,Tessa Farrell,Lil...",R,Hunter G. Williams,"Midnight Releasing,Uncork'd Entertainment",Busted Knuckle Productions,Thriller,97


In [6]:
df.to_csv("raw3.csv")
print("DONE")

DONE
