1. Data Scraping

We collect the top *n* movies sorted by year with the following criteria:
- Have atleast 1000 reviews
- Is a Movie (Not a series)
- Is in English

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from imdb import Cinemagoer
import calendar
import re
import IPython

import socket
socket.setdefaulttimeout(120)

In [4]:
base_url = 'https://www.imdb.com/search/title/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

cg = Cinemagoer()

# Initialize lists and session
titles = []
plots = []
ratings = []
budgets = []
years = []
session = requests.Session()  # Crucial for maintaining cookies

# New Lists
cast = []
mpa_ratings = []
directors = []
distributors = []
producers = []
genres = []
runtime = []

# Loop through each year and month from 2019 to 2024
for year in range(2014, 2015+1):
    for month in range(1, 13):
        IPython.display.clear_output()
        print(f"{year}-{month:02d}")

        # Calculate start and end dates for the month
        # Generated by DeepSeek
        last_day = calendar.monthrange(int(year), int(month))[1]
        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day:02d}"

        # Set parameters with the current month's date range
        # Generated by DeepSeek
        params = {
            'title_type': 'feature',
            'num_votes': 1000,
            'primary_language': 'en',
            'sort': 'year,asc',
            'release_date': f"{start_date},{end_date}"
        }

        # Make request for the current month
        # Generated by ChatGPT
        response = session.get(base_url, params=params, headers=headers, timeout=70)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}-{month:02d}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        movies = soup.find_all('div', class_='ipc-metadata-list-summary-item__tc')

        for movie in movies:
            RETRY_ATTEMPTS = 3
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    # Title
                    title_tag = movie.find('h3', class_='ipc-title__text')
                    title = title_tag.text.strip() if title_tag else 'N/A'
                    id = re.search(r"/title/tt([0-9]+)/",title_tag.parent.get("href")).group(1)
                    
                    # Actors
                    movie = cg.get_movie(id)
                    actors = movie.get('cast')
                    actors = [str(actor) for actor in actors]
                    actors = ",".join(actors)

                    #Rating
                    rating = movie['rating']
                    
                    #Budget
                    bo = movie.get('box office')
                    budget = bo.get('Budget') if bo else 'N/A'
                    
                    #Plot
                    plot = movie.get('plot')
                    plot = " ".join(plot) if plot else 'N/A'
                    
                    # MPA Ratings
                    certs = movie.get('certificates')
                    certs = ",".join(certs) + " "
                    certs = re.search(r"United States:(.*?)[\b,\: ]",certs)
                    certs = certs.group(1) if certs else 'N/A'

                    # Directors
                    director = movie.get('director')
                    director = [str(direct) for direct in director]
                    director = ",".join(director)

                    # Producers
                    prod_list = movie.get('production companies')
                    prod_list = [str(prod) for prod in prod_list]
                    prod_list = ",".join(prod_list)

                    # Distributors
                    dist_list = movie.get('distributors')
                    dist_list = [str(dist) for dist in dist_list]
                    dist_list = ",".join(dist_list)
                    
                    # Genre
                    genre = movie.get('genres')
                    genre = [str(gen) for gen in genre]
                    genre = ",".join(genre)

                    # Runtime
                    run = movie.get('runtimes')
                    run = ",".join(run)

                    #appends
                    ratings.append(rating)
                    titles.append(title)
                    genres.append(genre)
                    producers.append(prod_list)
                    distributors.append(dist_list)
                    directors.append(director)
                    mpa_ratings.append(certs)
                    plots.append(plot)
                    budgets.append(budget)
                    cast.append(actors)
                    runtime.append(run)
                    years.append(year)
                    break
                except:
                    print(id)
                    sleep(1)

2015-12


In [5]:
len(mpa_ratings)

596

In [6]:
# Create DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Year' : years,
    'Plot': plots,
    'Rating': ratings,
    'Budget': budgets,
    'Cast': cast,
    'MPA Rating': mpa_ratings,
    'Directors': directors,
    'Distributors': distributors,
    'Producers': producers,
    'Genre': genres,
    'Runtime in Minutes': runtime
})

print(f"Total movies scraped: {len(df)}")
df

Total movies scraped: 596


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Whiplash,2014,A promising young drummer enrolls at a cut-thr...,8.5,"$3,300,000 (estimated)","Miles Teller,J.K. Simmons,Paul Reiser,Melissa ...",TV-14,Damien Chazelle,"Sony Pictures Classics,Sony Pictures Worldwide...","Bold Films,Blumhouse Productions,Right of Way ...","Drama,Music",106
1,2. RoboCop,2014,"In 2028 Detroit, when Alex Murphy, a loving hu...",6.1,"$100,000,000 (estimated)","Joel Kinnaman,Gary Oldman,Michael Keaton,Abbie...",TV-14,José Padilha,"Columbia Pictures,ACME,ACME,ACME,B&H Film Dist...","Metro-Goldwyn-Mayer (MGM),Columbia Pictures,St...","Action,Crime,Sci-Fi,Thriller",117
2,3. The Babadook,2014,A single mother and her child fall into a deep...,6.8,"$2,000,000 (estimated)","Essie Davis,Noah Wiseman,Hayley McElhinney,Dan...",Not,Jennifer Kent,"Capelight Pictures,Cinecolor Films,Feelgood En...","Screen Australia,Causeway Films,The South Aust...","Drama,Horror,Mystery",94
3,4. What We Do in the Shadows,2014,"Viago, Deacon, and Vladislav are vampires who ...",7.6,"$1,600,000 (estimated)","Jemaine Clement,Taika Waititi,Jonny Brugh,Cori...",R,"Jemaine Clement,Taika Waititi","Benuca Films,Cinetren,Festival Films / Yedra F...","Unison Films,Defender Films,Park Road Post Pro...","Comedy,Horror",86
4,5. Boyhood,2014,"The life of Mason, from early childhood to his...",7.9,"$4,000,000 (estimated)","Ellar Coltrane,Patricia Arquette,Elijah Smith,...",TV-14,Richard Linklater,"Diaphana Distribution,IFC Films,Lumière,Lumièr...","IFC Productions,Detour Filmproduction",Drama,165
...,...,...,...,...,...,...,...,...,...,...,...,...
591,21. Manifesto,2015,Cate Blanchett performs manifestos as a series...,6.5,,"Cate Blanchett,Erika Bauer,Ruby Bustamante,Car...",Not,Julian Rosefeldt,"Modern Films,More2Screen,A-One Films,DCM Film ...","Bayerischer Rundfunk,Ruhr Triennale,Schiwago Film",Drama,95
592,22. Thirst,2015,When a group of wayward teens arrive at a dese...,4.4,,"John Redlinger,Jes Macallan,Karl Makinen,Clare...",,Greg Kiefer,"Uncork'd Entertainment,At Entertainment,Amazin...","Cosmic Pictures,Thirst Film","Action,Adventure,Horror,Mystery,Sci-Fi,Thriller",87
593,23. Brothers of the Wind,2015,"In a world where it takes courage to fly, a yo...",6.8,,"Jean Reno,Tobias Moretti,Manuel Camacho,Eva Ku...",TV-G,"Gerardo Olivares,Otmar Penker","Crystalsky Multimedia,Légende Distribution,War...","FilmVergnuegen,Terra Mater Factual Studios","Adventure,Drama,Family",98
594,24. Bad Roomies,2015,"When they lose a roomie, the other two men fin...",5.1,,"Tommy Savas,Eric Pumphrey,Patrick Renna,Hanove...",Not,Jason Schnell,The Orchard,"Eastside Films,Fae Studio",Comedy,93


In [7]:
df.to_csv("raw1.csv")
print("DONE")

DONE
