1. Data Scraping

We collect the top *n* movies sorted by year with the following criteria:
- Have atleast 1000 reviews
- Is a Movie (Not a series)
- Is in English

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from imdb import Cinemagoer
import calendar
import re
import IPython

import socket
socket.setdefaulttimeout(120)

In [3]:
base_url = 'https://www.imdb.com/search/title/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

cg = Cinemagoer()

# Initialize lists and session
titles = []
plots = []
ratings = []
budgets = []
years = []
session = requests.Session()  # Crucial for maintaining cookies

# New Lists
cast = []
mpa_ratings = []
directors = []
distributors = []
producers = []
genres = []
runtime = []

# Loop through each year and month from 2019 to 2024
for year in range(2016, 2017+1):
    for month in range(1, 13):
        IPython.display.clear_output()
        print(f"{year}-{month:02d}")

        # Calculate start and end dates for the month
        # Generated by DeepSeek
        last_day = calendar.monthrange(int(year), int(month))[1]
        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day:02d}"

        # Set parameters with the current month's date range
        # Generated by DeepSeek
        params = {
            'title_type': 'feature',
            'num_votes': 1000,
            'primary_language': 'en',
            'sort': 'year,asc',
            'release_date': f"{start_date},{end_date}"
        }

        # Make request for the current month
        # Generated by ChatGPT
        response = session.get(base_url, params=params, headers=headers, timeout=70)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}-{month:02d}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        movies = soup.find_all('div', class_='ipc-metadata-list-summary-item__tc')

        for movie in movies:
            RETRY_ATTEMPTS = 3
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    # Title
                    title_tag = movie.find('h3', class_='ipc-title__text')
                    title = title_tag.text.strip() if title_tag else 'N/A'
                    id = re.search(r"/title/tt([0-9]+)/",title_tag.parent.get("href")).group(1)
                    
                    # Actors
                    movie = cg.get_movie(id)
                    actors = movie.get('cast')
                    actors = [str(actor) for actor in actors]
                    actors = ",".join(actors)

                    #Rating
                    rating = movie['rating']
                    
                    #Budget
                    bo = movie.get('box office')
                    budget = bo.get('Budget') if bo else 'N/A'
                    
                    #Plot
                    plot = movie.get('plot')
                    plot = " ".join(plot) if plot else 'N/A'
                    
                    # MPA Ratings
                    certs = movie.get('certificates')
                    certs = ",".join(certs) + " "
                    certs = re.search(r"United States:(.*?)[\b,\: ]",certs)
                    certs = certs.group(1) if certs else 'N/A'

                    # Directors
                    director = movie.get('director')
                    director = [str(direct) for direct in director]
                    director = ",".join(director)

                    # Producers
                    prod_list = movie.get('production companies')
                    prod_list = [str(prod) for prod in prod_list]
                    prod_list = ",".join(prod_list)
                    

                    # Distributors
                    dist_list = movie.get('distributors')
                    dist_list = [str(dist) for dist in dist_list]
                    dist_list = ",".join(dist_list)
                    
                    # Genre
                    genre = movie.get('genres')
                    genre = [str(gen) for gen in genre]
                    genre = ",".join(genre)

                    # Runtime
                    run = movie.get('runtimes')
                    run = ",".join(run)

                    #appends
                    ratings.append(rating)
                    titles.append(title)
                    genres.append(genre)
                    producers.append(prod_list)
                    distributors.append(dist_list)
                    directors.append(director)
                    mpa_ratings.append(certs)
                    plots.append(plot)
                    budgets.append(budget)
                    cast.append(actors)
                    runtime.append(run)
                    years.append(year)
                    break
                except:
                    print(id)
                    sleep(1)

2017-12


In [4]:
len(genre)

13

In [5]:
# Create DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Year' : years,
    'Plot': plots,
    'Rating': ratings,
    'Budget': budgets,
    'Cast': cast,
    'MPA Rating': mpa_ratings,
    'Directors': directors,
    'Distributors': distributors,
    'Producers': producers,
    'Genre': genres,
    'Runtime in Minutes': runtime
})

print(f"Total movies scraped: {len(df)}")
df

Total movies scraped: 591


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Manchester by the Sea,2016,A depressed uncle is asked to take care of his...,7.8,"$8,500,000 (estimated)","Casey Affleck,Ivy O'Brien,Kyle Chandler,Richar...",R,Kenneth Lonergan,"B&H Film Distribution,Bitters End,Continental ...","Amazon Studios,K Period Media,Pearl Street Fil...",Drama,137
1,2. Deadpool,2016,A wisecracking mercenary gets experimented on ...,8.0,"$58,000,000 (estimated)","Ryan Reynolds,Karan Soni,Ed Skrein,Michael Ben...",TV-MA,Tim Miller,"Twentieth Century Fox,20th Century Fox,20th Ce...","Twentieth Century Fox,Marvel Entertainment,Kin...","Action,Comedy,Sci-Fi",108
2,3. 13 Hours,2016,"During an attack on a U.S. compound in Libya, ...",7.3,"$50,000,000 (estimated)","John Krasinski,James Badge Dale,Pablo Schreibe...",R,Michael Bay,"Andes Films,B&H Film Distribution,Central Part...","Paramount Pictures,3 Arts Entertainment,Bay Fi...","Action,Drama,History,Thriller,War",144
3,4. The 5th Wave,2016,Four waves of increasingly deadly alien attack...,5.2,"$38,000,000 (estimated)","Chloë Grace Moretz,Matthew Zuk,Gabriela Lopez,...",PG-13,J Blakeson,"Columbia Pictures,Sony Pictures Releasing,Ande...","Columbia Pictures,LStar Capital,Material Pictu...","Action,Adventure,Sci-Fi,Thriller",112
4,5. Captain Fantastic,2016,"In the forests of the Pacific Northwest, a fat...",7.8,"$5,000,000 (estimated)","Viggo Mortensen,George MacKay,Samantha Isler,A...",R,Matt Ross,"Bleecker Street Media,Entertainment One,Entert...","Electric City Entertainment,ShivHans Pictures","Comedy,Drama",118
...,...,...,...,...,...,...,...,...,...,...,...,...
586,21. The Last Guest,2017,The tragic story of a guest and his perspectiv...,6.3,$500 (estimated),"Sabrina Abu-Obeid,Carlos Alazraqui,Newell Alex...",TV-MA,ObliviousHD,"Amazon Prime Video,ObliviousHD,YouTube","Roblox,Blender,Moon Animation,Oblivious,Roblox","Animation,Action,Adventure,Drama,Family,Sci-Fi...",90
587,22. Accident,2017,A group of teenagers suffer a terrible acciden...,3.3,,"Stephanie Schildknecht,Roxane Hayward,Tyrone K...",,Dan Tondowski,"The Exchange,Batrax Entertainment,Concorde Hom...","Forefront Media Group,Superbe Films,D21 Films,...","Action,Thriller",95
588,23. The Ice Cream Truck,2017,Mary moves back to her suburban hometown to fi...,4.2,,"Deanna Russo,Emil Johnsen,John Redlinger,Sam S...",,Megan Freels Johnston,"Tulip Pictures México,SC Movies,Uncork'd Enter...",Look At Me Films,"Comedy,Horror,Mystery,Thriller",96
589,24. Treasures from the Wreck of the Unbelievable,2017,The fake story of the art found beneath the de...,6.0,,"Damien Hirst,Piotr Baumann,Andrew Whipp",,Sam Hobkinson,"Netflix,Park Circus","Science,The Oxford Film Company",Documentary,90


In [6]:
df.to_csv("raw2.csv")
print("DONE")

DONE
