In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json

def getMovieDetails(imdbID):
    data = {}

    movie_url = "https://www.imdb.com/title/" + imdbID
    r = requests.get(headers={'User-Agent': 'Mozilla/5.0'}, url=movie_url)

    # Create a BeautifulSoup object
    soup = BeautifulSoup(r.text, 'html.parser')
    jsonData = soup.find('script', {"type": "application/ld+json"})

    # If no JSON data is found, return None
    if not jsonData:
        return None

    Moredata = []
    try:
        jsonSourceObj = json.loads(jsonData.string)
        Moredata.append(jsonSourceObj)
        data["expanded"] = Moredata
    except Exception as e:
        print(f"Error parsing JSON for {imdbID}: {e}")
        return None

    # Budget
    budget_element = soup.find('li', {'data-testid': 'title-boxoffice-budget'})
    if budget_element:
        budget_text = budget_element.text
        match = re.search(r'\d+', budget_text)
        if match:
            data['budget'] = int(match.group(0))

    # Directors, Writers, Editors, etc.
    full_credits = soup.find('li', class_='ipc-metadata-list__item')
    if full_credits:
        for item in full_credits.findAll('ul', class_='ipc-metadata-list ipc-metadata-list--dividers-all sc-afe43def-4 kdXikI'):
            for sub_item in item.findAll('li'):
                job = sub_item.find('a', class_='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link')
                if job:
                    data[job.text] = sub_item.text

    # imdbId
    data["imdbID"] = imdbID

    # Page title
    title = soup.find('title')
    if title:
        data["title"] = title.string
    else:
        data["title"] = ""

    # RunTime and other attributes
    data["Minutes"] = jsonSourceObj.get('duration', "")

    # Rating
    data["ratingValue"] = jsonSourceObj.get('aggregateRating', {}).get('ratingValue', "")
    data["ratingCount"] = jsonSourceObj.get('aggregateRating', {}).get('ratingCount', "")
    data["summary_text"] = jsonSourceObj.get('description', "")
    data['keywords'] = jsonSourceObj.get('keywords', "")

    return data


In [None]:
import logging
import re

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def scrapIMDB(ImdbId):
    if not ImdbId.startswith("tt"):
        raise ValueError("Invalid IMDB ID format.")

    data = {"imdbId": ImdbId}
    try:
        movieDetails = getMovieDetails(ImdbId)
        expanded = movieDetails.get("expanded", [{}])[0]

        data["name"] = expanded.get("name", "")
        data["genre"] = expanded.get("genre", [])
        if not isinstance(data["genre"], list):
            data["genre"] = [data["genre"]]

        data["datePublished"] = expanded.get("datePublished", "")
        data["contentRating"] = expanded.get("contentRating", "")
        data["keywords"] = expanded.get("keywords", "")
        data["ratingValue"] = movieDetails.get("ratingValue", "")
        data["ratingCount"] = movieDetails.get("ratingCount", "")
        data["budget"] = movieDetails.get("budget", "")
        data["Minutes"] = movieDetails.get("Minutes", "")

        def extract_persons(field):
            persons = expanded.get(field, [])
            if not isinstance(persons, list):
                persons = [persons]
            return [{"name": person.get("name", ""), "id": extract_id(person.get("url", ""))} for person in persons]

        def extract_id(url):
            match = re.search(r"/name/(nm\d+)/", url)
            return match.group(1) if match else ""

        data["actors"] = extract_persons("actor")
        data["directors"] = extract_persons("director")
        data["creators"] = extract_persons("creator")

    except Exception as e:
        logging.error(f"Error processing IMDB ID {ImdbId}: {e}")
        raise

    return data


In [None]:
# endgame


id = "tt4154796"
try:
    data = scrapIMDB(id)
    print(data)
except Exception as e:
    print(f"Failed to scrape data: {e}")


{'imdbId': 'tt4154796', 'name': 'Avengers: Endgame', 'genre': ['Action', 'Adventure', 'Drama'], 'datePublished': '2019-04-26', 'contentRating': 'PG-13', 'keywords': 'time travel,superhero,super villain,cosmic,marvel cinematic universe', 'ratingValue': 8.4, 'ratingCount': 1328302, 'budget': 356, 'Minutes': 'PT3H1M', 'actors': [{'name': 'Robert Downey Jr.', 'id': 'nm0000375'}, {'name': 'Chris Evans', 'id': 'nm0262635'}, {'name': 'Mark Ruffalo', 'id': 'nm0749263'}], 'directors': [{'name': 'Anthony Russo', 'id': 'nm0751577'}, {'name': 'Joe Russo', 'id': 'nm0751648'}], 'creators': [{'name': '', 'id': ''}, {'name': 'Christopher Markus', 'id': 'nm1321655'}, {'name': 'Stephen McFeely', 'id': 'nm1321656'}, {'name': 'Stan Lee', 'id': 'nm0498278'}]}


In [None]:
# rush hour tt0120812


id = "tt0120812"
try:
    data = scrapIMDB(id)
    print(data)
except Exception as e:
    print(f"Failed to scrape data: {e}")


{'imdbId': 'tt0120812', 'name': 'Rush Hour', 'genre': ['Action', 'Comedy', 'Crime'], 'datePublished': '1998-09-18', 'contentRating': 'PG-13', 'keywords': 'child kidnapping,chinese mafia,gun,mobster,organized crime', 'ratingValue': 7, 'ratingCount': 306205, 'budget': 33, 'Minutes': 'PT1H38M', 'actors': [{'name': 'Jackie Chan', 'id': 'nm0000329'}, {'name': 'Chris Tucker', 'id': 'nm0000676'}, {'name': 'Ken Leung', 'id': 'nm0504962'}], 'directors': [{'name': 'Brett Ratner', 'id': 'nm0711840'}], 'creators': [{'name': '', 'id': ''}, {'name': '', 'id': ''}, {'name': 'Ross LaManna', 'id': 'nm0482780'}, {'name': 'Jim Kouf', 'id': 'nm0467942'}]}


In [None]:
#not found movie:

# rush hour tt1000002


id = "tt1000002"
try:
    data = scrapIMDB(id)
    print(data)
except Exception as e:
    print(f"Failed to scrape data: {e}")


ERROR:root:Error processing IMDB ID tt1000002: 'NoneType' object has no attribute 'get'


Failed to scrape data: 'NoneType' object has no attribute 'get'


In [None]:
import time
import pandas as pd
import logging


# Set up logging to force output in Colab
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Add a StreamHandler to ensure logging output is seen in Colab
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
logger.addHandler(stream_handler)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


In [None]:
import time
import pandas as pd
import logging




movies_data = []

for i in range(1151438, 1200000):
    movie_id = f"tt{i:07}"
    try:
        movie_data = scrapIMDB(movie_id)
        if movie_data:
            movies_data.append(movie_data)
        else:
            logging.info(f"Movie ID {movie_id} does not exist or data could not be fetched.")
    except Exception as e:
        logging.error(f"Error processing movie ID {movie_id}: {e}")

    time.sleep(0.6)



ERROR:root:Error processing IMDB ID tt1151464: 'NoneType' object has no attribute 'get'
Error processing IMDB ID tt1151464: 'NoneType' object has no attribute 'get'
ERROR:root:Error processing movie ID tt1151464: 'NoneType' object has no attribute 'get'
Error processing movie ID tt1151464: 'NoneType' object has no attribute 'get'
ERROR:root:Error processing IMDB ID tt1151483: 'NoneType' object has no attribute 'get'
Error processing IMDB ID tt1151483: 'NoneType' object has no attribute 'get'
ERROR:root:Error processing movie ID tt1151483: 'NoneType' object has no attribute 'get'
Error processing movie ID tt1151483: 'NoneType' object has no attribute 'get'
ERROR:root:Error processing IMDB ID tt1151506: 'NoneType' object has no attribute 'get'
Error processing IMDB ID tt1151506: 'NoneType' object has no attribute 'get'
ERROR:root:Error processing movie ID tt1151506: 'NoneType' object has no attribute 'get'
Error processing movie ID tt1151506: 'NoneType' object has no attribute 'get'
ERRO

In [None]:
from google.colab import files


df = pd.DataFrame(movies_data)


csv_file = "movie_data.csv"
df.to_csv(csv_file, sep=';', index=False)


files.download(csv_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>