In [2]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [3]:
# Base address of the movie listing; other cells append page queries and
# movie paths to this string.
url = 'https://www.themoviedb.org/movie'

# Headers passed to every requests.get call in this notebook. The value is a
# desktop Chrome User-Agent string (presumably so the site serves its regular
# HTML to the scraper -- confirm if the markup ever looks different).
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    ),
}

In [4]:
def get_all_pages(num_pages=50, base_url=None):
    """Build the URLs of the paginated movie listing.

    Parameters
    ----------
    num_pages : int, optional
        Number of listing pages to generate, numbered from 1. Defaults to
        50, the page count this function always produced before.
    base_url : str, optional
        Address the ``?page=N`` query is appended to. When omitted, the
        notebook-level ``url`` is used.

    Returns
    -------
    list of str
        One URL per page, in ascending page order. Empty when
        ``num_pages`` is smaller than 1.
    """
    if base_url is None:
        # Fall back to the notebook-level setting so existing calls keep working.
        base_url = url
    return [f"{base_url}?page={page_number}" for page_number in range(1, num_pages + 1)]

In [None]:
# Listing-page URLs that the scraping loop further down iterates over.
pages = get_all_pages()

In [6]:
def get_movie_cards(page, timeout=30):
    """Download one listing page and return the movie cards it contains.

    Parameters
    ----------
    page : str
        URL of the listing page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely on an unresponsive connection.

    Returns
    -------
    list
        Every ``<div class="card style_1">`` element found on the page.
        An empty list is returned (and a message printed) when the page
        cannot be downloaded or the server answers with an error status,
        so a single bad page does not abort a long scraping run.
    """
    try:
        response = requests.get(page, headers=header, timeout=timeout)
        # An error page (404, 429, 5xx ...) contains no cards worth parsing.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Error fetching {page}: {exc}")
        return []

    listing_soup = BeautifulSoup(response.text, 'lxml')
    return listing_soup.find_all('div', class_='card style_1')

In [7]:
def fetch_movie_urls(item, base_url=None):
    """Return the absolute URL of the movie page a listing card links to.

    The link is read from the ``href`` attribute of the card's first ``<a>``
    element and resolved against the base URL. Reading the attribute works
    for ids of any length and does not depend on the order or spelling of
    the other attributes in the tag.

    Parameters
    ----------
    item : object
        A listing card exposing ``find('a')`` (a BeautifulSoup tag in this
        notebook).
    base_url : str, optional
        URL that a relative ``href`` is resolved against. When omitted, the
        notebook-level ``url`` is used.

    Returns
    -------
    str or None
        The absolute movie URL, or ``None`` when the card has no ``<a>``
        element or the element has no ``href``.
    """
    anchor = item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return None
    if base_url is None:
        base_url = url
    # urljoin resolves site-relative paths such as "/movie/278" against the host.
    return urljoin(base_url, href)

In [8]:
def get_movie_names(movie_head):
    """Extract the movie title from the header section of a movie page.

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find`` (a BeautifulSoup tag in
        this notebook). ``None`` is accepted.

    Returns
    -------
    str or None
        Text of the ``<a>`` inside the first ``<h2>``, or ``None`` when the
        section or either element is missing.
    """
    try:
        return movie_head.find('h2').find('a').text
    except AttributeError:
        # movie_head is None, or <h2> / <a> was not found (find returned None).
        # Only this case is swallowed so that Ctrl-C and real bugs still surface.
        return None

In [9]:
def get_release_dates(movie_head):
    """Extract the release date from the header section of a movie page.

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find``. ``None`` is accepted.

    Returns
    -------
    str or None
        The first space-separated token of the ``<span class="release">``
        text after newlines and surrounding whitespace are removed, or
        ``None`` when the section or the span is missing.
    """
    try:
        release_text = movie_head.find('span', class_='release').text
    except AttributeError:
        # movie_head is None or the span was not found.
        return None
    # Keep only the leading token; anything after the first space is dropped.
    return release_text.replace('\n', '').strip().split(" ")[0]

In [10]:
def get_ratings(movie_head):
    """Extract the user score from the header section of a movie page.

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find``. ``None`` is accepted.

    Returns
    -------
    object or None
        The raw value of the ``data-percent`` attribute on the
        ``<div class="user_score_chart">`` element (returned unconverted),
        or ``None`` when the section, the element or the attribute is
        missing.
    """
    try:
        return movie_head.find('div', class_='user_score_chart')["data-percent"]
    except (AttributeError, KeyError, TypeError):
        # AttributeError: movie_head is None.
        # TypeError: the chart element was not found (None is not subscriptable).
        # KeyError: the element exists but has no data-percent attribute.
        return None

In [11]:
def get_directors(movie_head):
    """Extract the director names from the header section of a movie page.

    Each ``<li class="profile">`` inside ``<ol class="people no_image">`` is
    inspected: the text of its second ``<p>`` is treated as the role, and the
    text of its first ``<a>`` as the person's name. A person is kept when the
    role text contains ``'Director'`` (substring match, so a combined role
    such as "Director, Writer" also qualifies).

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find``. ``None`` is accepted.

    Returns
    -------
    str or None
        Comma-separated director names in page order, an empty string when
        no listed person is a director, or ``None`` when the section or the
        people list is missing.
    """
    try:
        people = movie_head.find('ol', class_='people no_image')
        profiles = people.find_all('li', class_='profile')
    except AttributeError:
        # movie_head is None or the people list was not found.
        return None

    directors = []
    for profile in profiles:
        paragraphs = profile.find_all('p')
        name_link = profile.find('a')
        # Skip an incomplete entry instead of discarding the directors
        # already collected from the well-formed ones.
        if len(paragraphs) < 2 or name_link is None:
            continue
        if 'Director' in str(paragraphs[1].text):
            directors.append(name_link.text)

    return ','.join(directors)

In [12]:
def get_movie_durations(movie_head):
    """Extract the runtime text from the header section of a movie page.

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find``. ``None`` is accepted.

    Returns
    -------
    str or None
        Text of the ``<span class="runtime">`` element with newlines removed
        and surrounding whitespace stripped (returned as text, not parsed
        into a number), or ``None`` when the section or the span is missing.
    """
    try:
        runtime_text = movie_head.find('span', class_='runtime').text
    except AttributeError:
        # movie_head is None or the span was not found.
        return None
    return runtime_text.replace('\n', '').strip()

In [13]:
def get_movie_genres(movie_head):
    """Extract the genre text from the header section of a movie page.

    Parameters
    ----------
    movie_head : object or None
        The header section; must expose ``find``. ``None`` is accepted.

    Returns
    -------
    str or None
        Text of the ``<span class="genres">`` element with newlines and
        non-breaking spaces (``\\xa0``) removed, or ``None`` when the section
        or the span is missing. Ordinary spaces are left untouched.
    """
    try:
        genre_text = movie_head.find('span', class_='genres').text
    except AttributeError:
        # movie_head is None or the span was not found.
        return None
    return genre_text.replace('\n', '').replace('\xa0', '')

In [None]:
# Scrape every movie linked from every listing page into a list of records.
# This cell issues one HTTP request per listing page plus one per movie, so
# it is by far the slowest cell in the notebook.
REQUEST_TIMEOUT = 30  # seconds to wait on one request before giving up

final_data = []

for page in pages:
    try:
        all_cards = get_movie_cards(page)
    except Exception as e:
        # One unreachable listing page must not abort the whole run.
        print(f"Error processing {page}: {e}")
        continue

    for card in all_cards:
        # Defined before the try so the error message can always reference it.
        movie_url = None

        try:
            movie_url = fetch_movie_urls(card)

            response = requests.get(movie_url, headers=header, timeout=REQUEST_TIMEOUT)
            # Error responses (404, 429, 5xx ...) carry no movie data to parse.
            response.raise_for_status()
            movie_soup = BeautifulSoup(response.text, 'lxml')

            movie_head = movie_soup.find('section', id='original_header')
            if movie_head is None:
                # Without the header section every field would come back as
                # None, so report the page instead of storing an empty row.
                print(f"Error processing {movie_url}: header section not found")
                continue

            final_data.append({
                'Movie Name': get_movie_names(movie_head),
                'Release Date': get_release_dates(movie_head),
                'Rating': get_ratings(movie_head),
                'Director': get_directors(movie_head),
                'Duration': get_movie_durations(movie_head),
                'Genre': get_movie_genres(movie_head),
            })
        except Exception as e:
            # Best effort: log the failing movie and carry on with the next card.
            print(f"Error processing {movie_url}: {e}")


In [15]:
import pandas as pd

In [16]:
def save_to_excel(final_data, path='themoviedb.xlsx'):
    """Write the scraped records to an Excel workbook.

    Parameters
    ----------
    final_data : list of dict
        Records to export, as accepted by ``pd.DataFrame``; in this notebook
        each record is one movie and its keys become the column names.
    path : str, optional
        Destination file. Defaults to ``'themoviedb.xlsx'`` in the current
        working directory, the location this function always wrote to before.

    Notes
    -----
    ``to_excel`` is called with its defaults, so the DataFrame's integer row
    index is written as the first column.
    """
    movies = pd.DataFrame(final_data)
    movies.to_excel(path)

In [17]:
# Export the collected movie records to 'themoviedb.xlsx'.
save_to_excel(final_data)