In [80]:
import requests
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import openpyxl

In [76]:
# Send a browser-like User-Agent header to avoid being blocked by the website
headers = {'User-Agent': 'Mozilla/5.0'}

# Download the Top 250 chart page and keep the response object.
# - timeout: requests waits indefinitely by default, so without it this cell
#   would hang forever if the server never answers.
# - raise_for_status(): stop here on a 4xx/5xx reply instead of letting the
#   following cells silently parse an error page.
response = requests.get('https://www.imdb.com/chart/top/', headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Create a BeautifulSoup object to parse the HTML content of the page
soup = BeautifulSoup(response.text, 'lxml')

# Show only the beginning of the prettified document as a sanity check.
# Printing the whole page floods the notebook output and bloats the saved file;
# raise the slice bound if more of the markup needs to be inspected.
print(soup.prettify()[:2000])

In [78]:
# Initialize a dictionary to store the movie data (one list per output column)
movie_dict = {'Ranking':[], 'Name':[], 'Year':[], 'Duration':[], 'Point':[]}

# Find all the list items that contain the movie information
# 'cli-parent' is the class associated with the list items that contain the movies
movie_list = soup.find_all('li', class_ = 'cli-parent')

# Optional: You can print the length of 'movie_list' to check if all 250 movies are selected
# print(len(movie_list))

# Loop through each movie in the list and extract the required details
for position, movie in enumerate(movie_list, start=1):
    # The heading is expected to look like '12. Movie Name'
    heading = movie.find('h3', class_='ipc-title__text').text

    # Split on the FIRST period only: titles may contain periods themselves
    # (e.g. 'Dr. Strangelove'), and splitting on every period would cut them off.
    prefix, separator, remainder = heading.partition('.')
    if separator and prefix.strip().isdigit():
        ranking, name = prefix, remainder.strip()
    else:
        # No numeric prefix in the heading: keep the whole text as the name and
        # fall back to the item's position in the list.
        # NOTE(review): assumes the list order matches the chart order - confirm.
        ranking, name = str(position), heading.strip()

    # Metadata items: the first is read as the release year, the second as the duration.
    # Look them up once and store None for any item that is missing.
    metadata = movie.find_all('span', class_='cli-title-metadata-item')
    year = metadata[0].text if len(metadata) > 0 else None
    duration = metadata[1].text if len(metadata) > 1 else None

    # IMDb rating of the movie; None when the rating element is absent
    rating_tag = movie.find('span', class_='ipc-rating-star--rating')
    point = rating_tag.text if rating_tag is not None else None

    # Append all five values together so every column keeps the same length
    movie_dict['Ranking'].append(ranking)
    movie_dict['Name'].append(name)
    movie_dict['Year'].append(year)
    movie_dict['Duration'].append(duration)
    movie_dict['Point'].append(point)

In [79]:
# Build a tabular view of the scraped data: every key of 'movie_dict'
# becomes a column of the DataFrame
df = pd.DataFrame(data=movie_dict)

# Write the table to the Excel file 'Top250.xlsx'.
# The DataFrame's index is left out ('index=False') so it is not saved
# as an extra column in the sheet
df.to_excel(excel_writer='Top250.xlsx', index=False)