# Importing required Python libraries

In [45]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

# Setting Headers for web scraping

In [55]:
# To avoid a '403 Forbidden' response, send a browser-like User-Agent header with the request
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Fetching webpage content using requests.
# - timeout: without it requests waits indefinitely on a stalled connection.
# - raise_for_status(): fail loudly on an HTTP error (e.g. 403) instead of
#   silently handing an error page to the parser and scraping nothing.
response = requests.get('https://www.imdb.com/chart/top/', headers=headers, timeout=30)
response.raise_for_status()
webpage = response.text

# Parse HTML content

In [None]:
# Build a navigable parse tree from the downloaded HTML, using the lxml parser
soup = BeautifulSoup(markup=webpage, features='lxml')

In [48]:
# Sanity check: display the text of the first <h1> found on the page
page_headings = soup.find_all('h1')
page_headings[0].text

'IMDb Top 250 Movies'

# Extracting content of each movie container

In [49]:
# Collect one <div> per chart entry, matched on its full class string.
# NOTE(review): the 'sc-…'/'QxXCO' parts look auto-generated and may change
# when the site is redeployed — confirm the selector if this returns nothing.
movie_container_class = 'sc-e2db8066-1 QxXCO cli-parent li-compact'
movie = soup.find_all('div', class_=movie_container_class)

In [50]:
# One list per output column; every loop iteration appends exactly one value
# to each list so that all columns end up with the same length.
Name = []
User_rating = []
Year = []
MPA_rating = []
Duration = []
Rank = []

for i in movie:
    # Extracting rank and name from a heading of the form "<rank>. <title>"
    title_tag = i.find('h3')
    full_name = title_tag.text.strip() if title_tag else ''
    rank_match = re.match(r'(\d+)\.\s*(.*)', full_name)
    if rank_match:
        Rank.append(rank_match.group(1))
        Name.append(rank_match.group(2))
    else:
        # Without this fallback a heading lacking the "<rank>. " prefix would
        # leave Rank/Name shorter than the other lists, and pd.DataFrame
        # would raise "All arrays must be of the same length".
        Rank.append('N/A')
        Name.append(full_name if full_name else 'N/A')

    # Extracting user rating (falls back to 'N/A' when the span is absent)
    rating_tag = i.find('span', class_='ipc-rating-star--rating')
    User_rating.append(rating_tag.text.strip() if rating_tag else 'N/A')

    # Extracting metadata (year, duration, MPA rating); the items are read by
    # position, and any missing trailing item is recorded as 'N/A'
    metadata = i.find_all('span', class_='sc-d5ea4b9d-7 URyjV cli-title-metadata-item')
    Year.append(metadata[0].text.strip() if len(metadata) > 0 else 'N/A')
    Duration.append(metadata[1].text.strip() if len(metadata) > 1 else 'N/A')
    MPA_rating.append(metadata[2].text.strip() if len(metadata) > 2 else 'N/A')

# Creating a DataFrame to store the extracted data (all values are strings)
d = {
    'Rank': Rank,
    'Name': Name,
    'User_rating': User_rating,
    'Year': Year,
    'Duration': Duration,
    'MPA_rating': MPA_rating
}
df = pd.DataFrame(d)


In [51]:
# Spot-check the first chart entry: print its first three metadata items
first_movie_metadata = movie[0].find_all('span', class_='sc-d5ea4b9d-7 URyjV cli-title-metadata-item')
for position in range(3):
    print(first_movie_metadata[position].text.strip())

1994
2h 22m
A


In [52]:
# Display the scraped table as this cell's output
df

Unnamed: 0,Rank,Name,User_rating,Year,Duration,MPA_rating
0,1,The Shawshank Redemption,9.3,1994,2h 22m,A
1,2,The Godfather,9.2,1972,2h 55m,A
2,3,The Dark Knight,9.0,2008,2h 32m,UA
3,4,The Godfather: Part II,9.0,1974,3h 22m,A
4,5,12 Angry Men,9.0,1957,1h 36m,U
5,6,The Lord of the Rings: The Return of the King,9.0,2003,3h 21m,U
6,7,Schindler's List,9.0,1993,3h 15m,A
7,8,Pulp Fiction,8.9,1994,2h 34m,A
8,9,The Lord of the Rings: The Fellowship of the Ring,8.9,2001,2h 58m,U
9,10,"Il Buono, Il Brutto, Il Cattivo",8.8,1966,2h 41m,A


Here, the values in the `MPA_rating` column map as follows:
1. A means "R" rated
2. UA means "PG-13"
3. U means "Approved" or "Not Rated"


# Saving the DataFrame as a CSV dataset for later use in EDA

In [53]:
# Write the table to disk without the DataFrame's row index.
# NOTE(review): the file name says "Top_25" although the scraped page is the
# "IMDb Top 250 Movies" chart; the name is kept as-is so that anything already
# reading this file keeps working — confirm before renaming.
output_csv_path = 'IMDB_Top_25_Movies.csv'
df.to_csv(output_csv_path, index=False)