## Project Outline:
    
- we are going to scrape: https://www.imdb.com/search/title/?groups=top_250&sort=user_rating
- we will get the lists of top 250 movies according to Imdb. 
- For each movie we will grab movie_name , year , time , genre , rating , metascore

## Importing the Libraries.

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Web Scrapping 

In [2]:
# Empty list , these lists will contain data which will be scrapped
movie_name = []
year = []
time = []
genre = []            
rating = []
metascore = []
dir_name = []

# Scraping the first 5 pages
pages = [0, 5, 10, 15, 20]

def movies_data():
    # Loop through multiple pages
    for page_number in pages:  
        url = f"https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start={page_number}1&ref_=adv_prv" 
        response = requests.get(url)  # Send an HTTP GET request to the URL


        if response.status_code == 200:  # Check if the request was successful  
            html_cont = response.content   # Contains data of the page
            soup = BeautifulSoup(html_cont , "html.parser")  # Parse the HTML content of the page

            # scrapping movies name
            movie_tags_1 = soup.find_all("h3" , {"class" : "lister-item-header"})
            for i in movie_tags_1: 
                movie_name.append(i.a.text)

            # scrapping year 
            year_tag_1 = soup.find_all("span" , {"class" :"lister-item-year text-muted unbold" })
            for i in year_tag_1:
                year.append(i.text.replace("("  ," ").replace(")" ,"").replace("I  ",""))

            # scrapping time (duration)
            time_tag_1 =soup.find_all("span", {"class":"runtime"})
            for i in time_tag_1:
                time.append(int(i.text.split()[0]))  

            # scrapping genre
            genre_tag_1 =soup.find_all("span", {"class":"genre"})
            for i in genre_tag_1:
                genre.append(i.text.strip().split(",")[0])

            # scrapping ratings of the movies
            rating_tag_1 = soup.find_all("div" , {"class": "inline-block ratings-imdb-rating"})
            for i in rating_tag_1:
                rating.append(float(i.text.strip()))

            # scrapping metascore of the movies
            metascore_tag_1 = soup.find_all("div" , {"class": "inline-block ratings-metascore"})
            for i in metascore_tag_1:
                metascore.append(int(i.span.text))

            # scrapping director name
            dir_name_1 = soup.find_all("div", {"class":"lister-item-content"})
            for i in range(0,len(dir_name_1)):
                kk = dir_name_1[i].find("p",{"class":""}).text.strip().split("\n")[1]
                dir_name.append(kk)
                
movies_data()                

In [3]:
# index which doesn't have metascore
missing_data = [14,60,81,111,115,120,121,167,168,169,170,215,218,237,238,243,244]
for i in missing_data:
    metascore.insert(i , np.nan) 

In [4]:
# Creating a dataframe 
import pandas as pd
df = pd.DataFrame({"Sno.":np.arange(1 , 251),"Movie" : movie_name , "Year":year , "Genre" : genre ,"Rating" : rating , "Time": time , "Metascore":metascore, "Director" : dir_name})
df

Unnamed: 0,Sno.,Movie,Year,Genre,Rating,Time,Metascore,Director
0,1,The Shawshank Redemption,1994,Drama,9.3,142,82.0,Frank Darabont
1,2,The Godfather,1972,Crime,9.2,175,100.0,Francis Ford Coppola
2,3,The Dark Knight,2008,Action,9.0,152,84.0,Christopher Nolan
3,4,Schindler's List,1993,Biography,9.0,195,95.0,Steven Spielberg
4,5,The Lord of the Rings: The Return of the King,2003,Action,9.0,201,94.0,Peter Jackson
...,...,...,...,...,...,...,...,...
245,246,Dances with Wolves,1990,Adventure,8.0,181,72.0,Kevin Costner
246,247,The Incredibles,2004,Animation,8.0,115,90.0,Brad Bird
247,248,Groundhog Day,1993,Comedy,8.0,101,72.0,Harold Ramis
248,249,Aladdin,1992,Animation,8.0,90,86.0,"Ron Clements,"


## Saving Pandas DataFrame as a CSV file.

In [5]:
df.to_csv('Movies_data.csv',index=False )

## Reading CSV file

In [6]:
read_csv = pd.read_csv("Movies_data.csv")
read_csv

Unnamed: 0,Sno.,Movie,Year,Genre,Rating,Time,Metascore,Director
0,1,The Shawshank Redemption,1994,Drama,9.3,142,82.0,Frank Darabont
1,2,The Godfather,1972,Crime,9.2,175,100.0,Francis Ford Coppola
2,3,The Dark Knight,2008,Action,9.0,152,84.0,Christopher Nolan
3,4,Schindler's List,1993,Biography,9.0,195,95.0,Steven Spielberg
4,5,The Lord of the Rings: The Return of the King,2003,Action,9.0,201,94.0,Peter Jackson
...,...,...,...,...,...,...,...,...
245,246,Dances with Wolves,1990,Adventure,8.0,181,72.0,Kevin Costner
246,247,The Incredibles,2004,Animation,8.0,115,90.0,Brad Bird
247,248,Groundhog Day,1993,Comedy,8.0,101,72.0,Harold Ramis
248,249,Aladdin,1992,Animation,8.0,90,86.0,"Ron Clements,"


## Setting "Sno." as index

In [7]:
read_csv.set_index("Sno.")

Unnamed: 0_level_0,Movie,Year,Genre,Rating,Time,Metascore,Director
Sno.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,The Shawshank Redemption,1994,Drama,9.3,142,82.0,Frank Darabont
2,The Godfather,1972,Crime,9.2,175,100.0,Francis Ford Coppola
3,The Dark Knight,2008,Action,9.0,152,84.0,Christopher Nolan
4,Schindler's List,1993,Biography,9.0,195,95.0,Steven Spielberg
5,The Lord of the Rings: The Return of the King,2003,Action,9.0,201,94.0,Peter Jackson
...,...,...,...,...,...,...,...
246,Dances with Wolves,1990,Adventure,8.0,181,72.0,Kevin Costner
247,The Incredibles,2004,Animation,8.0,115,90.0,Brad Bird
248,Groundhog Day,1993,Comedy,8.0,101,72.0,Harold Ramis
249,Aladdin,1992,Animation,8.0,90,86.0,"Ron Clements,"
