In [4]:
import bs4
import requests
import time
import random as ran
import sys
import pandas as pd

In [5]:
def scrape_movies_data(data_name):
    """Scrapes movie data from IMDb's Top 150 movies page and saves it to a CSV file.
        Returns:
        dataframe: a table containe all information about top ranked movies"""
    
    step=0
    
    movie_list = []
    while step<=250:
        url=f"https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start={step}&ref_=adv_nxt"
        resp = requests.get(url)
        content = bs4.BeautifulSoup(resp.content, 'html.parser')


        for movie in content.select('.lister-item-content'):
            
            try:

                data ={
                "title":movie.find('a').get_text().strip(),
                "year":movie.select('.lister-item-year')[0].get_text().strip()[1:-1],
                "time(min)":int(movie.select('.runtime')[0].get_text().strip()[0:-3]),
                "genre":movie.select('.genre')[0].get_text().strip(),
                "rating":float(movie.select('.ratings-imdb-rating')[0].get_text().strip()),
                "metascore":int(movie.select('.ratings-metascore')[0].get_text().strip()[0:4]),
                "simple_desc":movie.select('.text-muted')[2].get_text().strip(),
                "votes":int(movie.select('.sort-num_votes-visible')[0].findAll('span')[1].get_text().strip().replace(",", "")),
                "Gross (M$)":float(movie.select('.sort-num_votes-visible')[0].findAll('span')[4].get_text().strip()[1:-1])



            }
            except IndexError:
                continue

            movie_list.append(data)
        step+=50
        
    dataframe = pd.DataFrame(movie_list)
    dataframe.to_csv(data_name)



In [7]:
# Calling the function to store the data in csv file for any future use 
scrape_movies_data("movies.csv")
data= pd.read_csv('movies.csv')

In [8]:
data

Unnamed: 0.1,Unnamed: 0,title,year,time(min),genre,rating,metascore,simple_desc,votes,Gross (M$)
0,0,The Shawshank Redemption,1994,142,Drama,9.3,82,"Over the course of several years, two convicts...",2744211,28.34
1,1,The Godfather,1972,175,"Crime, Drama",9.2,100,"Don Vito Corleone, head of a mafia family, dec...",1908366,134.97
2,2,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84,When the menace known as the Joker wreaks havo...,2716942,534.86
3,3,Schindler's List,1993,195,"Biography, Drama, History",9.0,95,"In German-occupied Poland during World War II,...",1384514,96.90
4,4,12 Angry Men,1957,96,"Crime, Drama",9.0,97,The jury in a New York City murder trial is fr...,812432,4.36
...,...,...,...,...,...,...,...,...,...,...
228,228,The Incredibles,2004,115,"Animation, Action, Adventure",8.0,90,"While trying to lead a quiet suburban life, a ...",760911,261.44
229,229,Dances with Wolves,1990,181,"Adventure, Drama, Western",8.0,72,"Lieutenant John Dunbar, assigned to a remote w...",276365,184.21
230,230,Aladdin,1992,90,"Animation, Adventure, Comedy",8.0,86,A kind-hearted street urchin and a power-hungr...,436409,217.35
231,231,Groundhog Day,1993,101,"Comedy, Drama, Fantasy",8.0,72,"A narcissistic, self-centered weatherman finds...",650252,70.91


In [9]:
# Check if there are any missing values
print(data.isnull().sum())


Unnamed: 0     0
title          0
year           0
time(min)      0
genre          0
rating         0
metascore      0
simple_desc    0
votes          0
Gross (M$)     0
dtype: int64


In [10]:
# Summary statistics
print(data.describe())



       Unnamed: 0   time(min)      rating   metascore         votes  \
count  233.000000  233.000000  233.000000  233.000000  2.330000e+02   
mean   116.000000  130.326180    8.308584   82.665236  6.947200e+05   
std     67.405489   28.932532    0.233811   11.099062  5.288226e+05   
min      0.000000   80.000000    8.000000   55.000000  3.962600e+04   
25%     58.000000  110.000000    8.100000   75.000000  2.626860e+05   
50%    116.000000  127.000000    8.200000   84.000000  5.929890e+05   
75%    174.000000  146.000000    8.400000   91.000000  1.010706e+06   
max    232.000000  238.000000    9.300000  100.000000  2.744211e+06   

       Gross (M$)  
count  233.000000  
mean    97.576867  
std    140.389545  
min      0.010000  
25%      8.180000  
50%     36.760000  
75%    138.430000  
max    858.370000  


In [12]:

# Count of movies by genre
genre_counts = data['genre'].value_counts()
genre_counts


Drama                           17
Crime, Drama                    12
Animation, Adventure, Comedy    10
Biography, Drama, History        8
Drama, War                       8
                                ..
Drama, Sci-Fi                    1
Crime, Drama, Film-Noir          1
Action, Adventure, Comedy        1
Biography, Comedy, Crime         1
Comedy                           1
Name: genre, Length: 96, dtype: int64