In [35]:
import requests
from bs4 import BeautifulSoup
import pandas

urls = ['https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc',
        'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc&start=51&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc&start=101&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc&start=151&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc&start=201&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=user_rating,desc&start=251&ref_=adv_nxt']

name_col = []
year_col = []

genre_col = []
imdb_rating_col = []

director_col = []


for url in urls:
    response = requests.get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

    for container in movie_containers:
        name = container.h3.a.text
        name_col.append(name)

        year = container.h3.find('span', class_='lister-item-year').text
        year_col.append(year)



        genre = container.find('span', class_='genre').text
        genre_col.append(genre)

        imdb_rating = float(container.strong.text)
        imdb_rating_col.append(imdb_rating)


        director = container.find('p', class_='').find_all('a')[0].text
        director_col.append(director)


movie_dict = {'name': name_col,
              'year': year_col,
              'rating': imdb_rating_col,
              'genre': genre_col,
              'director': director_col
              }

df = pandas.DataFrame(movie_dict)
df.to_csv('feature.csv', index=True)
print('data saved')

data saved


In [36]:
df

Unnamed: 0,name,year,rating,genre,director
0,Spider-Man: Across the Spider-Verse,(2023),8.7,"\nAnimation, Action, Adventure",Joaquim Dos Santos
1,Sen to Chihiro no kamikakushi,(2001),8.6,"\nAnimation, Adventure, Family",Hayao Miyazaki
2,The Lion King,(1994),8.5,"\nAnimation, Adventure, Drama",Roger Allers
3,Hotaru no haka,(1988),8.5,"\nAnimation, Drama, War",Isao Takahata
4,Spider-Man: Into the Spider-Verse,(2018),8.4,"\nAnimation, Action, Adventure",Bob Persichetti
...,...,...,...,...,...
295,Trolls World Tour,(2020),6.1,"\nAnimation, Adventure, Comedy",Walt Dohrn
296,Lightyear,(2022),6.1,"\nAnimation, Action, Adventure",Angus MacLane
297,Bee Movie,(2007),6.1,"\nAnimation, Adventure, Comedy",Simon J. Smith
298,Shrek the Third,(2007),6.1,"\nAnimation, Adventure, Comedy",Chris Miller


In [37]:
df = df.sort_values(by='year')

# Reset the index after sorting
df = df.reset_index(drop=True)


In [38]:
df

Unnamed: 0,name,year,rating,genre,director
0,Snow White and the Seven Dwarfs,(1937),7.6,"\nAnimation, Adventure, Family",William Cottrell
1,Pinocchio,(1940),7.5,"\nAnimation, Adventure, Comedy",Norman Ferguson
2,Fantasia,(1940),7.7,"\nAnimation, Family, Fantasy",James Algar
3,Dumbo,(1941),7.2,"\nAnimation, Adventure, Drama",Samuel Armstrong
4,Bambi,(1942),7.3,"\nAnimation, Adventure, Drama",James Algar
...,...,...,...,...,...
295,Moana,(I) (2016),7.6,"\nAnimation, Adventure, Comedy",Ron Clements
296,Coco,(I) (2017),8.4,"\nAnimation, Adventure, Drama",Lee Unkrich
297,Onward,(I) (2020),7.4,"\nAnimation, Action, Adventure",Dan Scanlon
298,Luck,(I) (2022),6.4,"\nAnimation, Adventure, Comedy",Peggy Holmes


In [41]:
df['year'] = df['year'].str.extract('(\d+)').astype(float)
# Drop rows where the 'year' value is greater than 2013 or less than 2003
df = df[(df['year'] <= 2013) & (df['year'] >= 2003)]

# Reset the index after dropping rows
df = df.reset_index(drop=True)

In [46]:
df.shape

(101, 5)

In [47]:
df

Unnamed: 0,name,year,rating,genre,director
0,Sinbad: Legend of the Seven Seas,2003.0,6.7,"\nAnimation, Adventure, Comedy",Patrick Gilmore
1,Tôkyô goddofâzâzu,2003.0,7.8,"\nAnimation, Adventure, Comedy",Satoshi Kon
2,Finding Nemo,2003.0,8.2,"\nAnimation, Adventure, Comedy",Andrew Stanton
3,Les triplettes de Belleville,2003.0,7.7,"\nAnimation, Adventure, Comedy",Sylvain Chomet
4,Brother Bear,2003.0,6.8,"\nAnimation, Adventure, Comedy",Aaron Blaise
...,...,...,...,...,...
96,Monsters University,2013.0,7.2,"\nAnimation, Adventure, Comedy",Dan Scanlon
97,Kaze tachinu,2013.0,7.7,"\nAnimation, Biography, Drama",Hayao Miyazaki
98,Innocence,2004.0,7.4,"\nAnimation, Drama, Mystery",Mamoru Oshii
99,9,2009.0,7.0,"\nAnimation, Action, Adventure",Shane Acker


In [50]:

# Function to calculate the average rating for the top-rated movies
def average_top_rated_rating(df):
    # Sort the DataFrame by 'rating' column in descending order
    sorted_df = df.sort_values(by='rating', ascending=False)
    
    # Calculate the average rating for the top-rated movies (e.g., top 10%)
    top_rated_avg_rating = sorted_df['rating'].head(int(len(sorted_df) * 0.10)).mean()
    
    return top_rated_avg_rating

# Function to find the most common genre among the top-rated movies
def most_common_genre(df):
    # Sort the DataFrame by 'rating' column in descending order
    sorted_df = df.sort_values(by='rating', ascending=False)
    
    # Get the most common genre among the top-rated movies (e.g., top 10%)
    top_rated_genres = sorted_df['genre'].head(int(len(sorted_df) * 0.10))
    most_common_genre = top_rated_genres.mode().iloc[0]
    
    return most_common_genre

# Function to identify the director with the highest average IMDb rating
def director_with_highest_average_rating(df):
    # Group the DataFrame by 'director' and calculate the average rating for each director
    director_avg_ratings = df.groupby('director')['rating'].mean()
    
    # Find the director with the highest average rating
    highest_rated_director = director_avg_ratings.idxmax()
    
    return highest_rated_director

# Function to determine the year with the highest number of top-rated movies
def year_with_highest_number_of_top_rated_movies(df):
    # Sort the DataFrame by 'rating' column in descending order
    sorted_df = df.sort_values(by='rating', ascending=False)
    
    # Get the year with the highest number of top-rated movies (e.g., top 10%)
    top_rated_years = sorted_df['year'].head(int(len(sorted_df) * 0.10))
    year_with_highest_count = top_rated_years.mode().iloc[0]
    
    return year_with_highest_count

# Call functions
average_rating = average_top_rated_rating(df)
common_genre = most_common_genre(df)
highest_rated_director = director_with_highest_average_rating(df)
year_with_highest_count = year_with_highest_number_of_top_rated_movies(df)

print("Average Rating for Top-Rated Movies:", average_rating)
print("Most Common Genre among Top-Rated Movies:", common_genre)
print("Director with Highest Average IMDb Rating:", highest_rated_director)
print("Year with Highest Number of Top-Rated Movies:", year_with_highest_count)


Average Rating for Top-Rated Movies: 8.18
Most Common Genre among Top-Rated Movies: 
Animation, Adventure, Comedy            
Director with Highest Average IMDb Rating: Andrew Stanton
Year with Highest Number of Top-Rated Movies: 2004.0
