In [1]:
import zipfile
import os

# Where the downloaded archive lives and where its contents are unpacked.
zip_file_path = 'movie_data.zip'
extract_folder = 'movie_data_extracted'

# Create the target directory if needed, then unpack every archive member into it.
os.makedirs(extract_folder, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# Only files sitting directly in the extraction directory are inspected
# (os.listdir does not descend into sub-directories).
extracted_files = os.listdir(extract_folder)
csv_files = list(filter(lambda name: name.endswith('.csv'), extracted_files))
num_csv_files = len(csv_files)

print(f'There are {num_csv_files} CSV files in the dataset.')



There are 4 CSV files in the dataset.


In [2]:
import pandas as pd

# Read the movie catalogue from the extracted dataset.
movies_csv_path = os.path.join(extract_folder, 'movies.csv')
movies_df = pd.read_csv(filepath_or_buffer=movies_csv_path)

# (rows, columns) of the catalogue; displayed as the cell result.
movies_shape = movies_df.shape
movies_shape


(9742, 3)

In [3]:
# Read the user ratings table from the extracted dataset.
ratings_csv_path = os.path.join(extract_folder, 'ratings.csv')
ratings_df = pd.read_csv(filepath_or_buffer=ratings_csv_path)

# (rows, columns) of the ratings table; displayed as the cell result.
ratings_shape = ratings_df.shape
ratings_shape


(100836, 4)

In [4]:
# Build the path from the extraction folder defined in the first cell rather
# than a machine-specific absolute path, so the notebook runs on any machine.
ratings_csv_path = os.path.join(extract_folder, 'ratings.csv')

ratings_df = pd.read_csv(ratings_csv_path)

# Number of distinct values in the userId column.
unique_user_ids = ratings_df['userId'].nunique()
print(unique_user_ids)


610


In [5]:
# Build the path from the extraction folder defined in the first cell rather
# than a machine-specific absolute path, so the notebook runs on any machine.
movies_csv_path = os.path.join(extract_folder, 'movies.csv')
movies_df = pd.read_csv(movies_csv_path)

# One row per movieId with the number of ratings it received, most-rated first.
# Naming the index and the value column explicitly avoids depending on the
# column layout of value_counts().reset_index(), which differs between pandas
# major versions.
ratings_count = (
    ratings_df['movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='rating_count')
)

# Attach titles/genres; movies without any rating are dropped by the inner join.
movies_ratings = pd.merge(ratings_count, movies_df, on='movieId')

# Row holding the largest rating_count (first occurrence on ties).
max_rated_movie = movies_ratings.loc[movies_ratings['rating_count'].idxmax()]

print(f"The movie with the maximum number of ratings is '{max_rated_movie['title']}' with {max_rated_movie['rating_count']} ratings.")


The movie with the maximum number of ratings is 'Forrest Gump (1994)' with 329 ratings.


In [6]:
# Build the path from the extraction folder defined in the first cell rather
# than a machine-specific absolute path, so the notebook runs on any machine.
tags_csv_path = os.path.join(extract_folder, 'tags.csv')

tags_df = pd.read_csv(tags_csv_path)

# Exact-title lookup; raises IndexError if the title is absent from movies_df.
matrix_movie_id = movies_df[movies_df['title'] == 'Matrix, The (1999)']['movieId'].values[0]

# Distinct tags attached to that movie, in order of first appearance.
matrix_tags = tags_df[tags_df['movieId'] == matrix_movie_id]['tag'].unique()

print("The tags submitted by users for 'Matrix, The (1999)' are:")
for tag in matrix_tags:
    print(tag)


The tags submitted by users for 'Matrix, The (1999)' are:
martial arts
sci-fi
alternate universe
philosophy
post apocalyptic


In [7]:
# Resolve the movieId by exact title match (IndexError if the title is absent).
terminator_title_mask = movies_df['title'] == 'Terminator 2: Judgment Day (1991)'
movie_id = movies_df.loc[terminator_title_mask, 'movieId'].values[0]

# Mean of every rating submitted for that movie.
terminator_ratings = ratings_df.loc[ratings_df['movieId'] == movie_id, 'rating']
average_rating = terminator_ratings.mean()

average_rating

3.970982142857143

In [8]:

# Per-movie rating count and mean rating, with movieId restored as a column.
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .agg(count='count', mean_rating='mean')
    .reset_index()
)

# Attach the statistics to the catalogue; unrated movies drop out of the inner join.
merged_df = movies_df.merge(grouped_ratings, on='movieId', how='inner')

# Keep only movies with strictly more than 50 ratings.
filtered_movies_df = merged_df.loc[merged_df['count'].gt(50)]

filtered_movies_df.head()

Unnamed: 0,movieId,title,genres,count,mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,52,3.259615
5,6,Heat (1995),Action|Crime|Thriller,102,3.946078
6,7,Sabrina (1995),Comedy|Romance,54,3.185185


In [9]:
# Per-movie rating count and mean rating, with movieId restored as a column.
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .agg(count='count', mean_rating='mean')
    .reset_index()
)
merged_df = movies_df.merge(grouped_ratings, on='movieId', how='inner')

# Restrict to movies with strictly more than 50 ratings.
filtered_movies_df = merged_df.loc[merged_df['count'].gt(50)]

# Index label of the highest mean rating (first occurrence on ties), then its row.
best_mean_label = filtered_movies_df['mean_rating'].idxmax()
top_movie = filtered_movies_df.loc[best_mean_label]

print(f"The most popular movie based on average user ratings is: {top_movie['title']}")

The most popular movie based on average user ratings is: Shawshank Redemption, The (1994)


In [10]:

# Number of ratings per movie, as a two-column frame (movieId, count).
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .count()
    .rename('count')
    .reset_index()
)

merged_df = movies_df.merge(grouped_ratings, on='movieId', how='inner')

# Restrict to movies with strictly more than 50 ratings.
filtered_movies_df = merged_df.loc[merged_df['count'].gt(50)]

# Most-rated first; the first five titles form the top 5.
sorted_movies = filtered_movies_df.sort_values(by='count', ascending=False)
top_5_movies = sorted_movies['title'].head(5).tolist()

print("Top 5 movies based on number of user ratings:", top_5_movies)

# Candidate answers to check against the top 5, reported in the order listed here.
options = ["Pulp Fiction (1994)", "Bad Boys (1995)", "Silence of the Lambs, The (1991)", "Matrix, The (1999)"]
selected_options = list(filter(lambda candidate: candidate in top_5_movies, options))
print("Selected options in top 5:", selected_options)

Top 5 movies based on number of user ratings: ['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)', 'Matrix, The (1999)']
Selected options in top 5: ['Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)', 'Matrix, The (1999)']


In [11]:
# Number of ratings per movie, as a two-column frame (movieId, count).
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .count()
    .rename('count')
    .reset_index()
)

merged_df = movies_df.merge(grouped_ratings, on='movieId', how='inner')

# Restrict to movies with strictly more than 50 ratings.
filtered_movies_df = merged_df.loc[merged_df['count'].gt(50)]

# Movies whose genres string mentions Sci-Fi, most-rated first.
sci_fi_mask = filtered_movies_df['genres'].str.contains('Sci-Fi')
sci_fi_movies_df = filtered_movies_df[sci_fi_mask]
sorted_sci_fi_movies = sci_fi_movies_df.sort_values(by='count', ascending=False)

# A third place only exists when at least three Sci-Fi movies pass the filter.
if len(sorted_sci_fi_movies) < 3:
    print("There are not enough Sci-Fi movies with more than 50 ratings.")
else:
    third_most_popular_sci_fi_movie = sorted_sci_fi_movies['title'].iloc[2]
    print(f"The third most popular Sci-Fi movie based on the number of user ratings is: {third_most_popular_sci_fi_movie}")


The third most popular Sci-Fi movie based on the number of user ratings is: Jurassic Park (1993)


In [16]:
# Per-movie rating count and average rating, with movieId restored as a column.
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .agg(rating_count='count', average_rating='mean')
    .reset_index()
)

# Attach the statistics to the catalogue; unrated movies drop out of the inner join.
merged_df = movies_df.merge(grouped_ratings, on='movieId', how='inner')

# Movies with strictly more than 50 ratings; later cells build on this frame.
filtered_df = merged_df.loc[merged_df['rating_count'].gt(50)]

filtered_df.head(10)

Unnamed: 0,movieId,title,genres,rating_count,average_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,52,3.259615
5,6,Heat (1995),Action|Crime|Thriller,102,3.946078
6,7,Sabrina (1995),Comedy|Romance,54,3.185185
9,10,GoldenEye (1995),Action|Adventure|Thriller,132,3.496212
10,11,"American President, The (1995)",Comedy|Drama|Romance,70,3.671429
15,16,Casino (1995),Crime|Drama,82,3.926829
16,17,Sense and Sensibility (1995),Drama|Romance,67,3.776119
18,19,Ace Ventura: When Nature Calls (1995),Comedy,88,2.727273


In [17]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

def scrapper(imdbId, timeout=10):
    """Fetch the rating text shown on an IMDb title page.

    Parameters
    ----------
    imdbId : int or float
        Numeric IMDb identifier. It is cast to ``int`` and left-padded with
        zeros to at least seven digits to form the ``tt…`` id used in the URL.
    timeout : float, optional
        Seconds to wait for the server before abandoning the request. Without
        a timeout a single stalled connection would block the scrape forever.

    Returns
    -------
    str or float
        Text of the rating element when it is found on the page, otherwise
        ``np.nan``. ``np.nan`` is also returned when the HTTP request fails,
        so one bad title does not abort a long scraping loop.
    """
    title_id = str(int(imdbId)).zfill(7)
    URL = f"https://www.imdb.com/title/tt{title_id}/"
    print(f"Accessing URL: {URL}")  # Debug print

    request_header = {'Content-Type': 'text/html; charset=UTF-8',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
                      'Accept-Encoding': 'gzip, deflate, br'}

    try:
        response = requests.get(URL, headers=request_header, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported and mapped to "no rating".
        print(f"Request failed: {error}")
        return np.nan
    print(f"Response status code: {response.status_code}")  # Debug print

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): this class name looks auto-generated by the site's build and
    # may change without notice; when it does, every lookup falls through to
    # "Rating not found" — confirm the selector before a long run.
    imdb_rating = soup.find('span', attrs={'class': 'sc-eb51e184-1 ljxVSS'})

    if imdb_rating:
        print(f"Found rating: {imdb_rating.text}")
        return imdb_rating.text

    print("Rating not found")
    return np.nan

# Build the path with os.path.join: the previous literal embedded a backslash
# in a normal (non-raw) string, which is an invalid escape sequence and only
# resolves as a path separator on Windows.
links_df = pd.read_csv(os.path.join(extract_folder, 'links.csv'))

# Add the imdbId of every movie that passed the ">50 ratings" filter.
merged_df = pd.merge(filtered_df, links_df, on='movieId')

# Title -> scraped rating. A repeated title overwrites the earlier entry.
movie_imdb_ratings = {}
for title, imdb_id in zip(merged_df['title'], merged_df['imdbId']):
    movie_imdb_ratings[title] = scrapper(imdb_id)

# Stored under its own name so the MovieLens `ratings_df` used by the earlier
# cells is not overwritten by the scraped table.
imdb_ratings_df = pd.DataFrame(list(movie_imdb_ratings.items()), columns=['Title', 'IMDb Rating'])

imdb_ratings_df.to_csv('movie_imdb_ratings.csv', index=False)

print("IMDb ratings have been saved to 'movie_imdb_ratings.csv'.")


Accessing URL: https://www.imdb.com/title/tt0114709/
Response status code: 200
Found rating: 8.3
Accessing URL: https://www.imdb.com/title/tt0113497/
Response status code: 200
Found rating: 7.1
Accessing URL: https://www.imdb.com/title/tt0113228/
Response status code: 200
Found rating: 6.7
Accessing URL: https://www.imdb.com/title/tt0113277/
Response status code: 200
Found rating: 8.3
Accessing URL: https://www.imdb.com/title/tt0114319/
Response status code: 200
Found rating: 6.3
Accessing URL: https://www.imdb.com/title/tt0113189/
Response status code: 200
Found rating: 7.2
Accessing URL: https://www.imdb.com/title/tt0112346/
Response status code: 200
Found rating: 6.8
Accessing URL: https://www.imdb.com/title/tt0112641/
Response status code: 200
Found rating: 8.2
Accessing URL: https://www.imdb.com/title/tt0114388/
Response status code: 200
Found rating: 7.7
Accessing URL: https://www.imdb.com/title/tt0112281/
Response status code: 200
Found rating: 6.4
Accessing URL: https://www.imd

In [None]:
import pandas as pd

# Scraped IMDb ratings, loaded under their own name so the MovieLens
# `ratings_df` used by the earlier cells is not overwritten.
imdb_ratings_df = pd.read_csv('movie_imdb_ratings.csv')

merged_df = pd.merge(filtered_df, links_df, on='movieId')

# Join the scraped ratings onto the filtered movies by title.
merged_ratings_df = pd.merge(merged_df, imdb_ratings_df, left_on='title', right_on='Title')

# Row with the largest IMDb rating across ALL filtered movies (no genre filter).
highest_rated_movie = merged_ratings_df.loc[merged_ratings_df['IMDb Rating'].astype(float).idxmax()]

highest_rated_movieId = highest_rated_movie['movieId']
highest_rated_movie_name = highest_rated_movie['title']

# The message previously said "The Sci-Fi movie…", but this cell does not
# restrict the search to Sci-Fi titles.
print(f"The movie with the highest IMDb rating is '{highest_rated_movie_name}' and movieId: {highest_rated_movieId}")


In [None]:
import pandas as pd

# Rows whose genres string mentions Sci-Fi.
is_sci_fi = merged_ratings_df['genres'].str.contains('Sci-Fi')
sci_fi_movies = merged_ratings_df[is_sci_fi]

# Index label of the largest IMDb rating among those rows, then the row itself.
sci_fi_scores = sci_fi_movies['IMDb Rating'].astype(float)
highest_rated_sci_fi_movie = sci_fi_movies.loc[sci_fi_scores.idxmax()]

highest_rated_sci_fi_movie_name = highest_rated_sci_fi_movie['title']
highest_rated_sci_fi_movieId = highest_rated_sci_fi_movie['movieId']

print(f"The Sci-Fi movie with the highest IMDb rating is '{highest_rated_sci_fi_movie_name}' with movieId: {highest_rated_sci_fi_movieId}")