**Importing Libraries**

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt

**Loading Data**

In [28]:
# Load the links table (one row per movieId with its imdbId / tmdbId)
# from the project's GitHub repository, then display it.
LINKS_CSV_URL = 'https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/links.csv'
links = pd.read_csv(LINKS_CSV_URL)
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [29]:
# Load the movies table (movieId, title, pipe-separated genres)
# from the project's GitHub repository, then display it.
MOVIES_CSV_URL = 'https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/movies.csv'
movies = pd.read_csv(MOVIES_CSV_URL)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


**Merging movies.csv and links.csv on movieId**

In [30]:
# Inner-join the movie metadata with its external ids; movieId is the shared key.
merged_df = movies.merge(links, how='inner', on='movieId')
merged_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0
9739,193585,Flint (2017),Drama,6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0


In [31]:
# Load the user-supplied tags and drop the timestamp column right after loading,
# leaving userId, movieId and tag. Only the first rows are previewed.
TAGS_CSV_URL = 'https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/tags.csv'
tags = pd.read_csv(TAGS_CSV_URL).drop(columns='timestamp')
tags.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA


In [32]:
# Load the user ratings and drop the timestamp column right after loading,
# leaving userId, movieId and rating.
RATINGS_CSV_URL = 'https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/ratings.csv'
ratings = pd.read_csv(RATINGS_CSV_URL).drop(columns='timestamp')
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


**Merging ratings and movies on movieId**

In [33]:
# Attach title and genres to every rating row (inner join on movieId).
ratings_with_name = pd.merge(ratings, movies, how='inner', on='movieId')

In [34]:
# Display the ratings joined with each movie's title and genres.
ratings_with_name

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller


**Counting the number of ratings each movie title received**

In [35]:
# Count the non-null ratings per title and expose the count as 'num_ratings'.
# The result has one row per title, with columns: title, num_ratings.
num_rating_df = (
    ratings_with_name
    .groupby('title')['rating']
    .count()
    .reset_index(name='num_ratings')
)
num_rating_df

Unnamed: 0,title,num_ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


**Calculating the average rating of each movie with at least 100 ratings, sorted by average rating (highest first)**

In [40]:

# Rank well-rated titles: keep titles with enough ratings, average their
# ratings, attach the external ids from `links`, and sort best-first.

# Minimum number of ratings a title must have to be included in the ranking.
MIN_RATINGS = 100

# First, keep only the rating rows of titles with at least MIN_RATINGS ratings.
# A vectorised group-size mask selects the same rows (same order, same index)
# as groupby('title').filter(lambda x: len(x) >= 100) without invoking a
# Python callback once per title.
ratings_per_title = ratings_with_name.groupby('title')['rating'].transform('size')
filtered_ratings = ratings_with_name[ratings_per_title >= MIN_RATINGS]

# Then, attach each title's rating count (num_ratings) from num_rating_df.
# NOTE: this rebinds merged_df, which an earlier cell used for movies + links.
merged_df = pd.merge(filtered_ratings, num_rating_df, on='title', how='left')

# Now, calculate the average rating per title, keeping the first movieId seen
# for that title.
# NOTE(review): grouping is by title, so distinct movieIds that share a title
# would be pooled and only one movieId kept - confirm titles are unique, or
# group by movieId instead.
average_ratings = (
    merged_df
    .groupby('title')
    .agg({'rating': 'mean', 'movieId': 'first'})
    .reset_index()
)

# Finally, merge with the 'links' dataframe to get the corresponding imdbId / tmdbId.
merged_with_links = pd.merge(average_ratings, links, on='movieId', how='left')

# Sort the movies by average rating, highest first.
sorted_ratings = merged_with_links.sort_values(by='rating', ascending=False)

sorted_ratings


Unnamed: 0,title,rating,movieId,imdbId,tmdbId
106,"Shawshank Redemption, The (1994)",4.429022,318,111161,278.0
57,"Godfather, The (1972)",4.289062,858,68646,238.0
47,Fight Club (1999),4.272936,2959,137523,550.0
58,"Godfather: Part II, The (1974)",4.259690,1221,71562,240.0
37,"Departed, The (2006)",4.252336,48516,407887,1422.0
...,...,...,...,...,...
92,"Net, The (1995)",3.040179,185,113957,1642.0
30,Cliffhanger (1993),3.034653,434,106582,9350.0
67,Home Alone (1990),2.995690,586,99785,771.0
17,Batman Forever (1995),2.916058,153,112462,414.0
