In [28]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime

movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')


In [2]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [29]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [6]:
merged_df = pd.merge(movies_df, ratings_df, on='movieId', how='inner')

In [7]:
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,4.0,1113765937
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,5.0,948885850
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5,1442169375
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,4.0,1370810063
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,4.0,1237622631


In [8]:
def most_rated_movies(data, n):
    movie_rating_count = data.groupby('movieId')['rating'].count().reset_index()
    movie_rating_count.columns = ['movieId', 'count']
    top_n_most_rated_movies = movie_rating_count.nlargest(n, 'count')
    return pd.merge(top_n_most_rated_movies, data[['movieId', 'title']], on='movieId').drop_duplicates()

# Example: Top 10 most-rated movies
print(most_rated_movies(merged_df, 10))


        movieId  count                                      title
0           318  97999           Shawshank Redemption, The (1994)
97999       356  97040                        Forrest Gump (1994)
195039      296  92406                        Pulp Fiction (1994)
287445      593  87899           Silence of the Lambs, The (1991)
375344     2571  84545                         Matrix, The (1999)
459889      260  81815  Star Wars: Episode IV - A New Hope (1977)
541704      480  76451                       Jurassic Park (1993)
618155      527  71516                    Schindler's List (1993)
689671      110  68803                          Braveheart (1995)
758474        1  68469                           Toy Story (1995)


In [9]:
def highest_rated_movies(data, n, min_ratings):
    movie_avg_rating = data.groupby('movieId')['rating'].agg(['count', 'mean']).reset_index()
    movie_avg_rating_filtered = movie_avg_rating[movie_avg_rating['count'] >= min_ratings]
    top_n_highest_rated_movies = movie_avg_rating_filtered.nlargest(n, 'mean')
    return pd.merge(top_n_highest_rated_movies, data[['movieId', 'title']], on='movieId').drop_duplicates()

# Example: Top 10 highest-rated movies with a minimum of 100 ratings
print(highest_rated_movies(merged_df, 10, 100))


        movieId  count      mean                                    title
0        171011    853  4.486518                   Planet Earth II (2016)
853      159817   1384  4.458092                      Planet Earth (2006)
2237        318  97999  4.424188         Shawshank Redemption, The (1994)
100236   170705    984  4.399898                  Band of Brothers (2001)
101220   174053   1074  4.350559     Black Mirror: White Christmas (2014)
102294   171495    157  4.343949                                   Cosmos
102451   172591    421  4.339667  The Godfather Trilogy: 1972-1990 (1992)
102872      858  60904  4.332893                    Godfather, The (1972)
163776       50  62180  4.291959               Usual Suspects, The (1995)
225956   176601    180  4.263889                             Black Mirror


In [10]:
def popular_genres(data, n):
    genre_ratings = data.explode('genres').groupby('genres')['rating'].mean().reset_index()
    return genre_ratings.nlargest(n, 'rating')

# Example: Top 5 popular genres
print(popular_genres(merged_df, 5))


                                            genres  rating
481   Action|Drama|Mystery|Romance|Sci-Fi|Thriller   5.000
669    Adventure|Children|Comedy|Documentary|Drama   5.000
822            Adventure|Drama|Romance|War|Western   5.000
1234               Comedy|Fantasy|Romance|Thriller   4.625
300                 Action|Children|Drama|Thriller   4.500


In [11]:
def extract_year(title):
    try:
        return int(title.strip()[-5:-1])
    except:
        return None

def year_wise_analysis(data):
    data['year'] = data['title'].apply(extract_year)
    year_movie_count = data.groupby('year')['movieId'].nunique().reset_index()
    year_movie_count.columns = ['year', 'count']
    year_avg_rating = data.groupby('year')['rating'].mean().reset_index()
    year_avg_rating.columns = ['year', 'mean_rating']
    return pd.merge(year_movie_count, year_avg_rating, on='year')

# Year-wise analysis
print(year_wise_analysis(merged_df))


       year  count  mean_rating
0       6.0      1     3.000000
1     201.0      2     2.166667
2    1874.0      1     2.550000
3    1878.0      1     2.928571
4    1883.0      1     2.300000
..      ...    ...          ...
131  2014.0   2205     3.619954
132  2015.0   2233     3.561680
133  2016.0   2143     3.532930
134  2017.0   1819     3.516753
135  2018.0    826     3.386943

[136 rows x 3 columns]


In [30]:
def tag_analysis(tags, n):
    tag_counts = tags['tag'].value_counts().reset_index()
    tag_counts.columns = ['tag', 'count']
    return tag_counts.nlargest(n, 'count')

# Example: Top 10 most common tags
print(tag_analysis(tags_df, 10))


                  tag  count
0              sci-fi   9400
1         atmospheric   6430
2              action   6219
3              comedy   5923
4             surreal   5299
5     based on a book   5294
6               funny   4864
7        twist ending   4844
8  visually appealing   4333
9            dystopia   4268


In [31]:
def user_activity(data, n):
    user_rating_count = data.groupby('userId')['rating'].agg(['count', 'mean']).reset_index()
    user_rating_count.columns = ['userId', 'num_ratings', 'avg_rating']
    top_n_active_users = user_rating_count.nlargest(n, 'num_ratings')
    return top_n_active_users

# Example: Top 10 most active users
print(user_activity(merged_df, 10))

        userId  num_ratings  avg_rating
123099  123100        23715    3.130635
117489  117490         9279    3.278424
134595  134596         8381    3.198306
212342  212343         7884    2.588026
242682  242683         7515    3.208317
111907  111908         6645    1.524981
77608    77609         6398    2.812207
63782    63783         6346    3.485424
172356  172357         5868    2.442059
141954  141955         5810    2.874785


In [32]:
def favorite_genres(data, user_id, n):
    user_data = data[data['userId'] == user_id]
    genre_ratings = user_data.explode('genres').groupby('genres')['rating'].mean().reset_index()
    return genre_ratings.nlargest(n, 'rating')

# Example: Top 5 favorite genres for user ID 1
print(favorite_genres(merged_df, 1, 5))

                  genres  rating
6  Comedy|Fantasy|Sci-Fi     4.5
7         Comedy|Romance     4.5
2           Action|Drama     4.0
8         Comedy|Western     4.0
9                  Drama     4.0


In [33]:
top_active_users = user_activity(merged_df, 10)
for user_id in top_active_users['userId']:
    print(f"User ID {user_id} - Top 5 Favorite Genres:")
    print(favorite_genres(merged_df, user_id, 5))
    print()

User ID 123100 - Top 5 Favorite Genres:
                                                genres  rating
84   Action|Adventure|Drama|Fantasy|Romance|Sci-Fi|...     5.0
430  Adventure|Animation|Comedy|Fantasy|Romance|Sci-Fi     5.0
714                         Children|Crime|Documentary     5.0
822                       Comedy|Drama|Romance|Western     5.0
942                                      Crime|Romance     5.0

User ID 117490 - Top 5 Favorite Genres:
                                          genres  rating
52                   Action|Adventure|Crime|IMAX     5.0
395     Animation|Children|Drama|Fantasy|Musical     5.0
450               Children|Drama|Fantasy|Romance     5.0
59                Action|Adventure|Drama|Fantasy     4.5
62   Action|Adventure|Drama|Fantasy|Mystery|IMAX     4.5

User ID 134596 - Top 5 Favorite Genres:
                                                genres  rating
356                           Adventure|Comedy|Musical     4.5
380             Adventure|Drama|