In [None]:
%pip install pandas tqdm matplotlib networkx plotly

In [177]:
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import plotly.express as px

In [190]:
## Helper Functions
def find_movie_name(input):
    mid = int(input)
    df = pd.read_csv('dataset/meta.csv')
    movie_name = df.loc[df['mid'] == mid, 'title'].values
    return movie_name[0] if len(movie_name) > 0 else None

def clean_dataframe(df):
    # Print shape before cleaning
    print("Shape before cleaning:", df.shape)
    
    # Remove duplicate rows
    df = df.drop_duplicates()
    # Remove rows with NaN values
    df = df.dropna()
    
    # Print shape after cleaning
    print("Shape after cleaning:", df.shape)
    return df

# Exploratory Analysis of Movie Ratings Dataset

## Dataset Description
- The dataset contains movie ratings provided by users.
- Each record consists of a user ID, movie ID, and rating.

In [179]:
df_rating = pd.read_csv('dataset/rating.csv').head(10000)
df_rating = clean_dataframe(df_rating)
df_meta = pd.read_csv('dataset/meta.csv')

Shape before cleaning: (10000, 3)
Shape after cleaning: (10000, 3)


## Data Preparation
- Load the dataset into a DataFrame.
- Convert the DataFrame into a NetworkX graph, where nodes represent users and movies, and edges represent ratings.

In [180]:
# Create a directed bipartite graph
B = nx.DiGraph()

# Add nodes with the node attribute "bipartite"
users = df_rating['uid'].unique()
movies = df_rating['mid'].unique()
B.add_nodes_from(users, bipartite=0)
B.add_nodes_from(movies, bipartite=1)

# Add edges with ratings as weights
for index, row in df_rating.iterrows():
    B.add_edge(row['uid'], row['mid'], weight=row['rating'])

## Graph Analysis

### User Activity
- Find the user who made the most number of ratings.
- Calculate the distribution of ratings provided by users.

In [181]:
most_active_user = df_rating['uid'].value_counts().idxmax()
print("Most active user:", most_active_user)

Most active user: 229


In [194]:
rating_distribution = df_rating['rating'].value_counts().sort_index()

fig = px.bar(x=rating_distribution.index, y=rating_distribution.values, 
             labels={'x': 'Rating', 'y': 'Frequency'}, 
             title='Rating Distribution')

# Show the plot
fig.show()

🌟🎬 Elite Critics: Unveiling the User with Ridiculous Standards 🎬🌟

In [191]:
average_rating_per_user = df_rating.groupby('uid')['rating'].mean()

# Sort users based on average rating in ascending order
sorted_users = average_rating_per_user.sort_values()

print("Ranking of users with lowest average rating:")
print(sorted_users)

Ranking of users with lowest average rating:
uid
470    0.500000
531    0.500000
172    1.000000
307    1.000000
374    1.166667
         ...   
260    5.000000
339    5.000000
464    5.000000
286    5.000000
315    5.000000
Name: rating, Length: 487, dtype: float64


### Movie Popularity
- Identify the movie with the most views (highest number of ratings).
- Determine the average rating for each movie.
- Find the highest-rated and lowest-rated movies.


In [183]:
most_popular_movie = df_rating['mid'].value_counts().idxmax()
print("Most popular movie:", find_movie_name(most_popular_movie))

Most popular movie: Terminator 3: Rise of the Machines


In [184]:
average_rating_per_movie = df_rating.groupby('mid')['rating'].mean().reset_index()

# Calculate overall average movie rating
overall_average_rating = average_rating_per_movie['rating'].mean()

# Print the average movie rating
print("Average movie rating:")
print("-" * 30)
print("Overall Average Rating: {:.2f}".format(overall_average_rating))
print("Individual Movie Ratings:")
print(average_rating_per_movie)

Average movie rating:
------------------------------
Overall Average Rating: 3.26
Individual Movie Ratings:
        mid    rating
0         5  3.083333
1        11  3.636364
2        12  2.250000
3        13  3.000000
4        14  2.833333
..      ...       ...
768   59387  3.000000
769   77866  2.750000
770   89492  3.666667
771  115210  4.000000
772  116977  5.000000

[773 rows x 2 columns]


🌟🎬 Movie Marvels and Meh Moments 🎬🌟

In [185]:
highest_rating_idx = df_rating['rating'].idxmax()
lowest_rating_idx = df_rating['rating'].idxmin()

# Extract the corresponding movie IDs
highest_rating_movie_id = df_rating.loc[highest_rating_idx, 'mid']
lowest_rating_movie_id = df_rating.loc[lowest_rating_idx, 'mid']

# Extract the highest and lowest rating values
highest_rating = df_rating.loc[highest_rating_idx, 'rating']
lowest_rating = df_rating.loc[lowest_rating_idx, 'rating']

print("Highest rated movie: {}, Rating: {:.2f}".format(find_movie_name(highest_rating_movie_id), highest_rating))
print("Lowest rated movie: {}, Rating: {:.2f}".format(find_movie_name(lowest_rating_movie_id), lowest_rating))

Highest rated movie: Sleepless in Seattle, Rating: 5.00
Lowest rated movie: American Graffiti, Rating: 0.50


### User Similarity
- Calculate similarity between users based on their ratings.
- Identify communities of users with similar preferences.

🔍 Determining besties 🔍

To uncover who vibes with you the most when it comes to the movies yall watch

In [186]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Create a user-movie matrix
user_movie_matrix = df_rating.pivot(index='uid', columns='mid', values='rating').fillna(0)

# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix)

# Set diagonal values to zero
np.fill_diagonal(user_similarity, 0)

# Find most similar users (excluding itself)
most_similar_users = np.argmax(user_similarity, axis=1)

print("Most similar users for each user (excluding itself):")
for user, similar_user in enumerate(most_similar_users):
    print(f"User {user+1}: User {similar_user+1}")

Most similar users for each user (excluding itself):
User 1: User 194
User 2: User 165
User 3: User 232
User 4: User 339
User 5: User 190
User 6: User 184
User 7: User 14
User 8: User 96
User 9: User 245
User 10: User 208
User 11: User 467
User 12: User 62
User 13: User 22
User 14: User 103
User 15: User 65
User 16: User 434
User 17: User 281
User 18: User 145
User 19: User 39
User 20: User 129
User 21: User 323
User 22: User 13
User 23: User 186
User 24: User 97
User 25: User 403
User 26: User 60
User 27: User 194
User 28: User 357
User 29: User 290
User 30: User 457
User 31: User 128
User 32: User 97
User 33: User 362
User 34: User 91
User 35: User 67
User 36: User 71
User 37: User 356
User 38: User 295
User 39: User 19
User 40: User 199
User 41: User 360
User 42: User 10
User 43: User 215
User 44: User 354
User 45: User 175
User 46: User 362
User 47: User 459
User 48: User 119
User 49: User 458
User 50: User 342
User 51: User 276
User 52: User 69
User 53: User 252
User 54: User 135


🎬 **Discovering Fam Vibes** 🎬

Get ready to uncover communities where users share similar movie tastes and vibe together like one big fam! 🍿💫

In [187]:
# Detect communities using Louvain method
communities = nx.algorithms.community.modularity_max.greedy_modularity_communities(B)

# Display communities
print("Communities of users with similar preferences:")
for i, community in enumerate(communities):
    print(f"Community {i+1}: {list(community)}")

Communities of users with similar preferences:
Community 1: [1, 3, 8195, 8197, 7, 9, 12, 15, 16, 5137, 3089, 2067, 2069, 3093, 3097, 27, 28, 34, 4133, 37, 3110, 42, 4141, 47, 50, 51, 4148, 55, 57, 61, 63, 64, 2112, 1089, 1090, 69, 2118, 73, 75, 76, 79, 82, 86, 89, 96, 97, 1124, 106, 109, 110, 111, 2160, 117, 120, 121, 123, 124, 2176, 129, 133, 7303, 136, 4233, 138, 6283, 143, 8337, 146, 147, 148, 152, 4248, 154, 159, 164, 166, 174, 178, 184, 1213, 190, 6341, 200, 2252, 205, 206, 209, 210, 211, 212, 5333, 213, 214, 216, 215, 217, 219, 220, 2268, 1246, 223, 1247, 1248, 226, 1250, 1251, 230, 4327, 232, 233, 234, 1259, 235, 1262, 2289, 242, 243, 2291, 1267, 116977, 247, 1272, 249, 1273, 2300, 4349, 1278, 254, 255, 1281, 259, 263, 8456, 266, 267, 268, 273, 2321, 2324, 7445, 7450, 7451, 284, 285, 286, 290, 294, 300, 301, 302, 313, 314, 315, 6466, 2370, 324, 326, 330, 331, 333, 334, 1359, 335, 337, 51540, 340, 1366, 344, 6488, 346, 348, 358, 360, 361, 363, 367, 371, 374, 54648, 376, 49530, 38

## Recommendation System
### Algorithm Steps

1. **Calculate User Similarity**:
   - User similarity is calculated based on the cosine similarity between user vectors representing their movie ratings.
   - The `cosine_similarity` function from scikit-learn is used for this purpose.

2. **Find Similar Users**:
   - For a target user, similar users are identified based on the user similarity matrix.
   - The `find_similar_users` function retrieves the top similar users to the target user.

3. **Recommend Movies**:
   - Movies are recommended to the target user based on the ratings of similar users.
   - The `recommend_movies` function finds movies highly rated by similar users that the target user hasn't rated yet.
   - It calculates the average rating of each recommended movie among similar users and selects the top-rated ones as recommendations.


In [188]:
def find_similar_users(target_user, user_similarity_matrix, num_similar_users=5):
    """
    Find similar users to the target user based on user similarity matrix.
    Returns a list of similar user IDs.
    """
    similar_users = user_similarity_matrix[target_user].argsort()[::-1][1:num_similar_users+1]
    return similar_users

def recommend_movies(target_user, similar_users, df, top_n=5):
    """
    Recommend movies to the target user based on ratings of similar users.
    Returns a list of movie IDs recommended to the target user.
    """
    rated_movies = set(df[df['uid'] == target_user]['mid'])
    recommended_movies = []
    
    for user in similar_users:
        user_rated_movies = set(df[df['uid'] == user]['mid'])
        recommended_movies.extend(user_rated_movies - rated_movies)
    
    recommended_movies = list(recommended_movies)
    movie_ratings = [(movie, df[(df['mid'] == movie) & (df['uid'].isin(similar_users))]['rating'].mean()) for movie in recommended_movies]
    recommended_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:top_n]
    recommended_movie_names = [find_movie_name(movie[0]) for movie in recommended_movies]
    
    return recommended_movie_names


🎥 Tailored Flick Picks 🎥

Welcome to a recommendation system crafted just for you! Get ready for personalized movie suggestions based on your ratings. 🌟🍿

In [189]:
# In this example, since we only have one user, similarity will be based on the user's own ratings
user_similarity_matrix = cosine_similarity(df_rating.pivot(index='uid', columns='mid', values='rating').fillna(0))

# Example: Recommend movies for target user '1'
target_user = 1
similar_users = find_similar_users(target_user, user_similarity_matrix)
recommended_movies = recommend_movies(target_user, similar_users, df_rating)
print("Recommended movies for user 1:", recommended_movies)

Recommended movies for user 1: ['Interview with the Vampire', 'Evil Dead II', '2001: A Space Odyssey', 'TMNT', 'Cool Hand Luke']
