pip install ipywidgets
jupyter nbextension enable --py widgetsnbextension --sys-prefix


In [91]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

Create a popularity based recommender system at a genre level


In [93]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Merge datasets
data = pd.merge(ratings, movies, on='movieId')

# Function to create popularity-based recommender
def popularity_based_recommender(genre, min_reviews, num_recommendations):
    genre_movies = data[data['genres'].str.contains(genre, case=False, na=False)]
    genre_movies = genre_movies.groupby('title').agg(
        average_rating=('rating', 'mean'),
        num_reviews=('rating', 'count')
    ).reset_index()
    genre_movies = genre_movies[genre_movies['num_reviews'] >= min_reviews]
    genre_movies = genre_movies.sort_values(by='average_rating', ascending=False)
    top_movies = genre_movies.head(num_recommendations)
    return top_movies

# Prepare data for content-based recommender
movies['genres'] = movies['genres'].str.split('|')
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies['movieId'])
cosine_sim = cosine_similarity(genre_df)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])

# Function to create content-based recommender
def content_based_recommender(movie_title, num_recommendations):
    sim_scores = cosine_sim_df[movie_title]
    similar_movies = sim_scores.sort_values(ascending=False).head(num_recommendations + 1)
    similar_movies = similar_movies.drop(movie_title)
    return similar_movies

# Example usage
genre = 'Comedy'
min_reviews = 100
num_recommendations = 5
print(popularity_based_recommender(genre, min_reviews, num_recommendations))

movie_title = 'Toy Story (1995)'
num_recommendations = 5
print(content_based_recommender(movie_title, num_recommendations))


                                       title  average_rating  num_reviews
2093  Monty Python and the Holy Grail (1975)        4.301948          154
995                             Fargo (1996)        4.271144          201
2498              Princess Bride, The (1987)        4.163743          171
2523                     Pulp Fiction (1994)        4.160000          325
1069                     Forrest Gump (1994)        4.138264          311
title
Monsters, Inc. (2001)                                      1.0
Turbo (2013)                                               1.0
Antz (1998)                                                1.0
Asterix and the Vikings (Astérix et les Vikings) (2006)    1.0
Toy Story 2 (1999)                                         1.0
Name: Toy Story (1995), dtype: float64


Create a content-based recommender system which recommends top N movies based on similar movie(m) genres.

In [95]:

# Load the dataset
movies = pd.read_csv('movies.csv')

# Prepare the data
movies['genres'] = movies['genres'].str.split('|')

# One-hot encode the genres
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies['title'])

# Compute cosine similarity
cosine_sim = cosine_similarity(genre_df)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])

# Content-based recommender function
def content_based_recommender(movie_title, num_recommendations):
    # Check if the movie exists in the dataset
    if movie_title not in cosine_sim_df.columns:
        return pd.DataFrame(columns=['S.No', 'MovieTitle'])

    # Get the similarity scores for the given movie
    sim_scores = cosine_sim_df[movie_title]

    # Sort the scores in descending order and get the top N
    similar_movies = sim_scores.sort_values(ascending=False).head(num_recommendations + 1)  # +1 to exclude the movie itself

    # Drop the input movie itself from the list
    similar_movies = similar_movies.drop(movie_title)

    # Prepare the output
    output = similar_movies.reset_index().rename(columns={movie_title: 'Similarity'})
    output.index.name = 'S.No'
    output.reset_index(inplace=True)
    output = output[['S.No', 'title']].rename(columns={'title': 'MovieTitle'})

    return output

# Example usage
movie_title = 'Toy Story (1995)'
num_recommendations = 5
recommendations = content_based_recommender(movie_title, num_recommendations)

# Display the recommendations
print(recommendations)


   S.No                                         MovieTitle
0     0                              Monsters, Inc. (2001)
1     1                                       Turbo (2013)
2     2                                        Antz (1998)
3     3  Asterix and the Vikings (Astérix et les Viking...
4     4                                 Toy Story 2 (1999)


Create a collaborative based recommender system which recommends top N movies based on “K” similar users for a target user “u”

In [112]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Step 2: Create a user-movie rating matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Step 3: Calculate cosine similarity between users
user_sim_matrix = cosine_similarity(user_movie_matrix)
user_sim_df = pd.DataFrame(user_sim_matrix, index=user_movie_matrix.index, columns=user_movie_matrix.index)

# Step 4: Define collaborative-based recommender function
def collaborative_based_recommender(user_id, num_recommendations, k_similar_users):
    # Check if the user exists in the dataset
    if user_id not in user_sim_df.index:
        return pd.DataFrame(columns=['S.No', 'MovieTitle'])
    
    # Get the similarity scores for the target user
    sim_scores = user_sim_df[user_id]

    # Sort the scores in descending order and get the top K similar users
    similar_users = sim_scores.sort_values(ascending=False).head(k_similar_users + 1).index  # +1 to include the user itself

    # Drop the target user from the list of similar users
    similar_users = similar_users.drop(user_id)

    # Aggregate ratings from similar users
    similar_users_ratings = user_movie_matrix.loc[similar_users].mean(axis=0)

    # Remove movies already rated by the target user
    target_user_ratings = user_movie_matrix.loc[user_id]
    recommendations = similar_users_ratings[target_user_ratings == 0]

    # Sort by highest predicted rating
    recommendations = recommendations.sort_values(ascending=False).head(num_recommendations)

    # Get movie titles
    recommended_movie_ids = recommendations.index
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]

    # Prepare the output
    output = recommended_movies.reset_index().rename(columns={'title': 'MovieTitle'})
    output.index.name = 'S.No'
    output.reset_index(inplace=True)

    return output

# Example usage
user_id = 1
num_recommendations = 5
k_similar_users = 100

recommendations = collaborative_based_recommender(user_id, num_recommendations, k_similar_users)

# Display the recommendations
print(recommendations)


   S.No  index                                 MovieTitle
0     0      0                           Toy Story (1995)
1     1    843                            Die Hard (1988)
2     2    962                              Aliens (1986)
3     3   1050  Indiana Jones and the Last Crusade (1989)
4     4   3322                             Memento (2000)


The data consists of 105339 ratings applied to over 10329 movies. The average rating is 3.5 and minimum and maximum rating is 0.5 and 5 respectively. There are 668users who have given their ratings for 149532 movies.

In [114]:
import pandas as pd

# Load datasets
movies = pd.read_csv('Movies.csv')
ratings = pd.read_csv('Ratings.csv')

# Display first few rows to verify data loading
print("Movies dataset:")
print(movies.head())
print("\nRatings dataset:")
print(ratings.head())


Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings dataset:
   userId  movieId  rating   timestamp
0       1       16     4.0  1217897793
1       1       24     1.5  1217895807
2       1       32     4.0  1217896246
3       1       47     4.0  1217896556
4       1       50     4.0  1217896523


In [116]:
# Display basic information about movies dataset
print("Movies dataset info:")
print(movies.info())

# Summary statistics
print("\nSummary statistics of movies dataset:")
print(movies.describe())

# Unique movies and genres
unique_movies = movies['movieId'].nunique()
unique_genres = movies['genres'].nunique()

print(f"\nNumber of unique movies: {unique_movies}")
print(f"Number of unique genres: {unique_genres}")

# Top 10 movies by average rating
average_ratings = ratings.groupby('movieId')['rating'].mean()
top_movies = average_ratings.sort_values(ascending=False).head(10)
top_movies_titles = movies[movies['movieId'].isin(top_movies.index)]['title'].values

print("\nTop 10 movies by average rating:")
for i, title in enumerate(top_movies_titles, 1):
    print(f"{i}. {title}")


Movies dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB
None

Summary statistics of movies dataset:
             movieId
count   10329.000000
mean    31924.282893
std     37734.741149
min         1.000000
25%      3240.000000
50%      7088.000000
75%     59900.000000
max    149532.000000

Number of unique movies: 10329
Number of unique genres: 938

Top 10 movies by average rating:
1. Heaven & Earth (1993)
2. Fallen Angels (Duo luo tian shi) (1995)
3. Nine Lives of Fritz the Cat, The (1974)
4. Topkapi (1964)
5. Long Gray Line, The (1955)
6. Forbidden Zone (1980)
7. Enigma of Kaspar Hauser, The (a.k.a. Mystery of Kaspar Hauser, The) (Jeder für sich und Gott Gegen Alle) (1974)
8. Hunc

In [118]:
# Display basic information about ratings dataset
print("\nRatings dataset info:")
print(ratings.info())

# Summary statistics
print("\nSummary statistics of ratings dataset:")
print(ratings.describe())

# Average rating and total movies at genre level
# Assuming genres are pipe-separated and split into individual genres
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
genre_ratings = movies.merge(ratings, on='movieId')
genre_ratings = genre_ratings.explode('genres')

print("\nAverage rating and total movies at genre level:")
genre_stats = genre_ratings.groupby('genres').agg({'rating': ['mean', 'count']})
genre_stats.columns = ['AverageRating', 'TotalMovies']
print(genre_stats)

# Unique users and movies
unique_users = ratings['userId'].nunique()
total_ratings = len(ratings)

print(f"\nNumber of unique users: {unique_users}")
print(f"Total number of ratings: {total_ratings}")



Ratings dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB
None

Summary statistics of ratings dataset:
              userId        movieId         rating     timestamp
count  105339.000000  105339.000000  105339.000000  1.053390e+05
mean      364.924539   13381.312477       3.516850  1.130424e+09
std       197.486905   26170.456869       1.044872  1.802660e+08
min         1.000000       1.000000       0.500000  8.285650e+08
25%       192.000000    1073.000000       3.000000  9.711008e+08
50%       383.000000    2497.000000       3.500000  1.115154e+09
75%       557.000000    5991.000000       4.000000  1.275496e+09
max   

In [120]:
# Unique genres
unique_genres_list = movies['genres'].explode().unique()
print("\nUnique genres considered:")
print(unique_genres_list)



Unique genres considered:
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'IMAX' 'War'
 'Musical' 'Documentary' 'Western' 'Film-Noir' '(no genres listed)']


In [133]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load datasets
movies = pd.read_csv('Movies.csv')
ratings = pd.read_csv('Ratings.csv')

# Define functions for recommendation modules

# Popularity-based recommender system
def popularity_based_recommender(num_recommendations):
    # Calculate popularity based on average ratings and number of ratings
    movie_popularity = ratings.groupby('movieId').agg({'rating': ['mean', 'count']})
    movie_popularity.columns = ['AverageRating', 'NumRatings']

    # Filter movies with minimum number of ratings threshold (optional)
    # For example, keep movies with at least 100 ratings
    movie_popularity = movie_popularity[movie_popularity['NumRatings'] >= 100]

    # Sort by popularity (average rating and number of ratings)
    movie_popularity = movie_popularity.sort_values(by=['AverageRating', 'NumRatings'], ascending=False)

    # Get top N recommended movies
    top_movies = movie_popularity.head(num_recommendations)

    # Merge with movies dataset to get movie titles
    top_movies_titles = movies.merge(top_movies, on='movieId').sort_values(by='AverageRating', ascending=False)[['title', 'AverageRating', 'NumRatings']]

    return top_movies_titles

# Content-based recommender system
def content_based_recommender(movie_title, num_recommendations):
    # TF-IDF Vectorizer to convert text (genres) into numerical vectors
    tfidf = TfidfVectorizer(stop_words='english')
    movies['genres'] = movies['genres'].fillna('')

    # Fit and transform the TF-IDF vectorizer on genres
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    # Compute similarity scores (cosine similarity) between movies based on genres
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Get movie index
    movie_index = movies[movies['title'] == movie_title].index[0]

    # Get similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim[movie_index]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top N similar movies (excluding the movie itself)
    top_movie_indices = [i[0] for i in sim_scores[1:num_recommendations + 1]]
    top_movies = movies.iloc[top_movie_indices][['title', 'genres']]

    return top_movies

# Collaborative-based recommender system
def collaborative_based_recommender(user_id, num_recommendations, k_similar_users):
    # Create user-movie matrix
    user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    # Calculate cosine similarity between users
    user_sim_matrix = cosine_similarity(user_movie_matrix)
    user_sim_df = pd.DataFrame(user_sim_matrix, index=user_movie_matrix.index, columns=user_movie_matrix.index)

    # Get similarity scores for the target user
    sim_scores = user_sim_df[user_id]

    # Sort scores in descending order and get top K similar users
    similar_users = sim_scores.sort_values(ascending=False).head(k_similar_users + 1).index  # +1 to include the user itself
    similar_users = similar_users.drop(user_id)

    # Aggregate ratings from similar users
    similar_users_ratings = user_movie_matrix.loc[similar_users].mean(axis=0)

    # Remove movies already rated by the target user
    target_user_ratings = user_movie_matrix.loc[user_id]
    recommendations = similar_users_ratings[target_user_ratings == 0]

    # Sort by highest predicted rating
    recommendations = recommendations.sort_values(ascending=False).head(num_recommendations)

    # Get movie titles
    recommended_movie_ids = recommendations.index
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]

    return recommended_movies

# GUI Interface using ipywidgets

# Define widgets
movie_title_widget = widgets.Text(description='Movie Title:')
num_recommendations_widget = widgets.IntSlider(min=1, max=10, step=1, description='Num Recommendations:')
user_id_widget = widgets.IntSlider(min=1, max=668, step=1, description='User ID:')
k_similar_users_widget = widgets.IntSlider(min=1, max=200, step=1, description='K Similar Users:')

output_widget = widgets.Output()

# Define recommendation functions triggered by button clicks
def popularity_based(event):
    with output_widget:
        clear_output()
        recommendations = popularity_based_recommender(num_recommendations_widget.value)
        display(recommendations)

def content_based(event):
    with output_widget:
        clear_output()
        recommendations = content_based_recommender(movie_title_widget.value, num_recommendations_widget.value)
        display(recommendations)

def collaborative_based(event):
    with output_widget:
        clear_output()
        recommendations = collaborative_based_recommender(user_id_widget.value, num_recommendations_widget.value, k_similar_users_widget.value)
        display(recommendations)

# Create buttons to trigger recommendation modules
popularity_button = widgets.Button(description='Popularity Based')
popularity_button.on_click(popularity_based)

content_button = widgets.Button(description='Content Based')
content_button.on_click(content_based)

collaborative_button = widgets.Button(description='Collaborative Based')
collaborative_button.on_click(collaborative_based)

# Display widgets and output
display(widgets.VBox([movie_title_widget, num_recommendations_widget, content_button, popularity_button, collaborative_button,
                      user_id_widget, k_similar_users_widget, output_widget]))


VBox(children=(Text(value='', description='Movie Title:'), IntSlider(value=1, description='Num Recommendations…