In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split


In [47]:
movies = pd.read_csv("dataset/small_dataset/movies_full_2.csv")
ratings = pd.read_csv("dataset/small_dataset/ratings.csv")
tags = pd.read_csv("dataset/small_dataset/tags.csv")
tags.drop(columns=['userId', 'timestamp'], inplace=True)

In [48]:
def create_weighted_rating_tags_df(movies_df, ratings_df, tags_df):
    movies_rating_user_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")

    movies_rating_df = movies_rating_user_df[['movieId', 'title', 'rating', 'genres', 'year', 'url']].groupby(['movieId', 'title', 'genres', 'year', 'url'])['rating'].agg(['count', 'mean']).round(1)
    movies_rating_df.sort_values('count', ascending=False, inplace=True)
    movies_rating_df.rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'}, inplace=True)

    C = round(ratings_df['rating'].mean(), 2)
    m = 500
    movies_rating_df['Bayesian_rating'] = (movies_rating_df['Num_ratings'] / (movies_rating_df['Num_ratings'] + m)) * movies_rating_df['Average_rating'] + (m / (movies_rating_df['Num_ratings'] + m)) * C
    movies_rating_df.drop(columns='Average_rating', inplace=True)
    movies_rating_df.rename(columns={'Num_ratings' : 'count', 'Bayesian_rating' : 'weighted_rating'}, inplace=True)
    movies_rating_df.reset_index(inplace=True)
    

    movies_rating_tags_df = pd.merge(movies_rating_df, tags_df, how='left', on='movieId')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].fillna(value='')
    movies_rating_tags_df = movies_rating_tags_df.groupby(['movieId', 'title', 'genres', 'year', 'url', 'count', 'weighted_rating'])['tag'].apply(list).reset_index()
    movies_rating_tags_df['genres'] = movies_rating_tags_df['genres'].str.split('|')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].apply(lambda x: [] if x == [float('nan')] else x)
    movies_rating_tags_df.sort_values(by='weighted_rating', ascending=False, inplace=True)
    return movies_rating_tags_df


movies_rating_tags_df = create_weighted_rating_tags_df(movies, ratings, tags)
movies_rating_tags_df.head(20)

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating,tag
276,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,317,3.849204,"[prison, Stephen King, wrongful imprisonment, ..."
313,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,https://m.media-amazon.com/images/M/MV5BNWIwOD...,329,3.777805,"[shrimp, Vietnam, bubba gump shrimp, lieutenan..."
256,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,https://m.media-amazon.com/images/M/MV5BNGNhMD...,307,3.766295,"[good dialogue, great soundtrack, non-linear, ..."
509,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,https://m.media-amazon.com/images/M/MV5BNjNhZT...,279,3.750706,"[Hannibal Lector, disturbing, drama, gothic, p..."
1932,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BNzQzOT...,278,3.750129,"[martial arts, sci-fi, alternate universe, phi..."
2217,2959,Fight Club,"[Action, Crime, Drama, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BMmEzNT...,218,3.742897,"[dark comedy, psychology, thought-provoking, t..."
224,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977.0,https://m.media-amazon.com/images/M/MV5BNzVlY2...,251,3.733955,"[classic, space action, action, sci-fi, EPIC, ..."
657,858,"Godfather, The","[Crime, Drama]",1972.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,192,3.721965,[Mafia]
460,527,Schindler's List,"[Drama, War]",1993.0,https://m.media-amazon.com/images/M/MV5BNDE4OT...,220,3.713889,"[moving, thought-provoking, Holocaust, based o..."
892,1196,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980.0,https://m.media-amazon.com/images/M/MV5BYmU1ND...,211,3.707736,"[I am your father, space, space opera, classic..."


In [49]:
print(movies_rating_tags_df.iloc[2137])

movieId                                                       157110
title                           00 Schneider - Jagd auf Nihil Baxter
genres                                               [Comedy, Crime]
year                                                          1994.0
url                https://m.media-amazon.com/images/M/MV5BOTQzMz...
count                                                              1
weighted_rating                                             3.501996
tag                                                               []
Name: 8954, dtype: object


# Knowledge-based

In [50]:
def knowledge_based_recommendation(df, user_preferences):
    # Filter movies based on user's preferred genres
    recommended_movies = df[df['genres'].apply(lambda x: any(genre in user_preferences['preferred_genres'] for genre in x))]
    
    # Remove movies with disliked genres
    if user_preferences['disliked_genres']:
        recommended_movies = recommended_movies[~recommended_movies['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))]
    
    # Return only the movie IDs of the top 3 recommendations
    return recommended_movies.head(3)['movieId'].tolist()

user_preferences = {'preferred_genres': ['Action', 'Adventure'], 'disliked_genres': ['Crime']}
k_recommendations_ids = knowledge_based_recommendation(movies_rating_tags_df, user_preferences)
print(k_recommendations_ids)



[2571, 260, 1196]


In [51]:
print(movies_rating_tags_df.loc[k_recommendations_ids])

      movieId                       title            genres    year  \
2571     3450              Grumpy Old Men          [Comedy]  1993.0   
260       301  Picture Bride (Bijo photo)  [Drama, Romance]  1994.0   
1196     1600             She's So Lovely  [Drama, Romance]  1997.0   

                                                    url  count  \
2571  https://m.media-amazon.com/images/M/MV5BMzNiYz...     29   
260   https://m.media-amazon.com/images/M/MV5BY2M2NG...      1   
1196  https://m.media-amazon.com/images/M/MV5BZmU1ZT...      2   

      weighted_rating tag  
2571         3.489036  []  
260          3.500998  []  
1196         3.500000  []  


# Content-based

In [52]:
def content_based_recommendation(df, user_preferences):
    # Combine genres and tags to create a single text representation for each movie
    df['features'] = df['genres'] + df['tag']
    df['features'] = df['features'].apply(lambda x: ' '.join(x))
    
    # Create TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the TF-IDF vectorizer
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])
    
    # Calculate cosine similarity between movies
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # Get indices of movies similar to the user's liked movies
    similar_movies_indices = []
    for movie_title in user_preferences['liked_movies']:
        idx = df.index[df['title'] == movie_title][0]
        similar_movies_indices.extend(cosine_sim[idx].argsort()[-2:-11:-1])  # Get top 10 similar movies
    
    # Remove duplicates and movies already liked by the user
    similar_movies_indices = list(set(similar_movies_indices) - set(df.index[df['title'].isin(user_preferences['liked_movies'])]))
    
    # # Return top 3 recommended movies
    # recommended_movies = df.loc[similar_movies_indices[:3]]
    # return recommended_movies
    return similar_movies_indices[:5]

# User Input (User's liked movies)
user_preferences = {'liked_movies': ['Shawshank Redemption, The', 'Toy Story']}

# Get content-based recommendations with genres and tags
content_based_recommendations_ids = content_based_recommendation(movies_rating_tags_df, user_preferences)
print(content_based_recommendations_ids)

print(movies_rating_tags_df.loc[content_based_recommendations_ids])

# # Display recommended movies
# print("Content-Based Recommendations with Genres and Tags:")
# print(content_based_recommendations[['title', 'genres', 'tag', 'year']])


[1089, 841, 393, 3593, 204]
      movieId                                        title  \
1089     1423                             Hearts and Minds   
841      1116             Single Girl, A (Fille seule, La)   
393       453                            For Love or Money   
3593     4957                                Sudden Impact   
204       238  Far From Home: The Adventures of Yellow Dog   

                     genres    year  \
1089                [Drama]  1996.0   
841                 [Drama]  1995.0   
393       [Comedy, Romance]  1993.0   
3593      [Crime, Thriller]  1983.0   
204   [Adventure, Children]  1995.0   

                                                    url  count  \
1089  https://m.media-amazon.com/images/M/MV5BYzMzMG...      1   
841   https://m.media-amazon.com/images/M/MV5BYWI4OW...      1   
393   https://m.media-amazon.com/images/M/MV5BYjg1NG...      5   
3593  https://m.media-amazon.com/images/M/MV5BYTA2Nz...      2   
204   https://m.media-amazon.com/i

# collaborative-filtering

In [53]:
# Function to create the utility matrix
def create_utility_matrix(df):
    utility_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    return utility_matrix

# Function for collaborative filtering recommendation using kNN
def collaborative_filtering_recommendation(df, user_id, k, num_recommendations):
    # Create the utility matrix
    utility_matrix = create_utility_matrix(df)
    
    # Calculate cosine similarity between users
    user_similarity = cosine_similarity(utility_matrix)
    
    # Find k nearest neighbors for the target user
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(user_similarity)
    _, indices = knn.kneighbors([user_similarity[user_id - 1]])
    
    # Get the ratings of the nearest neighbors
    neighbor_ratings = utility_matrix.iloc[indices[0]]
    
    # Calculate the average rating for each item
    item_ratings = neighbor_ratings.mean(axis=0)
    
    # Filter out items already rated by the target user
    user_ratings = utility_matrix.loc[user_id]
    recommended_items = item_ratings[user_ratings == 0].sort_values(ascending=False)

    
    return recommended_items.index.tolist()[:num_recommendations]  

# User Input (User ID and number of nearest neighbors)
user_id = 1
k_neighbors = 3
num_recommendations = 6

# Get collaborative filtering recommendations using kNN
collaborative_filtering_recommendations_ids = collaborative_filtering_recommendation(ratings, user_id, k_neighbors, num_recommendations)

print(collaborative_filtering_recommendations_ids)

collaborative_filtering_recommendations_df = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(collaborative_filtering_recommendations)]

# Display recommended movieIds
print("Collaborative Filtering Recommendations:")
print(collaborative_filtering_recommendations_df)

[1199, 3994, 1923, 4993, 1200, 1215]
Collaborative Filtering Recommendations:
      movieId                                              title  \
3622     4993  Lord of the Rings: The Fellowship of the Ring,...   
896      1200                                             Aliens   
895      1199                                             Brazil   
910      1215                                   Army of Darkness   
1398     1923                       There's Something About Mary   
2969     3994                                        Unbreakable   

                                            genres    year  \
3622                          [Adventure, Fantasy]  2001.0   
896            [Action, Adventure, Horror, Sci-Fi]  1986.0   
895                              [Fantasy, Sci-Fi]  1985.0   
910   [Action, Adventure, Comedy, Fantasy, Horror]  1993.0   
1398                             [Comedy, Romance]  1998.0   
2969                               [Drama, Sci-Fi]  2000.0   

          

# Hybrid-filtering

In [55]:
def hybrid_based_recommendation(knowledge_recommendations_ids, content_recommendations_ids, collaborative_recommendations_ids):

    recommendations = knowledge_recommendations_ids+content_recommendations_ids+collaborative_recommendations_ids
    
    return recommendations
    
recommendations_hybrid = hybrid_based_recommendation(k_recommendations_ids,content_based_recommendations_ids, collaborative_filtering_recommendations_ids)
print(recommendations_hybrid)


[2571, 260, 1196, 1089, 841, 393, 3593, 204, 1199, 3994, 1923, 4993, 1200, 1215]


In [56]:
print(movies_rating_tags_df.loc[recommendations_hybrid])

      movieId                                        title  \
2571     3450                               Grumpy Old Men   
260       301                   Picture Bride (Bijo photo)   
1196     1600                              She's So Lovely   
1089     1423                             Hearts and Minds   
841      1116             Single Girl, A (Fille seule, La)   
393       453                            For Love or Money   
3593     4957                                Sudden Impact   
204       238  Far From Home: The Adventures of Yellow Dog   
1199     1603                                        Mimic   
3994     5669                        Bowling for Columbine   
1923     2559                              King and I, The   
4993     7815                                 True Stories   
1200     1604                                  Money Talks   
1215     1621                                    Soul Food   

                          genres    year  \
2571                    [

# Debugging


611
