In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split


In [19]:
movies = pd.read_csv("dataset/small_dataset/movies_full_2.csv")
ratings = pd.read_csv("dataset/small_dataset/ratings.csv")
tags = pd.read_csv("dataset/small_dataset/tags.csv")
tags.drop(columns=['userId', 'timestamp'], inplace=True)

In [20]:
def create_weighted_rating_tags_df(movies_df, ratings_df, tags_df):
    movies_rating_user_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")

    movies_rating_df = movies_rating_user_df[['movieId', 'title', 'rating', 'genres', 'year', 'url']].groupby(['movieId', 'title', 'genres', 'year', 'url'])['rating'].agg(['count', 'mean']).round(1)
    movies_rating_df.sort_values('count', ascending=False, inplace=True)
    movies_rating_df.rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'}, inplace=True)

    C = round(ratings_df['rating'].mean(), 2)
    m = 500
    movies_rating_df['Bayesian_rating'] = (movies_rating_df['Num_ratings'] / (movies_rating_df['Num_ratings'] + m)) * movies_rating_df['Average_rating'] + (m / (movies_rating_df['Num_ratings'] + m)) * C
    movies_rating_df.drop(columns='Average_rating', inplace=True)
    movies_rating_df.rename(columns={'Num_ratings' : 'count', 'Bayesian_rating' : 'weighted_rating'}, inplace=True)
    movies_rating_df.reset_index(inplace=True)
    

    movies_rating_tags_df = pd.merge(movies_rating_df, tags_df, how='left', on='movieId')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].fillna(value='')
    movies_rating_tags_df = movies_rating_tags_df.groupby(['movieId', 'title', 'genres', 'year', 'url', 'count', 'weighted_rating'])['tag'].apply(list).reset_index()
    movies_rating_tags_df['genres'] = movies_rating_tags_df['genres'].str.split('|')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].apply(lambda x: [] if x == [float('nan')] else x)
    movies_rating_tags_df.sort_values(by='weighted_rating', ascending=False, inplace=True)
    return movies_rating_tags_df


movies_rating_tags_df = create_weighted_rating_tags_df(movies, ratings, tags)
movies_rating_tags_df.head(20)

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating,tag
276,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,317,3.849204,"[prison, Stephen King, wrongful imprisonment, ..."
313,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,https://m.media-amazon.com/images/M/MV5BNWIwOD...,329,3.777805,"[shrimp, Vietnam, bubba gump shrimp, lieutenan..."
256,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,https://m.media-amazon.com/images/M/MV5BNGNhMD...,307,3.766295,"[good dialogue, great soundtrack, non-linear, ..."
509,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,https://m.media-amazon.com/images/M/MV5BNjNhZT...,279,3.750706,"[Hannibal Lector, disturbing, drama, gothic, p..."
1932,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BNzQzOT...,278,3.750129,"[martial arts, sci-fi, alternate universe, phi..."
2217,2959,Fight Club,"[Action, Crime, Drama, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BMmEzNT...,218,3.742897,"[dark comedy, psychology, thought-provoking, t..."
224,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977.0,https://m.media-amazon.com/images/M/MV5BNzVlY2...,251,3.733955,"[classic, space action, action, sci-fi, EPIC, ..."
657,858,"Godfather, The","[Crime, Drama]",1972.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,192,3.721965,[Mafia]
460,527,Schindler's List,"[Drama, War]",1993.0,https://m.media-amazon.com/images/M/MV5BNDE4OT...,220,3.713889,"[moving, thought-provoking, Holocaust, based o..."
892,1196,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980.0,https://m.media-amazon.com/images/M/MV5BYmU1ND...,211,3.707736,"[I am your father, space, space opera, classic..."


In [21]:
print(movies_rating_tags_df.iloc[2137])

movieId                                                       157110
title                           00 Schneider - Jagd auf Nihil Baxter
genres                                               [Comedy, Crime]
year                                                          1994.0
url                https://m.media-amazon.com/images/M/MV5BOTQzMz...
count                                                              1
weighted_rating                                             3.501996
tag                                                               []
Name: 8954, dtype: object


# Knowledge-based

In [22]:
def knowledge_based_recommendation(df, user_preferences):
    # Filter movies based on user's preferred genres
    recommended_movies = df[df['genres'].apply(lambda x: any(genre in user_preferences['preferred_genres'] for genre in x))]
    
    # Remove movies with disliked genres
    if user_preferences['disliked_genres']:
        recommended_movies = recommended_movies[~recommended_movies['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))]
    
    return recommended_movies.head(3)  # Return top 3 recommendations


user_preferences = {'preferred_genres': ['Action', 'Adventure'], 'disliked_genres': ['Crime']}

print(knowledge_based_recommendation(movies_rating_tags_df, user_preferences))

      movieId                                           title  \
1932     2571                                     Matrix, The   
224       260              Star Wars: Episode IV - A New Hope   
892      1196  Star Wars: Episode V - The Empire Strikes Back   

                           genres    year  \
1932   [Action, Sci-Fi, Thriller]  1999.0   
224   [Action, Adventure, Sci-Fi]  1977.0   
892   [Action, Adventure, Sci-Fi]  1980.0   

                                                    url  count  \
1932  https://m.media-amazon.com/images/M/MV5BNzQzOT...    278   
224   https://m.media-amazon.com/images/M/MV5BNzVlY2...    251   
892   https://m.media-amazon.com/images/M/MV5BYmU1ND...    211   

      weighted_rating                                                tag  
1932         3.750129  [martial arts, sci-fi, alternate universe, phi...  
224          3.733955  [classic, space action, action, sci-fi, EPIC, ...  
892          3.707736  [I am your father, space, space opera, classi

# Content-based

In [23]:
def content_based_recommendation(df, user_preferences):
    # Combine genres and tags to create a single text representation for each movie
    df['features'] = df['genres'] + df['tag']
    df['features'] = df['features'].apply(lambda x: ' '.join(x))
    
    # Create TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the TF-IDF vectorizer
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])
    
    # Calculate cosine similarity between movies
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # Get indices of movies similar to the user's liked movies
    similar_movies_indices = []
    for movie_title in user_preferences['liked_movies']:
        idx = df.index[df['title'] == movie_title][0]
        similar_movies_indices.extend(cosine_sim[idx].argsort()[-2:-11:-1])  # Get top 10 similar movies
    
    # Remove duplicates and movies already liked by the user
    similar_movies_indices = list(set(similar_movies_indices) - set(df.index[df['title'].isin(user_preferences['liked_movies'])]))
    
    # Return top 3 recommended movies
    recommended_movies = df.loc[similar_movies_indices[:3]]
    return recommended_movies


# User Input (User's liked movies)
user_preferences = {'liked_movies': ['Shawshank Redemption, The', 'Toy Story']}

# Get content-based recommendations with genres and tags
content_based_recommendations = content_based_recommendation(movies_rating_tags_df, user_preferences)

# Display recommended movies
print("Content-Based Recommendations with Genres and Tags:")
print(content_based_recommendations[['title', 'genres', 'tag', 'year']])


Content-Based Recommendations with Genres and Tags:
                                 title             genres  \
1089                  Hearts and Minds            [Drama]   
841   Single Girl, A (Fille seule, La)            [Drama]   
393                  For Love or Money  [Comedy, Romance]   

                              tag    year  
1089  [In Netflix queue, Vietnam]  1996.0  
841                            []  1995.0  
393                            []  1993.0  


# collaborative-filtering

In [24]:
# Function to create the utility matrix
def create_utility_matrix(df):
    utility_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    return utility_matrix

# Function for collaborative filtering recommendation using kNN
def collaborative_filtering_recommendation(df, user_id, k, num_recommendations):
    # Create the utility matrix
    utility_matrix = create_utility_matrix(df)
    
    # Calculate cosine similarity between users
    user_similarity = cosine_similarity(utility_matrix)
    
    # Find k nearest neighbors for the target user
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(user_similarity)
    _, indices = knn.kneighbors([user_similarity[user_id - 1]])
    
    # Get the ratings of the nearest neighbors
    neighbor_ratings = utility_matrix.iloc[indices[0]]
    
    # Calculate the average rating for each item
    item_ratings = neighbor_ratings.mean(axis=0)
    
    # Filter out items already rated by the target user
    user_ratings = utility_matrix.loc[user_id]
    recommended_items = item_ratings[user_ratings == 0].sort_values(ascending=False)

    
    return recommended_items.index.tolist()[:num_recommendations]  

# User Input (User ID and number of nearest neighbors)
user_id = 1
k_neighbors = 3
num_recommendations = 6

# Get collaborative filtering recommendations using kNN
collaborative_filtering_recommendations = collaborative_filtering_recommendation(ratings, user_id, k_neighbors, num_recommendations)

print(collaborative_filtering_recommendations)

collaborative_filtering_recommendations_df = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(collaborative_filtering_recommendations)]

# Display recommended movieIds
print("Collaborative Filtering Recommendations:")
print(collaborative_filtering_recommendations_df)

[1199, 3994, 1923, 4993, 1200, 1215]
Collaborative Filtering Recommendations:
      movieId                                              title  \
3622     4993  Lord of the Rings: The Fellowship of the Ring,...   
896      1200                                             Aliens   
895      1199                                             Brazil   
910      1215                                   Army of Darkness   
1398     1923                       There's Something About Mary   
2969     3994                                        Unbreakable   

                                            genres    year  \
3622                          [Adventure, Fantasy]  2001.0   
896            [Action, Adventure, Horror, Sci-Fi]  1986.0   
895                              [Fantasy, Sci-Fi]  1985.0   
910   [Action, Adventure, Comedy, Fantasy, Horror]  1993.0   
1398                             [Comedy, Romance]  1998.0   
2969                               [Drama, Sci-Fi]  2000.0   

          

# Hybrid-filtering

In [25]:
def hybrid_based_recommendation(knowledge_recommendations, content_recommendations, collaborative_recommendations):
    return knowledge_recommendations+content_recommendations+collaborative_recommendations

# Deep learning

In [26]:
# Prepare data
user_ids = ratings['userId'].values
item_ids = ratings['movieId'].values
ratings = ratings['rating'].values

num_users = len(np.unique(user_ids))
num_items = len(np.unique(item_ids))
embedding_size = 50  # Embedding dimensionality

# Split data into train and validation sets
X_train_user, X_val_user, X_train_item, X_val_item, y_train, y_val = train_test_split(
    user_ids, item_ids, ratings, test_size=0.2, random_state=42)

# Define model architecture
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
item_embedding = Embedding(input_dim=num_items, output_dim=embedding_size)(item_input)

user_flat = Flatten()(user_embedding)
item_flat = Flatten()(item_embedding)

concat = Concatenate()([user_flat, item_flat])

fc1 = Dense(64, activation='relu')(concat)
fc2 = Dense(32, activation='relu')(fc1)

output = Dense(1, activation='sigmoid')(fc2)

model = Model(inputs=[user_input, item_input], outputs=output)

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit([X_train_user, X_train_item], y_train, 
                    batch_size=64, epochs=5, 
                    validation_data=([X_val_user, X_val_item], y_val))

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node model_1/embedding_3/embedding_lookup defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\João\AppData\Roaming\Python\Python311\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 607, in run_forever

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\asyncio\events.py", line 80, in _run

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\João\AppData\Local\Temp\ipykernel_15460\422169729.py", line 37, in <module>

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1150, in train_step

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 590, in __call__

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\functional.py", line 515, in call

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\functional.py", line 672, in _run_internal_graph

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\João\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\layers\core\embedding.py", line 272, in call

indices[0,0] = 54276 is not in [0, 9436)
	 [[{{node model_1/embedding_3/embedding_lookup}}]] [Op:__inference_train_function_2857]

In [None]:
def get_recommendations(user_id, model, item_ids, top_n=10):
    # Repeat the user ID for all items
    user_input = np.array([user_id] * len(item_ids))
    # Get predictions for the user-item pairs
    predictions = model.predict([user_input, item_ids])
    # Sort predictions in descending order and get indices of top N items
    top_indices = np.argsort(predictions.flatten())[::-1][:top_n]
    # Return the top N recommended item IDs
    return item_ids[top_indices]

# Assuming you have a trained model named 'model' and DataFrame named 'ratings_df'
# Also assuming you have a list of unique movie IDs named 'unique_movie_ids'
# Replace 'user_id_to_test' with the ID of the user you want to generate recommendations for

user_id_to_test = 123  # Replace with the actual user ID

# Assuming you have already loaded your ratings data and split it into train and validation sets
# Assuming you also have a list of all unique movie IDs named 'all_movie_ids'

# Filter out movies already rated by the user
all_movie_ids = ratings['userId'].unique().tolist()
user_movie_ids = ratings[ratings['userId'] == user_id_to_test]['movieId'].values
unrated_movie_ids = np.setdiff1d(all_movie_ids, user_movie_ids)

# Generate recommendations for the user
recommendations = get_recommendations(user_id_to_test, model, unrated_movie_ids)

print("Top recommendations for user", user_id_to_test, ":")
for movie_id in recommendations:
    movie_title = movies_rating_tags_df[movies_rating_tags_df['movieId'] == movie_id]['title'].values[0]
    print(movie_title)

# Debugging


In [None]:

print(ratings['userId'].unique().tolist())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22