In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("datasets/movies.csv")
ratings = pd.read_csv("datasets/ratings.csv")
tags = pd.read_csv("datasets/tags.csv")

movies.head(2)
ratings.head(2)
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [3]:
tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [4]:
merged_df = pd.merge(movies, tags, on='movieId')


movies_with_tags = merged_df.groupby(['movieId', 'title', 'genres'])['tag'].apply(lambda x: ', '.join(x)).reset_index()

movies_with_tags.rename(columns={'tag': 'tags'}, inplace=True)


In [5]:
movies_with_tags['genres'] = movies_with_tags['genres'].str.replace('|', ',', regex=False)  

In [6]:
movies_with_tags['content'] = movies_with_tags['genres'] + ', ' + movies_with_tags['tags']

In [7]:
movies_with_tags

Unnamed: 0,movieId,title,genres,tags,content
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy","pixar, pixar, fun","Adventure,Animation,Children,Comedy,Fantasy, p..."
1,2,Jumanji (1995),"Adventure,Children,Fantasy","fantasy, magic board game, Robin Williams, game","Adventure,Children,Fantasy, fantasy, magic boa..."
2,3,Grumpier Old Men (1995),"Comedy,Romance","moldy, old","Comedy,Romance, moldy, old"
3,5,Father of the Bride Part II (1995),Comedy,"pregnancy, remake","Comedy, pregnancy, remake"
4,7,Sabrina (1995),"Comedy,Romance",remake,"Comedy,Romance, remake"
...,...,...,...,...,...
1567,183611,Game Night (2018),"Action,Comedy,Crime,Horror","Comedy, funny, Rachel McAdams","Action,Comedy,Crime,Horror, Comedy, funny, Rac..."
1568,184471,Tomb Raider (2018),"Action,Adventure,Fantasy","adventure, Alicia Vikander, video game adaptation","Action,Adventure,Fantasy, adventure, Alicia Vi..."
1569,187593,Deadpool 2 (2018),"Action,Comedy,Sci-Fi","Josh Brolin, Ryan Reynolds, sarcasm","Action,Comedy,Sci-Fi, Josh Brolin, Ryan Reynol..."
1570,187595,Solo: A Star Wars Story (2018),"Action,Adventure,Children,Sci-Fi","Emilia Clarke, star wars","Action,Adventure,Children,Sci-Fi, Emilia Clark..."


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(movies_with_tags['content'])

In [9]:
from sklearn.metrics.pairwise import linear_kernel
content_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)


In [10]:
def get_content_based_recommendations(movie_title, top_n):
    index = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    similarity_scores = content_similarity[index]
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]
    recommendations = movies_with_tags.loc[similar_indices, 'title'].values.tolist()
    return recommendations

In [11]:
get_content_based_recommendations("Toy Story (1995)",4)

["Bug's Life, A (1998)", 'Toy Story 2 (1999)', 'Up (2009)', 'Sintel (2010)']

In [12]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [13]:
movies_with_title = 3

In [14]:
movies_with_ratings = pd.merge(ratings, movies, on='movieId')

In [15]:
movies_pivot= movies_with_ratings.pivot_table(columns='userId',index='title',values='rating') 
movies_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [16]:
movies_pivot.fillna(0,inplace=True)

In [17]:
from scipy.sparse import csr_matrix
movies_sparse = csr_matrix(movies_pivot)

In [18]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [19]:
model.fit(movies_sparse)

In [20]:
def get_collaborative_filtering_recommendations(movie_name,top_k):
    movie_id = np.where(movies_pivot.index == movie_name)[0][0]
    distances, suggestions = model.kneighbors(movies_pivot.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=top_k + 1)    
    recommended_movies = [movies_pivot.index[suggestions[0][i]] for i in range(1, len(suggestions[0]))]
    return recommended_movies


In [21]:
get_collaborative_filtering_recommendations("Toy Story (1995)",5)

['Toy Story 2 (1999)',
 'Mission: Impossible (1996)',
 'Independence Day (a.k.a. ID4) (1996)',
 "Bug's Life, A (1998)",
 'Nutty Professor, The (1996)']

In [22]:
def get_hybrid_recommendations( movie_title, top_n):
    content_based_recommendations = get_content_based_recommendations(movie_title, top_n)
    collaborative_filtering_recommendations = get_collaborative_filtering_recommendations(movie_title, top_n)
    hybrid_recommendations = list(set(content_based_recommendations + collaborative_filtering_recommendations))
    return hybrid_recommendations[:top_n]

In [23]:
get_hybrid_recommendations('Toy Story 2 (1999)',10)

['Antz (1998)',
 'Fantasia (1940)',
 'Galaxy Quest (1999)',
 'Aladdin (1992)',
 'Prince of Egypt, The (1998)',
 'Mulan (1998)',
 'Alice in Wonderland (1951)',
 'Batman Returns (1992)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 'Honey, I Shrunk the Kids (1989)']

In [24]:
movies_with_ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [25]:
from surprise import Dataset, Reader, SVD
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movies_with_ratings[['userId', 
                                  'movieId', 
                                  'rating']], reader)
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12a7ffcd580>

In [26]:
def get_collaborative_filtering_recommendations(user_id, top_n):
    testset = trainset.build_anti_testset()
    testset = filter(lambda x: x[0] == user_id, testset)
    predictions = algo.test(testset)
    predictions.sort(key=lambda x: x.est, reverse=True)
    recommendations = [prediction.iid for prediction in predictions[:top_n]]
    recommended_movies = movies[movies["movieId"].isin(recommendations)]
    recommended_titles = recommended_movies["title"].tolist()
    return recommended_titles

In [27]:
get_collaborative_filtering_recommendations(6,5)

['Go (1999)',
 'Green Mile, The (1999)',
 'City of God (Cidade de Deus) (2002)',
 'There Will Be Blood (2007)',
 'In Bruges (2008)']

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

In [39]:
movies.shape

(9742, 3)

In [None]:
ratings['userId'] -= 1
ratings['movieId'] -= 1

train, test = train_test_split(ratings, test_size=0.2, random_state=42)
unique_movie_ids = train['movieId'].unique()
# test = test[test['movieId'].isin(unique_movie_ids)]

(8983,)


In [42]:
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

user_embedding = Embedding(input_dim=ratings['userId'].nunique(), output_dim=50, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=ratings['movieId'].nunique(), output_dim=50, name='item_embedding')(item_input)

user_vector = Flatten()(user_embedding)
item_vector = Flatten()(item_embedding)

concat = Concatenate()([user_vector, item_vector])
dense_1 = Dense(128, activation='relu')(concat)
dense_2 = Dense(64, activation='relu')(dense_1)
output = Dense(1, activation='sigmoid')(dense_2)


In [43]:
model = Model([user_input, item_input], output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

In [44]:
# Prepare inputs for training
train_user = train['userId'].values
train_item = train['movieId'].values
train_rating = train['rating'].values / 5.0  # Normalize ratings

test_user = test['userId'].values
test_item = test['movieId'].values
test_rating = test['rating'].values / 5.0

# Train the model
history = model.fit(
    [train_user, train_item], train_rating,
    validation_data=([test_user, test_item], test_rating),
    epochs=10, batch_size=256, verbose=1
)


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node functional_1_1/item_embedding_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 640, in run_forever

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 1992, in _run_once

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py", line 88, in _run

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\Ayush\AppData\Local\Temp\ipykernel_14000\1310881490.py", line 11, in <module>

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 368, in fit

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 216, in function

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 129, in multi_step_on_iterator

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 110, in one_step_on_data

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 56, in train_step

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\layers\layer.py", line 899, in __call__

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\models\functional.py", line 182, in call

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\models\functional.py", line 632, in call

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\layers\layer.py", line 899, in __call__

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\ops\numpy.py", line 5239, in take

  File "c:\Users\Ayush\ML-Projects\MovieLens\.venv\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 2063, in take

indices[6,0] = 88807 is not in [0, 9724)
	 [[{{node functional_1_1/item_embedding_1/GatherV2}}]] [Op:__inference_multi_step_on_iterator_3621]