In [2]:
import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv("datasets/movies.csv")
ratings = pd.read_csv("datasets/ratings.csv")
tags = pd.read_csv("datasets/tags.csv")

movies.head(2)
ratings.head(2)
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [4]:
tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [5]:
merged_df = pd.merge(movies, tags, on='movieId')


movies_with_tags = merged_df.groupby(['movieId', 'title', 'genres'])['tag'].apply(lambda x: ', '.join(x)).reset_index()

movies_with_tags.rename(columns={'tag': 'tags'}, inplace=True)


In [6]:
movies_with_tags['genres'] = movies_with_tags['genres'].str.replace('|', ',', regex=False)  

In [7]:
movies_with_tags['content'] = movies_with_tags['genres'] + ', ' + movies_with_tags['tags']

In [8]:
movies_with_tags

Unnamed: 0,movieId,title,genres,tags,content
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy","pixar, pixar, fun","Adventure,Animation,Children,Comedy,Fantasy, p..."
1,2,Jumanji (1995),"Adventure,Children,Fantasy","fantasy, magic board game, Robin Williams, game","Adventure,Children,Fantasy, fantasy, magic boa..."
2,3,Grumpier Old Men (1995),"Comedy,Romance","moldy, old","Comedy,Romance, moldy, old"
3,5,Father of the Bride Part II (1995),Comedy,"pregnancy, remake","Comedy, pregnancy, remake"
4,7,Sabrina (1995),"Comedy,Romance",remake,"Comedy,Romance, remake"
...,...,...,...,...,...
1567,183611,Game Night (2018),"Action,Comedy,Crime,Horror","Comedy, funny, Rachel McAdams","Action,Comedy,Crime,Horror, Comedy, funny, Rac..."
1568,184471,Tomb Raider (2018),"Action,Adventure,Fantasy","adventure, Alicia Vikander, video game adaptation","Action,Adventure,Fantasy, adventure, Alicia Vi..."
1569,187593,Deadpool 2 (2018),"Action,Comedy,Sci-Fi","Josh Brolin, Ryan Reynolds, sarcasm","Action,Comedy,Sci-Fi, Josh Brolin, Ryan Reynol..."
1570,187595,Solo: A Star Wars Story (2018),"Action,Adventure,Children,Sci-Fi","Emilia Clarke, star wars","Action,Adventure,Children,Sci-Fi, Emilia Clark..."


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(movies_with_tags['content'])

In [10]:
from sklearn.metrics.pairwise import linear_kernel
content_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)


In [11]:
def get_content_based_recommendations(movie_title, top_n):
    index = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    similarity_scores = content_similarity[index]
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]
    recommendations = movies_with_tags.loc[similar_indices, 'title'].values.tolist()
    return recommendations

In [12]:
get_content_based_recommendations("Toy Story (1995)",4)

["Bug's Life, A (1998)", 'Toy Story 2 (1999)', 'Up (2009)', 'Sintel (2010)']

In [13]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [14]:
movies_with_title = 3

In [15]:
movies_with_ratings = pd.merge(ratings, movies, on='movieId')
movies_with_ratings = movies_with_ratings.sort_values(by=['userId', 'movieId', 'timestamp'], ascending=[True, True, False])
movies_with_ratings = movies_with_ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
movies_with_ratings = movies_with_ratings.reset_index(drop=True)
movies_with_ratings['release_year'] = movies_with_ratings['title'].str.extract(r'\((\d{4})\)')

In [16]:
movies_pivot= movies_with_ratings.pivot_table(columns='userId',index='title',values='rating') 
movies_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [17]:
movies_pivot.fillna(0,inplace=True)

In [18]:
from scipy.sparse import csr_matrix
movies_sparse = csr_matrix(movies_pivot)

In [19]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [20]:
model.fit(movies_sparse)

In [21]:
def get_collaborative_filtering_recommendations(movie_name,top_k):
    movie_id = np.where(movies_pivot.index == movie_name)[0][0]
    distances, suggestions = model.kneighbors(movies_pivot.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=top_k + 1)    
    recommended_movies = [movies_pivot.index[suggestions[0][i]] for i in range(1, len(suggestions[0]))]
    return recommended_movies


In [22]:
get_collaborative_filtering_recommendations("Toy Story (1995)",5)

['Toy Story 2 (1999)',
 'Mission: Impossible (1996)',
 'Independence Day (a.k.a. ID4) (1996)',
 "Bug's Life, A (1998)",
 'Nutty Professor, The (1996)']

In [23]:
def get_hybrid_recommendations( movie_title, top_n):
    content_based_recommendations = get_content_based_recommendations(movie_title, top_n)
    collaborative_filtering_recommendations = get_collaborative_filtering_recommendations(movie_title, top_n)
    hybrid_recommendations = list(set(content_based_recommendations + collaborative_filtering_recommendations))
    return hybrid_recommendations[:top_n]

In [24]:
get_hybrid_recommendations('Toy Story 2 (1999)',10)

['Fox and the Hound, The (1981)',
 'Alice in Wonderland (1951)',
 'Mulan (1998)',
 'Antz (1998)',
 'Galaxy Quest (1999)',
 'Peter Pan (1953)',
 'Prince of Egypt, The (1998)',
 'Aristocats, The (1970)',
 "Bug's Life, A (1998)",
 'Toy Story (1995)']

In [25]:
movies_with_ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,release_year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [26]:
from surprise import Dataset, Reader, SVD
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movies_with_ratings[['userId', 
                                  'movieId', 
                                  'rating']], reader)
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x148fabe0320>

In [27]:
def get_collaborative_filtering_recommendations(user_id, top_n):
    testset = trainset.build_anti_testset()
    testset = filter(lambda x: x[0] == user_id, testset)
    predictions = algo.test(testset)
    predictions.sort(key=lambda x: x.est, reverse=True)
    recommendations = [prediction.iid for prediction in predictions[:top_n]]
    recommended_movies = movies[movies["movieId"].isin(recommendations)]
    recommended_titles = recommended_movies["title"].tolist()
    return recommended_titles

In [28]:
get_collaborative_filtering_recommendations(6,5)

['Seventh Seal, The (Sjunde inseglet, Det) (1957)',
 'Patton (1970)',
 'Remember the Titans (2000)',
 'Gallipoli (1981)',
 'Dark Knight, The (2008)']

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from datetime import datetime


In [30]:
def compute_time_decay(timestamp, current_time, lambda_decay=0.001):
    time_difference = (current_time - timestamp) / (60 * 60 * 24)  # Convert to days
    decay_weight = np.exp(-lambda_decay * time_difference)
    return decay_weight

In [31]:
current_time = movies_with_ratings['timestamp'].max()
movies_with_ratings['decay_weight'] = movies_with_ratings['timestamp'].apply(
    lambda x: compute_time_decay(x, current_time)
)

In [32]:
movies_with_ratings['userId'] -= 1
movies_with_ratings['movieId'] -= 1

train, test = train_test_split(movies_with_ratings, test_size=0.2, random_state=42)

In [33]:
train['userId'], user_index = pd.factorize(train['userId'])
train['movieId'], movie_index = pd.factorize(train['movieId'])
test['userId'] = test['userId'].map(lambda x: user_index.get_loc(x) if x in user_index else -1)
test['movieId'] = test['movieId'].map(lambda x: movie_index.get_loc(x) if x in movie_index else -1)
test = test[(test['userId'] != -1) & (test['movieId'] != -1)]

In [34]:
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

user_embedding = Embedding(input_dim=movies_with_ratings['userId'].nunique(), output_dim=50, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=movies_with_ratings['movieId'].nunique(), output_dim=50, name='item_embedding')(item_input)

user_vector = Flatten()(user_embedding)
item_vector = Flatten()(item_embedding)

concat = Concatenate()([user_vector, item_vector])
dense_1 = Dense(128, activation='relu')(concat)
dense_2 = Dense(64, activation='relu')(dense_1)
output = Dense(1, activation='sigmoid')(dense_2)


In [35]:
model = Model([user_input, item_input], output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

In [36]:
train_user = train['userId'].values
train_item = train['movieId'].values
train_rating = train['rating'].values / 5.0  * train['decay_weight'].values 

test_user = test['userId'].values
test_item = test['movieId'].values
test_rating = test['rating'].values / 5.0 * test['decay_weight'].values

history = model.fit(
    [train_user, train_item], train_rating,
    validation_data=([test_user, test_item], test_rating),
    epochs=10, batch_size=256, verbose=1
)


Epoch 1/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0000e+00 - loss: 0.0486 - val_accuracy: 0.0000e+00 - val_loss: 0.0053
Epoch 2/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 0.0051 - val_accuracy: 0.0000e+00 - val_loss: 0.0049
Epoch 3/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 0.0038 - val_accuracy: 0.0000e+00 - val_loss: 0.0049
Epoch 4/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 0.0028 - val_accuracy: 0.0000e+00 - val_loss: 0.0047
Epoch 5/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 0.0023 - val_accuracy: 0.0000e+00 - val_loss: 0.0048
Epoch 6/10
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0000e+00 - loss: 0.0019 - val_accuracy: 0.0000e+00

In [37]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [39]:
def recommend_items(user_id, model, n=5):
    item_ids = np.arange(movies_with_ratings['movieId'].nunique())
    user_array = np.full(len(item_ids), user_id)
    predictions = model.predict([user_array, item_ids]).flatten()
    top_n_items = predictions.argsort()[-n:][::-1]
    recommended_movie_ids = movie_index[top_n_items]
    recommended_movies = movies_with_ratings[movies_with_ratings["movieId"].isin(recommended_movie_ids)]
    recommended_titles = recommended_movies["title"].tolist()
    return recommended_titles

# Example: Recommend top 5 items for user 0
print("Recommended items:", recommend_items(5, model, n=5))

[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Recommended items: ['Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)', "Dad's Army (1971)", "Won't You Be My Neighbor? (2018)", 'Sorry to Bother You (2018)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Blade Runner 2049 (2017)', 'Band of Brothers (2001)', 'Blade Runner 2049 (2017)']
