In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Dot, Flatten, Dense


In [4]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [6]:
df = ratings.merge(movies, on='movieId')
df = df[['userId', 'movieId', 'rating', 'title', 'genres']]
df.dropna(inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [8]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))


In [10]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [12]:
def recommend_movies(title, cosine_sim=cosine_sim):
    idx = movies.loc[movies['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]


In [14]:
user_ids = df['userId'].unique().tolist()
movie_ids = df['movieId'].unique().tolist()

user_to_index = {user_id: i for i, user_id in enumerate(user_ids)}
movie_to_index = {movie_id: i for i, movie_id in enumerate(movie_ids)}

df['user'] = df['userId'].map(user_to_index)
df['movie'] = df['movieId'].map(movie_to_index)


In [16]:
num_users = len(user_ids)
num_movies = len(movie_ids)

# User input layer
user_input = Input(shape=(1,))
user_embedding = Embedding(num_users, 50)(user_input)
user_vec = Flatten()(user_embedding)

# Movie input layer
movie_input = Input(shape=(1,))
movie_embedding = Embedding(num_movies, 50)(movie_input)
movie_vec = Flatten()(movie_embedding)

# Dot product of user and movie vectors
dot_product = Dot(axes=1)([user_vec, movie_vec])

# Output layer
output = Dense(1)(dot_product)

# Build and compile model
model = Model([user_input, movie_input], output)
model.compile(optimizer='adam', loss='mean_squared_error')


In [18]:
X = df[['user', 'movie']]
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
history = model.fit(
    [X_train['user'], X_train['movie']],
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1
)


Epoch 1/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - loss: 10.8881 - val_loss: 1.9408
Epoch 2/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - loss: 1.3610 - val_loss: 1.1740
Epoch 3/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - loss: 0.7649 - val_loss: 1.0652
Epoch 4/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - loss: 0.5634 - val_loss: 1.0492
Epoch 5/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - loss: 0.4262 - val_loss: 1.0643
Epoch 6/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - loss: 0.3192 - val_loss: 1.0875
Epoch 7/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - loss: 0.2416 - val_loss: 1.1169
Epoch 8/10
[1m1135/1135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - loss: 0.1867 - val_loss: 1.1265
Epoch 9

In [22]:
loss = model.evaluate([X_test['user'], X_test['movie']], y_test)
print(f'Test Loss: {loss}')


[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.1279
Test Loss: 1.1320723295211792


In [24]:
def recommend_for_user(user_id, num_recommendations=10):
    user_index = user_to_index[user_id]
    movie_indices = np.array(list(movie_to_index.values()))
    predicted_ratings = model.predict([np.full_like(movie_indices, user_index), movie_indices]).flatten()
    recommended_movie_indices = np.argsort(predicted_ratings)[-num_recommendations:][::-1]
    recommended_movies = [movies.iloc[i]['title'] for i in recommended_movie_indices]
    return recommended_movies


In [36]:
recommend_for_user(10)


[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


['Right Stuff, The (1983)',
 'Emma (1996)',
 'Mediterraneo (1991)',
 'Dead Poets Society (1989)',
 'Skipped Parts (2000)',
 'Firewalker (1986)',
 'Fried Green Tomatoes (1991)',
 'Sting, The (1973)',
 'Army of Darkness (1993)',
 'My Left Foot (1989)']