In [1]:
import pandas as pd

ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Preprocess the data
movies.dropna(inplace=True)  # remove rows with missing values

# Extract the features of the movies
genres = movies['genres'].str.get_dummies(sep='|')  # one-hot encode genres

# Combine the extracted features into a single dataframe
movie_features = genres

In [5]:
movie_features

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
62419,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
62420,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
62421,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
import numpy as np

# Normalize the movie feature vectors
movie_features = movie_features.apply(lambda x: x/np.linalg.norm(x))

# Compute the dot product between the movie feature vectors
dot_product = movie_features.dot(movie_features.T)

# Compute the cosine similarity between the movie feature vectors
cosine_similarity = dot_product / (np.linalg.norm(movie_features, axis=1) * np.linalg.norm(movie_features, axis=1).T)


In [7]:
dot_product

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62413,62414,62415,62416,62417,62418,62419,62420,62421,62422
0,0.001349,0.000948,0.000059,0.000059,0.000059,0.000000,0.000059,0.000582,0.000000,0.000241,...,0.000000,0.0,0.000000,0.000059,0.000059,0.000000,0.000000,0.000059,0.000000,0.000241
1,0.000948,0.000948,0.000000,0.000000,0.000000,0.000000,0.000000,0.000582,0.000000,0.000241,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000241
2,0.000059,0.000000,0.000189,0.000189,0.000059,0.000000,0.000189,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000059,0.000189,0.000000,0.000000,0.000059,0.000000,0.000000
3,0.000059,0.000000,0.000189,0.000228,0.000059,0.000000,0.000189,0.000000,0.000000,0.000000,...,0.000039,0.0,0.000000,0.000098,0.000189,0.000039,0.000000,0.000098,0.000000,0.000039
4,0.000059,0.000000,0.000059,0.000059,0.000059,0.000000,0.000059,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000059,0.000059,0.000000,0.000000,0.000059,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0.000000,0.000000,0.000000,0.000039,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000039,0.0,0.000000,0.000039,0.000000,0.000039,0.000000,0.000039,0.000000,0.000039
62419,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000178,0.000000,0.000000,0.000000
62420,0.000059,0.000000,0.000059,0.000098,0.000059,0.000000,0.000059,0.000000,0.000000,0.000000,...,0.000039,0.0,0.000000,0.000098,0.000059,0.000039,0.000000,0.000098,0.000000,0.000039
62421,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000198,0.000000,0.000000,0.000000,0.000000,0.000000,0.000198,0.000000


In [8]:
cosine_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62413,62414,62415,62416,62417,62418,62419,62420,62421,62422
0,1.000000,1.000000,0.313921,0.260122,1.0,0.000000,0.313921,1.000000,0.0,0.489460,...,0.0,0.0,0.0,0.602835,0.182436,0.0,0.0,0.602835,0.0,0.579383
1,0.702934,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.489460,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.579383
2,0.043947,0.000000,1.000000,0.828624,1.0,0.000000,1.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.602835,0.581152,0.0,0.0,0.602835,0.0,0.000000
3,0.043947,0.000000,1.000000,1.000000,1.0,0.000000,1.000000,0.000000,0.0,0.000000,...,1.0,0.0,0.0,1.000000,0.581152,1.0,0.0,1.000000,0.0,0.093788
4,0.043947,0.000000,0.313921,0.260122,1.0,0.000000,0.313921,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.602835,0.182436,0.0,0.0,0.602835,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0.000000,0.000000,0.000000,0.171376,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,1.0,0.0,0.0,0.397165,0.000000,1.0,0.0,0.397165,0.0,0.093788
62419,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000
62420,0.043947,0.000000,0.313921,0.431499,1.0,0.000000,0.313921,0.000000,0.0,0.000000,...,1.0,0.0,0.0,1.000000,0.182436,1.0,0.0,1.000000,0.0,0.093788
62421,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.000000,1.0,0.000000


In [9]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [10]:
# Select a user
user_id = 18

# Select a movie that the user has liked in the past
liked_movie_id = ratings[ratings['userId'] == user_id]['movieId'].values[0]

In [11]:
movie_features

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.000000,0.000000,0.015532,0.018477,0.018458,0.007699,0.0,0.000000,0.000000,0.019135,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.015532,0.000000,0.018458,0.000000,0.0,0.000000,0.000000,0.019135,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.007699,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.011382,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.007699,0.0,0.000000,0.006249,0.000000,0.0,0.0,0.0,0.0,0.0,0.011382,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.007699,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.006249,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
62419,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.013357,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
62420,0.000000,0.000000,0.000000,0.000000,0.000000,0.007699,0.0,0.000000,0.006249,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
62421,0.014055,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [12]:
# Find the index of the selected movie in the movie_features dataframe
liked_movie_index = movie_features.index[movies['movieId'] == liked_movie_id].tolist()[0]

In [13]:
# Find the top-N most similar movies to the selected movie
N = 10  # number of recommendations
similar_movie_indices = cosine_similarity[liked_movie_index].sort_values(ascending=False)[1:N+1].index.tolist()

  similar_movie_indices = cosine_similarity[liked_movie_index].sort_values(ascending=False)[1:N+1].index.tolist()


In [14]:
# Recommend the top-N most similar movies to the user
recommended_movie_ids = movies.iloc[similar_movie_indices]['movieId'].tolist()
recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
print(recommended_movies)

       movieId                                              title  \
661        673                                   Space Jam (1996)   
4780      4886                              Monsters, Inc. (2001)   
8571     26093  Wonderful World of the Brothers Grimm, The (1962)   
12969    65577                     Tale of Despereaux, The (2008)   
17686    92348      Puss in Boots (Nagagutsu o haita neko) (1969)   
22286   114240                                     Aladdin (1992)   
22633   115875          Toy Story Toons: Hawaiian Vacation (2011)   
22634   115879                  Toy Story Toons: Small Fry (2011)   
50013   180091            Pokémon the Movie: I Choose You! (2017)   
50701   181601                     Olaf's Frozen Adventure (2017)   

                                                  genres  
661    Adventure|Animation|Children|Comedy|Fantasy|Sc...  
4780         Adventure|Animation|Children|Comedy|Fantasy  
8571   Adventure|Animation|Children|Comedy|Drama|Fant...  
1296

In [15]:
def predict_movies(userId):
    # Select a movie that the user has liked in the past
    liked_movie_id = ratings[ratings['userId'] == userId]['movieId'].values[0]
    # Find the index of the selected movie in the movie_features dataframe
    liked_movie_index = movie_features.index[movies['movieId'] == liked_movie_id].tolist()[0]
    # Find the top-N most similar movies to the selected movie
    N = 10  # number of recommendations
    similar_movie_indices = cosine_similarity[liked_movie_index].sort_values(ascending=False)[1:N+1].index.tolist()
    # Recommend the top-N most similar movies to the user
    recommended_movie_ids = movies.iloc[similar_movie_indices]['movieId'].tolist()
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [18]:
predict_movies(125)

  similar_movie_indices = cosine_similarity[liked_movie_index].sort_values(ascending=False)[1:N+1].index.tolist()


Unnamed: 0,movieId,title,genres
661,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...
4780,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
8571,26093,"Wonderful World of the Brothers Grimm, The (1962)",Adventure|Animation|Children|Comedy|Drama|Fant...
12969,65577,"Tale of Despereaux, The (2008)",Adventure|Animation|Children|Comedy|Fantasy
17686,92348,Puss in Boots (Nagagutsu o haita neko) (1969),Adventure|Animation|Children|Comedy|Fantasy|Ro...
22286,114240,Aladdin (1992),Adventure|Animation|Children|Comedy|Fantasy
22633,115875,Toy Story Toons: Hawaiian Vacation (2011),Adventure|Animation|Children|Comedy|Fantasy
22634,115879,Toy Story Toons: Small Fry (2011),Adventure|Animation|Children|Comedy|Fantasy
50013,180091,Pokémon the Movie: I Choose You! (2017),Adventure|Animation|Children|Comedy|Drama|Fantasy
50701,181601,Olaf's Frozen Adventure (2017),Adventure|Animation|Children|Comedy|Fantasy


In [19]:
def predict_rating(userId, movieId):
    # Find the index of the selected movie in the movie_features dataframe
    movie_index = movie_features.index[movies['movieId'] == movieId].tolist()[0]
    # Find the top-N most similar movies to the selected movie
    N = 10  # number of recommendations
    similar_movie_indices = cosine_similarity[movie_index].sort_values(ascending=False)[1:N+1].index.tolist()
    # Select the movies that the user has rated in the past
    rated_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()
    # Calculate the average rating for the selected movies
    average_rating = ratings[ratings['movieId'].isin(rated_movies)]['rating'].mean()
    # Predict the rating for the selected movie
    prediction = average_rating
    return prediction

In [130]:
import numpy as np
from sklearn.metrics import mean_squared_error, precision_score
from sklearn.model_selection import train_test_split
# Split the ratings data into a training set and a test set
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2)

# Train your recommendation system using the training set
# (code to train the recommendation system goes here)

# Make predictions on the test set using the trained recommendation system
predicted_ratings = []
for _, row in ratings_test.iterrows():
    userId = row['userId']
    movieId = row['movieId']
    predicted_rating = predict_rating(userId, movieId)  # function to make predictions
    predicted_ratings.append(predicted_rating)

# Calculate MSE and RMSE
mse = mean_squared_error(ratings_test['rating'], predicted_ratings)
rmse = np.sqrt(mse)
# Calculate precision score
precision = precision_score(ratings_test['rating'], predicted_ratings)

print('MSE:', mse)
print('RMSE:', rmse)
print('Precision:', precision)

  similar_movie_indices = cosine_similarity[movie_index].sort_values(ascending=False)[1:N+1].index.tolist()


KeyboardInterrupt: 

In [21]:
from sklearn.metrics import mean_squared_error, precision_score
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(ratings, test_size=1000)
train_size = X_train.shape[0]
test_size = X_test.shape[0]
print("Test size:", test_size)
error = 0
for k in range(test_size): 
    u = X_test.iloc[k,0]
    i = X_test.iloc[k,1]
    r = X_test.iloc[k,2]
    error += np.abs(r - predict_rating(u,i))
print(error/test_size)

Test size: 1000


  similar_movie_indices = cosine_similarity[movie_index].sort_values(ascending=False)[1:N+1].index.tolist()


0.8067586674764351


In [22]:
#RMSE
np.sqrt(error/test_size)

0.8981974546147607