In [1]:
import pandas as pd

ratings = pd.read_csv('data/ml-32m/ratings.csv', dtype={'userId': 'int64', 'movieId': 'int64', 'rating': 'float64'})
movies = pd.read_csv('data/ml-32m/movies.csv', dtype={'movieId': 'int64', 'title': 'str', 'genres': 'str'})

In [18]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [19]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
print(ratings.isnull().sum())
print()
print(movies.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

movieId    0
title      0
genres     0
dtype: int64


In [3]:
data = pd.merge(ratings, movies, on='movieId')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,17,4.0,944249077,Sense and Sensibility (1995),Drama|Romance
1,1,25,1.0,944250228,Leaving Las Vegas (1995),Drama|Romance
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [28]:
print(ratings['userId'].nunique(), "users")
print(ratings['movieId'].nunique(), "movies")

200948 users
84432 movies


In [4]:
from scipy.sparse import csr_matrix

user_movie_matrix_sparse = csr_matrix((ratings['rating'], 
                                        (ratings['userId'], 
                                         ratings['movieId'])))
user_movie_matrix_sparse

<200949x292758 sparse matrix of type '<class 'numpy.float64'>'
	with 32000204 stored elements in Compressed Sparse Row format>

In [5]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)

trainSet, testSet = train_test_split(data, test_size=0.2, random_state=27)

model = SVD()
model.fit(trainSet)

predictions = model.test(testSet)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.7712
RMSE: 0.7711562472787986


In [12]:
import pickle

with open("model_31-10-24.pkl", "rb") as file:
    model = pickle.load(file)

<class 'list'>


In [7]:
wanted_genres = ['Action','Drama','Thriller','Comedy']
unwanted_genres = ['Fantasy']

min_genre_match = 3

def genre_match(row, genres):
    movie_genres = set(row.split('|'))
    return len(movie_genres.intersection(genres)) >= (min_genre_match if len(genres) >= min_genre_match else len(genres))

filtered_movies = movies[
    movies['genres'].apply(lambda x: genre_match(x, wanted_genres)) & 
    ~movies['genres'].str.contains('|'.join(unwanted_genres))
]

# randomize to prevent oldest first?

# liked_movies = [205383,293]
# liked_of_recommended_movies = filtered_movies[filtered_movies['movieId'].isin(liked_movies)]
# print(liked_of_recommended_movies)

recommended_movies = []
for movie_id in filtered_movies['movieId'].values:
    predicted_rating = model.predict(uid=0, iid=movie_id)
    recommended_movies.append((movie_id, predicted_rating.est))

recommended_movies.sort(key=lambda x: x[1], reverse=True)

top_recommendations = filtered_movies[filtered_movies['movieId'].isin([x[0] for x in recommended_movies[:10]])]
print(top_recommendations[['title','genres']])

                                                   title  \
289    Léon: The Professional (a.k.a. The Professiona...   
2867                                   Fight Club (1999)   
5671         Professional, The (Le professionnel) (1981)   
5905                 City of God (Cidade de Deus) (2002)   
9757                    Knockin' on Heaven's Door (1997)   
11622          Great War, The (Grande guerra, La) (1959)   
11923                Elite Squad (Tropa de Elite) (2007)   
12577                           Absolute Giganten (1999)   
14939                                   Inception (2010)   
23074                                  Wild Tales (2014)   

                                                genres  
289                        Action|Crime|Drama|Thriller  
2867                       Action|Crime|Drama|Thriller  
5671                             Action|Drama|Thriller  
5905             Action|Adventure|Crime|Drama|Thriller  
9757                         Action|Comedy|Crime|Drama

In [6]:
import pickle

with open('model_31-10-24.pkl', 'wb') as file:
    pickle.dump(model, file)
    
print(f"Model saved!")

Model saved!
