In [1]:
import pandas as pd
from sklearn.impute import KNNImputer

ratings = pd.read_csv('data/ml-32m/ratings.csv', dtype={'userId': 'int64', 'movieId': 'int64', 'rating': 'float64'})
ratings = ratings.head(100000)
movies = pd.read_csv('data/ml-32m/movies.csv', dtype={'movieId': 'int64', 'title': 'str', 'genres': 'str'})

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
print(ratings.isnull().sum())
print()
print(movies.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

movieId    0
title      0
genres     0
dtype: int64


In [16]:
data = pd.merge(ratings, movies, on='movieId')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,17,4.0,944249077,Sense and Sensibility (1995),Drama|Romance
1,1,25,1.0,944250228,Leaving Las Vegas (1995),Drama|Romance
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [7]:
print(ratings['userId'].nunique(), "users")
print(ratings['movieId'].nunique(), "movies")

66 users
3965 movies


In [17]:
from scipy.sparse import csr_matrix

user_movie_matrix_sparse = csr_matrix((ratings['rating'], 
                                        (ratings['userId'], 
                                         ratings['movieId'])))
user_movie_matrix_sparse

<627x291486 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [20]:
from surprise import Dataset, Reader, SVD, accuracy, KNNBasic, NormalPredictor
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)

trainSet, testSet = train_test_split(data, test_size=0.2, random_state=27)

model = NormalPredictor()
model.fit(trainSet)

predictions = model.test(testSet)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 1.4663
RMSE: 1.466291416842621


In [5]:
import pickle

with open("model_31-10-24.pkl", "rb") as file:
    model = pickle.load(file)

In [14]:
wanted = ["Children"]
unwanted = ["Animation"]

min_genre_match = 3

wanted_condition = True if len(wanted) == 0 else movies['genres'].apply(
    lambda x: len(set(x.split('|')).intersection(wanted)) >= min(len(wanted),min_genre_match))
unwanted_condition = True if len(unwanted) == 0 else movies['genres'].apply(
    lambda x: not any(genre in x.split('|') for genre in unwanted))

filtered_movies = movies[wanted_condition & unwanted_condition]

# randomize to prevent older movies first?

# liked_movies = [205383,293]
# liked_of_recommended_movies = filtered_movies[filtered_movies['movieId'].isin(liked_movies)]
# print(liked_of_recommended_movies)

# use tags.csv with AI to find similar movies with similar tags

recommended_movies = []
for movie_id in filtered_movies['movieId'].values:
    predicted_rating = model.predict(uid=0, iid=movie_id)
    recommended_movies.append((movie_id, predicted_rating.est))

recommended_movies.sort(key=lambda x: x[1], reverse=True)

top_recommendations = filtered_movies[filtered_movies['movieId'].isin([x[0] for x in recommended_movies[:10]])]
top_recommendations.to_numpy().tolist()

[[953, "It's a Wonderful Life (1946)", 'Children|Drama|Fantasy|Romance'],
 [2804, 'Christmas Story, A (1983)', 'Children|Comedy'],
 [98241, 'Starry starry night (Xing kong) (2011)', 'Children|Drama|Fantasy'],
 [118085, 'Welcome, or No Trespassing (1964)', 'Children|Comedy'],
 [120311, 'Drishyam (2013)', 'Children|Drama|Thriller'],
 [139620,
  "Everything's Gonna Be Great (1998)",
  'Adventure|Children|Comedy|Drama'],
 [158958, 'Pollyanna (2003)', 'Children|Drama'],
 [203757, 'So Long, My Son (2019)', 'Children|Drama'],
 [263965,
  'Downton Abbey: Christmas Special 2015 (2015)',
  'Children|Drama|Romance'],
 [289193, 'The Wonderful Story of Henry Sugar', 'Adventure|Children|Comedy']]

In [6]:
import pickle

with open('model_31-10-24.pkl', 'wb') as file:
    pickle.dump(model, file)
    
print(f"Model saved!")

Model saved!
