# **Install and Import Modules**

In [1]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# **Load and Preprocess the Data**

In [2]:
ratings_df = pd.read_csv("/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/bronze/ratings.csv")
movies_df = pd.read_csv("/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/bronze/movies.csv")

df = pd.merge(ratings_df, movies_df[['movieId', 'genres']], on = 'movieId', how = 'left')

df

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,2,3.5,1112486027,Adventure|Children|Fantasy
1,1,29,3.5,1112484676,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,1112484819,Mystery|Sci-Fi|Thriller
3,1,47,3.5,1112484727,Mystery|Thriller
4,1,50,3.5,1112484580,Crime|Mystery|Thriller
...,...,...,...,...,...
20000258,138493,68954,4.5,1258126920,Adventure|Animation|Children|Drama
20000259,138493,69526,4.5,1259865108,Action|Adventure|Sci-Fi|IMAX
20000260,138493,69644,3.0,1260209457,Action|Adventure|Animation|Children|Comedy|Rom...
20000261,138493,70286,5.0,1258126944,Mystery|Sci-Fi|Thriller


In [3]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = movie_encoder.fit_transform(df['movieId'])

df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres').str.split('|')), columns = mlb.classes_, index = df.index ))

In [4]:
df.drop(columns = "(no genres listed)", inplace = True)


In [5]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,3.5,1112486027,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,28,3.5,1112484676,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,0,31,3.5,1112484819,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
3,0,46,3.5,1112484727,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,49,3.5,1112484580,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


# **Build the Model with Collabrative filtering**

In [6]:

train_df, test_df = train_test_split(df, test_size = 0.5)
train_df

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
9476563,65528,11274,5.0,1195485752,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
17326622,119790,3188,4.0,1181193423,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
16523210,114325,221,5.0,842620294,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4632718,31756,108,4.0,1048912687,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11089183,76629,2496,3.5,1352961431,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19567239,135424,772,4.5,1171916098,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
16880975,116738,335,4.0,845577826,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2671166,18137,2037,2.5,1326202727,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
15326674,105989,9,3.0,945573991,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()


: 

In [None]:
model_svd = SVD()
model_svd.fit(trainset)

predictions_svd = model_svd.test(trainset.build_anti_testset())
accuracy.rmse(predictions_svd)

# **Make Recommendations**

In [24]:
def get_top_n_recommendations(user_id, n=10):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))

  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model_svd.test(user_movie_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]

  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)


  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]

  top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)

  return top_n_movies

In [25]:

user_id = 501
recommendations = get_top_n_recommendations(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 5 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

1.5463819581287739
1.8023725684558956
1.8625275464904463
1.8697898880000998
1.8820046033566906
Top 5 Recommendations for User 221:
1.Stuart Saves His Family (1995)
2.Richie Rich (1994)
3.Honey, I Blew Up the Kid (1992)
4.Superman IV: The Quest for Peace (1987)
5.Battlefield Earth (2000)


In [None]:
import pickle
# Enregistrement du modèle avec pickle
with open("/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/models/model_SVD_1.pkl", "wb") as f:
    pickle.dump(model_svd, f)