In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix, save_npz
from sklearn.preprocessing import normalize

# Load the raw ratings and movie tables.
ratings = pd.read_csv("data/ratings_10k.csv")
movies = pd.read_csv("data/movies_10k.csv")

# Mappings: raw userId / movieId values -> contiguous 0-based matrix indices,
# numbered in order of first appearance in `ratings`.
user_ids = ratings.userId.unique()
movie_ids = ratings.movieId.unique()

user_to_index = {u: i for i, u in enumerate(user_ids)}
movie_to_index = {m: i for i, m in enumerate(movie_ids)}
# Inverse lookup: matrix column index -> raw movieId.
index_to_movie = {i: m for m, i in movie_to_index.items()}

# Build matrix: one row per user, one column per movie, values are ratings.
# Note that scipy sums entries sharing the same (row, col), so a repeated
# user/movie pair in the CSV would contribute the sum of its ratings.
rows = ratings.userId.map(user_to_index)
cols = ratings.movieId.map(movie_to_index)
data = ratings.rating.values

user_item_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(user_ids), len(movie_ids))
)

# Train ALS
# NOTE(review): recent `implicit` releases take a user-by-item matrix in
# fit(); older releases expected item-by-user -- confirm the installed version.
als = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=20,
    random_state=42,  # fixed seed so retraining is repeatable across runs
)
als.fit(user_item_matrix)

# Save
# Create the output directory on a fresh checkout, and use context managers so
# the pickle files are flushed and closed as soon as they are written.
os.makedirs("artifacts", exist_ok=True)
with open("artifacts/als_model.pkl", "wb") as model_file:
    pickle.dump(als, model_file)
with open("artifacts/index_to_movie.pkl", "wb") as mapping_file:
    pickle.dump(index_to_movie, mapping_file)


In [None]:
# Load the long-format genome scores (one row per movieId/tagId pair).
genome = pd.read_csv("data/genome_scores_10k.csv")

# Reshape to a wide table: one row per movie, one column per tag.
# (movie, tag) pairs absent from the file are filled with relevance 0.
movie_tag_df = genome.pivot(
    index="movieId",
    columns="tagId",
    values="relevance"
).fillna(0)

content_movie_ids = movie_tag_df.index.tolist()
# Scale each movie's tag vector with sklearn's `normalize` defaults
# (L2 norm, applied per row).
content_features = normalize(movie_tag_df.values)

# Raw movieId -> row position in `content_features`.
movieid_to_content_index = {
    mid: i for i, mid in enumerate(content_movie_ids)
}

np.save("artifacts/content_features.npy", content_features)
# Use a context manager so the pickle file is flushed and closed immediately
# instead of leaving the handle open until garbage collection.
with open("artifacts/movieid_to_content_index.pkl", "wb") as index_file:
    pickle.dump(movieid_to_content_index, index_file)


In [None]:
# Persist the sparse user-item ratings matrix in SciPy's .npz format.
# `save_npz` is imported with the other notebook imports in the first cell.
save_npz("artifacts/user_item_matrix.npz", user_item_matrix)
