### Embedding-based (Sentence Transformer) RecSys
1. Generate embeddings
    * item: 
        - use title + genre
        - optional: movie description
    * user: 
        - Represent each user by their top-rated movies (rating >= `MIN_RATING`, currently 4)
        - Take the average embedding of those movies


2. Movie-to-movie recommender
    - input = target movie embedding
    - similarity metric = cosine
    - output = Top-k similar movie embeddings


3. user-to-movie recommender
    - input = target user embedding
    - similarity metric = cosine
    - output = Top-k similar movie embeddings

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Locations of the three input CSV files (relative to the notebook's working directory)
RATINGS_DATA_PATH = './data/ratings.csv'
USER_DATA_PATH = './data/users.csv'
ITEM_DATA_PATH = './data/movies.csv'

# Load ratings, users and movies, in that order, into one DataFrame each
ratings_df, user_df, item_df = (
    pd.read_csv(csv_path)
    for csv_path in (RATINGS_DATA_PATH, USER_DATA_PATH, ITEM_DATA_PATH)
)

In [11]:
# Names of the one-hot genre columns, in the order they are reported for each movie
genre_labels = [
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    "Children's",
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western',
]


def extract_genres_from_row(row):
    """Return the genre labels whose flag in ``row`` equals 1.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by every
            name in ``genre_labels``.

    Returns:
        List of genre names, in ``genre_labels`` order, whose value is 1.
    """
    active_genres = []
    for label in genre_labels:
        if row[label] == 1:
            active_genres.append(label)
    return active_genres

# Collect the one-hot genre flags of every movie into a list of genre names
item_df['genres'] = item_df.apply(extract_genres_from_row, axis=1)

# Text that gets embedded for each movie: "<title> <genre> <genre> ..."
item_df['movie_text'] = item_df['title'] + ' ' + item_df['genres'].str.join(' ')

# Keep only the columns used downstream. `.copy()` makes the result an independent
# frame, so adding columns to it later (e.g. 'embedding') does not trigger
# pandas' SettingWithCopyWarning.
item_df = item_df[['movie_id', 'title', 'genres', 'movie_text']].copy()

In [12]:
# Preview the processed item table (movie_id, title, genres, movie_text)
item_df

Unnamed: 0,movie_id,title,genres,movie_text
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",Toy Story (1995) Animation Children's Comedy
1,2,GoldenEye (1995),"[Action, Adventure, Thriller]",GoldenEye (1995) Action Adventure Thriller
2,3,Four Rooms (1995),[Thriller],Four Rooms (1995) Thriller
3,4,Get Shorty (1995),"[Action, Comedy, Drama]",Get Shorty (1995) Action Comedy Drama
4,5,Copycat (1995),"[Crime, Drama, Thriller]",Copycat (1995) Crime Drama Thriller
...,...,...,...,...
1677,1678,Mat' i syn (1997),[Drama],Mat' i syn (1997) Drama
1678,1679,B. Monkey (1998),"[Romance, Thriller]",B. Monkey (1998) Romance Thriller
1679,1680,Sliding Doors (1998),"[Drama, Romance]",Sliding Doors (1998) Drama Romance
1680,1681,You So Crazy (1994),[Comedy],You So Crazy (1994) Comedy


### Load embedding model

In [4]:
# Name of the pretrained Sentence-Transformers checkpoint used for all embeddings
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

### Generate item embeddings

In [None]:
# Generate item embeddings.
# Encode every movie text in a single call so the model can batch the inputs and
# show one overall progress bar, instead of one encode call (and one bar) per movie.
movie_text_embeddings = model.encode(item_df['movie_text'].tolist(), show_progress_bar=True)

# Store one 1-D vector per row; the explicit index keeps rows aligned with item_df.
item_df['embedding'] = pd.Series(list(movie_text_embeddings), index=item_df.index)

### Generate user embeddings

In [15]:
# Minimum rating for a movie to be used when building a user's embedding
MIN_RATING = 4.0

# Lookup table: movie_id (index) -> 'embedding' column
movie_embeddings = item_df.set_index('movie_id')[['embedding']]

# compute user embedding: average embedding of the highly rated movies of each user
def get_user_embedding(user_id):
    """Return the mean embedding of the movies ``user_id`` rated >= ``MIN_RATING``.

    Reads the notebook-level ``ratings_df``, ``movie_embeddings`` and ``MIN_RATING``.

    Args:
        user_id: value matched against ``ratings_df['user_id']``.

    Returns:
        1-D numpy array. When the user has no qualifying movie with a known
        embedding, a zero vector with the same length as the movie embeddings.
    """
    # ratings by this user that clear the threshold
    liked = ratings_df[(ratings_df['user_id'] == user_id) & (ratings_df['rating'] >= MIN_RATING)]

    # Ignore rated ids that are absent from the embedding table; a plain
    # `.loc[movie_ids]` raises KeyError as soon as one id is missing.
    known_ids = liked.loc[liked['item_id'].isin(movie_embeddings.index), 'item_id'].values

    user_movie_embeds = movie_embeddings.loc[known_ids, 'embedding'].tolist()

    if len(user_movie_embeds) == 0:
        # nothing to average: fall back to a zero vector of the embedding size
        embedding_dim = len(movie_embeddings['embedding'].iloc[0])
        return np.zeros(embedding_dim)

    return np.mean(user_movie_embeds, axis=0)

# create the user embedding for all users
# (`progress_apply` is registered by `tqdm.pandas()` and shows a progress bar while applying)
user_df['embedding'] = user_df['user_id'].progress_apply(get_user_embedding)


100%|██████████| 943/943 [00:00<00:00, 2352.08it/s]


### Find recommendations: Top-k movies to user

In [22]:
def get_top_k_recommendations(user_id, k, users_df, movies_df, ratings_df):
    """Recommend the ``k`` unseen movies most similar to a user's embedding.

    Args:
        user_id: id looked up in ``users_df['user_id']``.
        k: maximum number of movies to return.
        users_df: DataFrame with 'user_id' and 'embedding' columns.
        movies_df: DataFrame with 'movie_id', 'title', 'genres' and 'embedding' columns.
        ratings_df: DataFrame with 'user_id' and 'item_id' columns; every movie
            the user has rated is excluded from the candidates.

    Returns:
        DataFrame with columns ['movie_id', 'title', 'genres', 'score'], sorted by
        descending cosine similarity. It is empty (with the same columns) when the
        user has already rated every movie.

    Raises:
        ValueError: if ``user_id`` is not present in ``users_df``.
    """
    # single source of truth so the empty and non-empty results share one schema
    output_columns = ['movie_id', 'title', 'genres', 'score']

    # get user embedding
    user_row = users_df[users_df['user_id'] == user_id]
    if user_row.empty:
        raise ValueError(f"user_id: {user_id} not found")
    user_embedding = np.asarray(user_row.iloc[0]['embedding']).reshape(1, -1)

    # filter candidate movies: drop everything the user has already rated
    seen_movie_ids = ratings_df.loc[ratings_df['user_id'] == user_id, 'item_id'].values
    candidate_movie_df = movies_df[~movies_df['movie_id'].isin(seen_movie_ids)].copy()
    if candidate_movie_df.empty:
        return pd.DataFrame(columns=output_columns)

    # compute similarity between the user vector and every candidate movie vector
    item_embeddings = np.vstack(candidate_movie_df['embedding'].values)
    similarities = cosine_similarity(user_embedding, item_embeddings)[0]

    # return top-K similar movies
    candidate_movie_df['score'] = similarities
    top_k = candidate_movie_df.sort_values(by='score', ascending=False).head(k)

    return top_k[output_columns]


### Evaluate the model

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Callable, List, Dict
from tqdm import tqdm

def evaluate_model(
    recommend_func: Callable[[int, int], List[int]],
    test_data: pd.DataFrame,
    k: int = 10,
    min_relevant_rating: float = 4.0
) -> Dict[str, float]:
    """
    Evaluate a recommender model using test ratings.

    Args:
        recommend_func: function(user_id, k) -> ranked recommendations, either a
            sequence of item ids or a DataFrame with a 'movie_id' column
            (the shape returned by ``get_top_k_recommendations``).
        test_data: pd.DataFrame with ['user_id', 'item_id', 'rating']
        k: top-k items to recommend (must be > 0)
        min_relevant_rating: an item is relevant when its rating is >= this value

    Returns:
        dict with averaged metrics (Precision@K, Recall@K, MAP@K, NDCG@K) and the
        number of users evaluated. All metrics are 0.0 when no user has a
        relevant item in ``test_data``.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Build ground-truth relevant items per user. groupby keeps the ids in their
    # original dtype, whereas iterrows() upcasts mixed int/float rows to float.
    relevant_rows = test_data[test_data['rating'] >= min_relevant_rating]
    user_item_test = {
        uid: set(items)
        for uid, items in relevant_rows.groupby('user_id')['item_id']
    }
    users = list(user_item_test)

    # rank discounts 1/log2(rank + 1) for ranks 1..k, shared by DCG and IDCG
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    precision_list, recall_list, map_list, ndcg_list = [], [], [], []

    for user_id in tqdm(users, desc="Evaluating users"):
        relevant_items = user_item_test[user_id]

        # Use the recommender passed in by the caller (previously ignored in favour
        # of a hard-coded call that referenced undefined variables).
        recommendations = recommend_func(user_id, k)
        if isinstance(recommendations, pd.DataFrame):
            recommendations = recommendations['movie_id'].tolist()
        recommended_items = list(recommendations)[:k]

        hits = [1 if item in relevant_items else 0 for item in recommended_items]
        num_hits = sum(hits)

        # Precision@K
        precision = num_hits / k

        # Recall@K
        recall = num_hits / len(relevant_items)

        # MAP@K: precision at each hit position, normalised by the best achievable hit count
        ap = 0.0
        hit_count = 0
        for idx, hit in enumerate(hits):
            if hit:
                hit_count += 1
                ap += hit_count / (idx + 1)
        max_hits = min(len(relevant_items), k)
        map_k = ap / max_hits

        # NDCG@K with binary relevance
        dcg = float(np.dot(hits, discounts[:len(hits)]))
        idcg = float(discounts[:max_hits].sum())
        ndcg = dcg / idcg if idcg > 0 else 0.0

        precision_list.append(precision)
        recall_list.append(recall)
        map_list.append(map_k)
        ndcg_list.append(ndcg)

    def _mean(values):
        # np.mean([]) returns NaN with a RuntimeWarning; report 0.0 instead
        return float(np.mean(values)) if values else 0.0

    return {
        'Precision@K': _mean(precision_list),
        'Recall@K': _mean(recall_list),
        'MAP@K': _mean(map_list),
        'NDCG@K': _mean(ndcg_list),
        'Users Evaluated': len(precision_list)
    }


### Inference

In [None]:
# inference: recommend unseen movies for a single user
user_id = 25
top_k = 10

recommendations = get_top_k_recommendations(
    user_id=user_id,
    k=top_k,
    users_df=user_df,
    movies_df=item_df,
    ratings_df=ratings_df,
)
recommendations.head(top_k)

Unnamed: 0,movie_id,title,genres,score
771,772,Kids (1995),[Drama],0.714384
1487,1488,Germinal (1993),[Drama],0.700821
1446,1447,Century (1993),[Drama],0.695436
48,49,I.Q. (1994),"[Comedy, Romance]",0.690957
79,80,Hot Shots! Part Deux (1993),"[Action, Comedy, War]",0.685311
727,728,Junior (1994),"[Comedy, Sci-Fi]",0.679665
807,808,"Program, The (1993)","[Action, Drama]",0.673613
108,109,Mystery Science Theater 3000: The Movie (1996),"[Comedy, Sci-Fi]",0.672894
511,512,Wings of Desire (1987),"[Comedy, Drama, Romance]",0.669741
733,734,Made in America (1993),[Comedy],0.667385


In [27]:
# movies rated by user, joined with their titles/genres and ordered best-rated first
user_ratings = (
    ratings_df[ratings_df['user_id'] == user_id]
    .merge(item_df, left_on='item_id', right_on='movie_id')
    [['title', 'genres', 'rating']]
    .sort_values(by='rating', ascending=False)
)
print(f"\nTop Rated movies by user:")
user_ratings.head(10)


Top Rated movies by user:


Unnamed: 0,title,genres,rating
0,Return of the Jedi (1983),"[Action, Adventure, Romance, Sci-Fi, War]",5
11,"Philadelphia Story, The (1940)","[Comedy, Romance]",5
38,Vertigo (1958),"[Mystery, Thriller]",5
33,Wallace & Gromit: The Best of Aardman Animatio...,[Animation],5
55,"Silence of the Lambs, The (1991)","[Drama, Thriller]",5
28,"Wrong Trousers, The (1993)","[Animation, Comedy]",5
25,Star Wars (1977),"[Action, Adventure, Romance, Sci-Fi, War]",5
22,"Grand Day Out, A (1992)","[Animation, Comedy]",5
61,"Close Shave, A (1995)","[Animation, Comedy, Thriller]",5
67,Back to the Future (1985),"[Comedy, Sci-Fi]",5
