In [1]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.data import Dataset as LFM_Dataset
from lightfm.evaluation import precision_at_k
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np



In [2]:
# --- Input CSV files (paths are relative to the working directory) ---
ITEM_DATA_PATH = './data/movies.csv'
USER_DATA_PATH = './data/users.csv'
RATINGS_DATA_PATH = './data/ratings.csv'

# --- Experiment settings ---
TEST_SIZE = 0.2        # passed as test_size to train_test_split
K_RECS = 10            # default list length k for recommendations and @k metrics
LR, EPOCHS = 0.05, 10  # LightFM learning rate and number of training epochs

In [5]:
# Load the ratings plus the user and item metadata tables.
ratings_df = pd.read_csv(RATINGS_DATA_PATH)
user_df = pd.read_csv(USER_DATA_PATH)
item_df = pd.read_csv(ITEM_DATA_PATH)

# Train/test split of the rating rows, stratified on user_id.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=TEST_SIZE,
    stratify=ratings_df['user_id'],
    random_state=42,
)

# Held-out items per user, {user_id: set(item_id)}; used later for evaluation.
user_seen_movies_dict_test = test_df.groupby('user_id')['item_id'].apply(set).to_dict()

# Metadata indexed by entity id; every remaining column is treated as a feature.
# These frames are built BEFORE fitting the dataset so the feature vocabulary
# below and the per-row features built later come from exactly the same
# columns. (Selecting columns positionally with columns[1:] only agrees with
# set_index(...) when the id happens to be the first column.)
user_features = user_df.set_index('user_id')
item_features = item_df.set_index('movie_id')

# LightFM dataset object: maps raw ids and "<column>:<value>" feature names
# to internal indices.
lfm_dataset = Dataset()

# Ids come from the full ratings table (train + test) so every user/item that
# is evaluated later has a mapping entry.
lfm_dataset.fit(
    users=ratings_df['user_id'].unique(),
    items=ratings_df['item_id'].unique(),
    user_features=[
        f"{col}:{val}"
        for col in user_features.columns
        for val in user_features[col].unique()
    ],
    item_features=[
        f"{col}:{val}"
        for col in item_features.columns
        for val in item_features[col].unique()
    ],
)

def build_features(features_df, ids):
    """Build ``(id, [feature names])`` tuples for the requested ids.

    Each feature name has the form ``"<column>:<value>"``, one per column of
    ``features_df``.

    Values are read column by column instead of through
    ``DataFrame.iterrows``. ``iterrows`` packs every row into a single Series,
    so a frame that mixes int and float columns has its ints upcast to float
    and would yield ``"age:24.0"``, whereas names generated per column from
    ``df[col].unique()`` read ``"age:24"``. Reading each column on its own
    keeps the column's dtype and therefore the same string form.

    Args:
        features_df: DataFrame indexed by entity id; every column is a feature.
        ids: index labels to select; output follows this order.

    Returns:
        A list of ``(index_label, list_of_feature_strings)`` tuples, one per
        selected row.

    Raises:
        KeyError: raised by ``features_df.loc`` when an id is not in the index.
    """
    selected = features_df.loc[ids]
    column_names = list(selected.columns)
    # Positional column access also copes with duplicated column labels.
    column_values = [
        selected.iloc[:, pos].to_numpy() for pos in range(len(column_names))
    ]

    tuples = []
    for row_pos, idx in enumerate(selected.index):
        features = [
            f"{col}:{values[row_pos]}"
            for col, values in zip(column_names, column_values)
        ]
        tuples.append((idx, features))
    return tuples

# Interactions: every (user, item) pair of the training split is passed with
# weight 1.0. build_interactions returns a pair; only its first element is
# kept, the second is discarded.
interactions, _ = lfm_dataset.build_interactions(
    [
        (uid, iid, 1.0)
        for uid, iid in zip(train_df['user_id'], train_df['item_id'])
    ]
)

# NOTE(review): metadata rows are built only for ids that occur in train_df,
# while the dataset was fitted on ids from the full ratings table — ids seen
# only in the test split get no metadata features here. Confirm this is intended.

# User side: "<column>:<value>" tuples -> feature matrix
user_feature_tuples = build_features(user_features, train_df['user_id'].unique())
user_features_matrix = lfm_dataset.build_user_features(user_feature_tuples)

# Item side: "<column>:<value>" tuples -> feature matrix
item_feature_tuples = build_features(item_features, train_df['item_id'].unique())
item_features_matrix = lfm_dataset.build_item_features(item_feature_tuples)




In [6]:
# Hybrid LightFM model: trained on the interactions together with the
# user/item metadata feature matrices. The loss is 'warp' ('bpr' being the
# alternative the author noted).
model = LightFM(
    loss='warp',
    learning_rate=LR,
    random_state=42,
)
model.fit(
    interactions,
    epochs=EPOCHS,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    num_threads=2,
    verbose=True,
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x10e0bb9d0>

In [11]:
def get_recommendations(user_id, k=K_RECS):
    """Return the top-``k`` unseen movies for ``user_id`` with their scores.

    Candidates are every ``movie_id`` in ``item_df`` that the user has no
    interaction with in ``train_df`` and that has an entry in the item-id
    mapping of ``lfm_dataset``. Candidates are scored with ``model.predict``
    using the user and item feature matrices, then ranked by descending score.

    Args:
        user_id: raw user id, as used in ``train_df['user_id']``.
        k: maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``movie_id``, ``title`` and ``score``, ordered
        from highest to lowest score. It has at most ``k`` rows and is empty
        when the user has no remaining candidate.

    Raises:
        KeyError: if ``user_id`` is not present in the dataset's user mapping.
    """
    user_rows = train_df['user_id'] == user_id
    seen_movies = set(train_df.loc[user_rows, 'item_id'])
    all_candidate_movies = list(set(item_df['movie_id']) - seen_movies)

    # Fetch the mappings once; index 0 is the user-id map, index 2 the item-id map.
    mappings = lfm_dataset.mapping()
    user_index = mappings[0][user_id]
    item_mapping = mappings[2]

    # Keep only candidate movies that LightFM has an internal index for.
    valid_candidates = [m for m in all_candidate_movies if m in item_mapping]
    if not valid_candidates:
        # Nothing left to score: return an empty result with the usual columns
        # rather than calling predict with an empty item list.
        return pd.DataFrame(columns=['movie_id', 'title', 'score'])
    candidate_indices = [item_mapping[m] for m in valid_candidates]

    # Predict one score per candidate for this user.
    scores = model.predict(
        user_ids=user_index,
        item_ids=candidate_indices,
        user_features=user_features_matrix,
        item_features=item_features_matrix
    )

    # Positions of the k best scores (descending) and the matching movie ids.
    top_k_idx = np.argsort(-scores)[:k]
    top_k_movie_ids = [valid_candidates[i] for i in top_k_idx]
    top_k_scores = [scores[i] for i in top_k_idx]

    # Attach titles; .loc with the ranked id list keeps the ranking order.
    metadata = item_df.set_index('movie_id').loc[top_k_movie_ids][['title']].reset_index()
    metadata['score'] = top_k_scores

    return metadata[['movie_id', 'title', 'score']]

In [12]:
# Example: print the top-k recommendations for a single user.
user_id, k = 42, 10
top_k_recs_df = get_recommendations(user_id, k=k)
print(top_k_recs_df)

   movie_id                        title      score
0        50             Star Wars (1977) -18.568016
1       100                 Fargo (1996) -18.921183
2       258               Contact (1997) -18.943678
3       288                Scream (1996) -18.987839
4        56          Pulp Fiction (1994) -19.057384
5       127        Godfather, The (1972) -19.120678
6       286  English Patient, The (1996) -19.196796
7         7        Twelve Monkeys (1995) -19.218435
8       300         Air Force One (1997) -19.257360
9       117             Rock, The (1996) -19.318903


In [13]:
def evaluate_recommender(user_seen_movies_dict: dict, k=K_RECS):
    """Compute HitRate@k, Precision@k, Recall@k and NDCG@k, averaged over users.

    For each user the top-``k`` list from ``get_recommendations`` is compared
    with that user's ground-truth item set. Relevance is binary: an item is
    relevant if it is in the ground-truth set.

    Users whose ground-truth set is empty are skipped and not counted, since
    recall is undefined for them. If no user is evaluated at all, every metric
    is reported as 0.0 instead of dividing by zero.

    Args:
        user_seen_movies_dict: mapping ``user_id -> iterable of item ids`` that
            form the ground truth for that user.
        k: length of the recommendation list requested per user.

    Returns:
        dict with the keys ``'HitRate@k'``, ``'Precision@k'``, ``'Recall@k'``
        and ``'NDCG@k'``.
    """
    hits, precision_sum, recall_sum, ndcg_sum = 0, 0.0, 0.0, 0.0
    total_users = 0

    for user_id, true_items in user_seen_movies_dict.items():
        true_set = set(true_items)
        if not true_set:
            # No ground truth for this user: recall would divide by zero.
            continue

        recs_df = get_recommendations(user_id, k=k)
        recs = recs_df['movie_id'].tolist()
        num_hits = len(true_set & set(recs))

        hits += int(num_hits > 0)
        precision_sum += num_hits / k
        recall_sum += num_hits / len(true_set)

        # NDCG@k: a hit at 0-based rank i contributes 1 / log2(i + 2).
        dcg = 0.0
        for i, movie in enumerate(recs):
            if movie in true_set:
                dcg += 1 / np.log2(i + 2)
        # Ideal DCG: all relevant items (at most k) placed at the top ranks.
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(true_set), k)))
        ndcg_sum += dcg / idcg if idcg > 0 else 0

        total_users += 1

    if total_users == 0:
        # Nothing was evaluated; avoid ZeroDivisionError in the averages below.
        return {
            'HitRate@k': 0.0,
            'Precision@k': 0.0,
            'Recall@k': 0.0,
            'NDCG@k': 0.0
        }

    return {
        'HitRate@k': hits / total_users,
        'Precision@k': precision_sum / total_users,
        'Recall@k': recall_sum / total_users,
        'NDCG@k': ndcg_sum / total_users
    }


# Score the recommender against the held-out test items of every user.
metrics = evaluate_recommender(user_seen_movies_dict_test, k=K_RECS)

# Report: a header line, then one "<name>: <value>" line per metric (4 decimals).
print(f"LightFM Hybrid Recommender Evaluation (k={K_RECS}):")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


LightFM Hybrid Recommender Evaluation (k=10):
HitRate@k: 0.7646
Precision@k: 0.2004
Recall@k: 0.1112
NDCG@k: 0.2270
