In [6]:
# !pip install lightfm

# Reference
https://www.kaggle.com/code/niyamatalmass/lightfm-hybrid-recommendation-system \

https://making.lyst.com/lightfm/docs/home.html

Help users discover movies they are most likely to enjoy based on thier unique taste

# Import Dataset

In [7]:
import pandas as pd

MOVIE_METADATA_PATH = "movie_enriched_view.parquet"
USER_RATINGS_PATH = "user_ratings_200users_30each.parquet"
movies_df = pd.read_parquet(MOVIE_METADATA_PATH)
users_df = pd.read_parquet(USER_RATINGS_PATH)

##NOTE Generate numeric identifier: LightFM python only except numeric id

In [8]:
import pandas as pd

def generate_int_id(df: pd.DataFrame, col_name: str, new_col_name: str = None) -> pd.DataFrame:
    """
    Convert a string/categorical column into unique integer IDs.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    col_name : str
        Name of the column to encode (e.g. 'user_id' or 'movie_id').
    new_col_name : str, optional
        Name of the new column to store integer IDs.
        - If provided: a new column is created.
        - If None: the original column is overwritten.

    Returns
    -------
    pd.DataFrame
        DataFrame with the encoded integer column.
    """

    # Factorize the column: assigns a unique integer to each unique value
    # e.g. ['u1', 'u2', 'u1'] → [0, 1, 0]
    codes, uniques = pd.factorize(df[col_name])

    # Determine where to store the result:
    # - Use new_col_name if provided
    # - Otherwise overwrite the original column
    target_col = new_col_name if new_col_name is not None else col_name

    # Store the encoded values in the target column
    df[target_col] = codes.astype('int64')

    return df


In [9]:
users_df = generate_int_id(users_df, "user_id")

# Data Preparation

Note that if we don’t have all user and items ids at once, we can repeatedly call fit_partial to supply additional ids. In this case, we will use this capability to add some item feature mappings:

# Item Features

In [10]:
def extract_item_feature_list(df, feature_columns):
    """
    Extract unique lowercase feature values from selected metadata columns.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing movie metadata.
    feature_columns : list of str
        Column names to extract features from.

    Returns
    -------
    Set[str]
        Set of all unique feature names (lowercased).
    """
    movie_all_features = set()
    for _, row in df.iterrows():
        combined = []
        for col in feature_columns:
            combined.extend(str(row[col]).split(','))
        cleaned = [f.strip().lower() for f in combined if f.strip()]
        movie_all_features.update(cleaned)
    return  list(movie_all_features)


In [11]:
def build_item_feature_tuples(df, feature_columns):
    """
    Build (movie_id, [features...]) tuples for LightFM item_features.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing movie metadata.
    feature_columns : list of str
        Columns to use for feature extraction.

    Returns
    -------
    List[Tuple[int, List[str]]]
        Tuples of movie_id and its feature list.
    """
    tuples = []
    for _, row in df.iterrows():
        combined = []
        for col in feature_columns:
            combined.extend(str(row[col]).split(','))
        cleaned = [f.strip().lower() for f in combined if f.strip()]
        tuples.append((row['movie_id'], cleaned))
    return tuples


In [12]:
from lightfm.data import Dataset

# Step 1: Prepare dataset
dataset = Dataset()
dataset.fit(users_df['user_id'], movies_df['movie_id'])

In [13]:
### TODO Rewrite Function
# Step 2: Extract feature set and fit dataset
feature_columns = ['genres', 'keywords']
movie_all_features = extract_item_feature_list(movies_df, feature_columns)
dataset.fit_partial(items=movies_df['movie_id'], item_features=movie_all_features)

# TODO check build_item_features parameter format
# Step 3: Build item_features matrix
item_feature_tuples = build_item_feature_tuples(movies_df, feature_columns)
item_features = dataset.build_item_features(item_feature_tuples)

# Set Variable

In [14]:
METHOD_STR="random_train_test_split" # split_user_interactions_df random_train_test_split
WEIGHT_METHOD_STR = "default" # default , condition_3, ratings


In [15]:
import pandas as pd
import numpy as np

def split_user_interactions_df(df, user_col='user_id', test_size=0.2, random_state=42):
    """
    Optimized version: split each user's interactions into train and test sets.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with user-item interactions.
    user_col : str
        Column for user ID.
    test_size : float
        Fraction of each user's data to put into test.
    random_state : int
        Random seed for reproducibility.

    Returns
    -------
    train_df : pd.DataFrame
    test_df : pd.DataFrame
    """
    rng = np.random.default_rng(random_state)

    # Assign a row number within each user group
    df = df.copy()
    df['_row_id'] = df.groupby(user_col).cumcount()

    # Count interactions per user
    user_counts = df.groupby(user_col)['_row_id'].max() + 1

    test_rows = []
    for user_id, count in user_counts.items():
        if count < 2:
            continue
        test_size_u = max(1, int(count * test_size))
        test_indices = rng.choice(count, size=test_size_u, replace=False)
        test_rows.extend(df[(df[user_col] == user_id) & (df['_row_id'].isin(test_indices))].index)

    test_df = df.loc[test_rows].drop(columns=['_row_id'])
    train_df = df.drop(index=test_rows).drop(columns=['_row_id'])

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)


In [16]:
from lightfm.cross_validation import random_train_test_split

def train_test_split_method_return_interaction(
    method_str=None,
    weight_method_str=None,
    df=None,
    user_col="user_id",
    test_size=0.2,
    random_state=42,
    dataset=None
):
    """
    Split interactions and weights using selected strategy.

    Parameters
    ----------
    method_str : str
        Splitting strategy, one of ["split_user_interactions_df", "random_train_test_split"].
    weight_method_str : str
        Weighting strategy, one of ["default", "ratings", "condition_<rating_threshold>"].
    df : pd.DataFrame
        DataFrame with user_id, movie_id, rating.
    user_col : str
        Column for user IDs (default "user_id").
    test_size : float
        Proportion for test split.
    random_state : int
        Random seed.
    dataset : lightfm.data.Dataset
        LightFM dataset object used for building interactions.

    Returns
    -------
    method_str, train_interactions, test_interactions, train_weights, test_weights
    """

    def build_interactions_by_weight(df, strategy, dataset):
        if strategy == "default":
            return dataset.build_interactions([
                (u, i, 1.0)
                for u, i, r in zip(df["user_id"], df["movie_id"], df["rating"])
            ])
        elif strategy == "ratings":
            return dataset.build_interactions([
                (u, i, r)
                for u, i, r in zip(df["user_id"], df["movie_id"], df["rating"])
            ])
        elif str(strategy).startswith("condition_"):
            threshold = int(strategy.split("_")[1])
            return dataset.build_interactions([
                (u, i, 1.0 if r >= threshold else 0.0)
                for u, i, r in zip(df["user_id"], df["movie_id"], df["rating"])
            ])
        else:
            raise ValueError(f"Invalid weight_method_str: {strategy}")

    if method_str == "split_user_interactions_df":
        train_df, test_df = split_user_interactions_df(
            df=df, user_col=user_col, test_size=test_size, random_state=random_state
        )

        train_interactions, train_weights = build_interactions_by_weight(train_df, weight_method_str, dataset)
        test_interactions, test_weights = build_interactions_by_weight(test_df, weight_method_str, dataset)

    elif method_str == "random_train_test_split":
        full_interactions, full_weights = build_interactions_by_weight(df, weight_method_str, dataset)

        train_interactions, test_interactions = random_train_test_split(
            full_interactions, test_percentage=test_size, random_state=random_state
        )
        train_weights, test_weights = random_train_test_split(
            full_weights, test_percentage=test_size, random_state=random_state
        )

    else:
        raise ValueError(f"Invalid method_str: {method_str}")

    return method_str, train_interactions, test_interactions, train_weights, test_weights


In [17]:
method_str, train_interactions, test_interactions, train_weights, test_weights = \
    train_test_split_method_return_interaction(
        method_str=METHOD_STR, # "split_user_interactions_df" "random_train_test_split"
        weight_method_str=WEIGHT_METHOD_STR, #default, ratings, condition_3
        df=users_df,
        user_col="user_id",
        test_size=0.2,
        random_state=42,
        dataset=dataset
    )

In [18]:
# EPOCHS = 10

# Hyperparameter


In [19]:
LOSS_FUNCTION = "warp" # warp , bpr, logistic
EPOCHS = 100
L_RATE = 0.05
NO_COM = 30
MAX_SAM = 30


In [20]:
from lightfm import LightFM

# 3. Train the model using only the training interactions and corresponding weights
model = LightFM(
    loss=LOSS_FUNCTION, # warp , bpr, logistic
    learning_rate=L_RATE,
    no_components=NO_COM,
    max_sampled=MAX_SAM,
)

model.fit(
    interactions=train_interactions,
    item_features=item_features,
    sample_weight=train_weights,
    epochs=EPOCHS,
)

<lightfm.lightfm.LightFM at 0x7864c49f5d10>

# Evaluation

In [21]:
"""# Evaluation"""
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

def evaluate_model(model, interactions, k_list, item_features=None, train_interactions=None, split_name="TRAIN"):
    precision_scores = {}
    recall_scores = {}

    for k in k_list:
        precision = precision_at_k(
            model, interactions, train_interactions=train_interactions,
            item_features=item_features, k=k
        ).mean()
        recall = recall_at_k(
            model, interactions, train_interactions=train_interactions,
            item_features=item_features, k=k
        ).mean()

        precision_scores[k] = precision
        recall_scores[k] = recall

    auc = auc_score(
        model, interactions, train_interactions=train_interactions,
        item_features=item_features
    ).mean()

    # Print section
    print(f"{split_name} Evaluation")
    for k in k_list:
        print(f"Precision@{k}: {precision_scores[k]:.4f}")
        print(f"Recall@{k}: {recall_scores[k]:.4f}\n")
    print(f"AUC: {auc:.4f}\n")

    return precision_scores, recall_scores, auc


# Summary

In [22]:
# Set k values
k_values = [3, 5, 10]

"""# Summary"""
print("Summary")
print(f"Train Test Method: {METHOD_STR}")
print(f"Weight Method: {WEIGHT_METHOD_STR}")
print(f"Loss Function: {LOSS_FUNCTION}")
print("Epochs:", EPOCHS)
print("Learning rate:", L_RATE)
print("No. components:", NO_COM)
print("Max sampled:", MAX_SAM)
print()

# Evaluate on train
train_precision_scores, train_recall_scores, train_auc = evaluate_model(
    model, train_interactions, k_values, item_features=item_features, split_name="TRAIN"
)

# Evaluate on test
test_precision_scores, test_recall_scores, test_auc = evaluate_model(
    model, test_interactions, k_values, item_features=item_features,
    train_interactions=train_interactions, split_name="TEST"
)

Summary
Train Test Method: random_train_test_split
Weight Method: default
Loss Function: warp
Epochs: 100
Learning rate: 0.05
No. components: 30
Max sampled: 30

TRAIN Evaluation
Precision@3: 0.6717
Recall@3: 0.0845

Precision@5: 0.6570
Recall@5: 0.1378

Precision@10: 0.6185
Recall@10: 0.2588

AUC: 0.9999

TEST Evaluation
Precision@3: 0.2667
Recall@3: 0.1341

Precision@5: 0.2240
Recall@5: 0.1859

Precision@10: 0.1755
Recall@10: 0.2977

AUC: 0.9720



# Recommend

In [25]:
import numpy as np

def sample_recommendation(model, dataset, interactions, item_features,
                          user_ids, movies_df, top_n=5):
    """
    Show known positives and top-N recommendations with scores for each user.

    Parameters:
    - model: Trained LightFM model
    - dataset: LightFM dataset object (needed to extract mappings)
    - interactions: Sparse interaction matrix
    - item_features: Sparse matrix of item metadata
    - user_ids: List of actual user IDs (external IDs)
    - movies_df: DataFrame with 'movie_id' and 'title'
    - top_n: Number of recommendations to return per user

    Returns:
    - Dict with user_id as key and list of top-N recommendations as value
    """
    user_id_map, _, item_id_map, _ = dataset.mapping()
    item_id_inv_map = {v: k for k, v in item_id_map.items()}

    def get_seen_movie_ids(uidx):
        seen_indices = interactions.tocsr()[uidx].indices
        return [item_id_inv_map[i] for i in seen_indices]

    def get_movie_info(movie_id):
        row = movies_df[movies_df['movie_id'] == movie_id]
        if row.empty:
            return {"title": "Unknown", "genres": "N/A", "keywords": "N/A"}
        row = row.iloc[0]
        return {
            "title": row.get("title", "Unknown"),
            "genres": row.get("genres", "N/A"),
            "keywords": row.get("keywords", "N/A")
        }

    results = {}

    for user_id in user_ids:
        if user_id not in user_id_map:
            print(f"User {user_id} not found in training set.\n")
            continue

        uidx = user_id_map[user_id]
        n_items = interactions.shape[1]

        seen_movie_ids = get_seen_movie_ids(uidx)
        known_titles = movies_df[movies_df['movie_id'].isin(seen_movie_ids)]['title'].tolist()

        scores = model.predict(uidx, np.arange(n_items), item_features=item_features)
        top_indices = np.argsort(-scores)[:top_n]

        # === DEBUG ===
        print(f"\n🎯 User: {user_id}")
        print("✅ Known positives:")
        for movie_id in seen_movie_ids[:min(len(seen_movie_ids), 10)]:
            info = get_movie_info(movie_id)
            print(f"   • {info['title']}")
            print(f"     Genres: {info['genres']}")
            print(f"     Keywords: {info['keywords']}")

        print(f"🔮 Top {top_n} Recommendations:")
        user_recommendations = []
        for idx in top_indices:
            movie_id = item_id_inv_map[idx]
            info = get_movie_info(movie_id)
            score = round(scores[idx], 4)

            print(f"   • {info['title']} (score: {score})")
            print(f"     Genres: {info['genres']}")
            print(f"     Keywords: {info['keywords']}")

            user_recommendations.append({
                "movie_id": movie_id,
                "title": info["title"],
                "genres": info["genres"],
                "keywords": info["keywords"],
                "score": score
            })

        results[user_id] = user_recommendations

    return results


In [26]:
sample_recommendation(
    model=model,
    dataset=dataset,
    interactions=train_interactions,
    item_features=item_features,
    user_ids=[0],
    movies_df=movies_df,
    top_n=10
)


🎯 User: 0
✅ Known positives:
   • Hedwig and the Angry Inch
     Genres: Drama, Music, Comedy
     Keywords: lgbt in the military, self identity, glam rock, singer, child molestation, transsexuality, restaurant chain, nonbinary director, military brat, lgbt, rock odyssey, transvestism, theatrical manager
   • (Nie)znajomi
     Genres: Drama, Comedy
     Keywords: man woman relationship, father daughter relationship, husband wife relationship, lgbt, mother daughter relationship, friendship, remake
   • Charlotte's Web
     Genres: Drama, Family, Animation, Music, Comedy
     Keywords: spider web, talking pig, musical, anthropomorphism, friendship, cartoon spider
   • Sidekicks
     Genres: Adventure, Drama, Family, Action
     Keywords: asthmatic, gym instructor, martial arts training, computer programmer, bully, nunchaku, parent teacher romance, student mentor relationship, training-montage, daydreaming, martial arts, martial arts tournament, chinese restaurant, karate
   • Lo Steinwa

{0: [{'movie_id': 365507,
   'title': 'Emperor Meiji and General Nogi',
   'genres': 'Drama, War, History',
   'keywords': 'japan, history of japan, russo-japanese war, imperial japan',
   'score': np.float32(2.1125)},
  {'movie_id': 4344,
   'title': 'Hotel Very Welcome',
   'genres': 'Drama',
   'keywords': 'travel, backpacker, thailand, goa, relationship, woman director, sense of life',
   'score': np.float32(2.0049)},
  {'movie_id': 1292715,
   'title': 'The Notorious Bored Samurai 3',
   'genres': 'Drama, History, Action',
   'keywords': 'jidaigeki',
   'score': np.float32(1.9789)},
  {'movie_id': 707588,
   'title': 'Lo Steinway',
   'genres': 'Drama, War, History, Animation',
   'keywords': 'truce, great war, piano',
   'score': np.float32(1.956)},
  {'movie_id': 13403,
   'title': 'Hedwig and the Angry Inch',
   'genres': 'Drama, Music, Comedy',
   'keywords': 'lgbt in the military, self identity, glam rock, singer, child molestation, transsexuality, restaurant chain, nonbinary