# Recommender System Training - LightFM

In this notebook we will train recommender system in order to map users susceptible answer to a question.

In [1]:
from utils import SERIALIZED_DATA_ARTIFACT_NAMES, read_data, track_model_metrics, prepare_all_predictions
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random_state = 42

## Pure CF model

### Get Datasets

In [21]:
# Load datasets containing all users and questions interactions
answers = read_data("answers.json")
users = read_data("users.json")
questions = read_data("questions.json")

In [22]:
# Load train test datasets
train = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_train_dataset)
test_cs = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_test_cold_start_dataset)
test_ws = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_test_warm_start_dataset)
eval_cs = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_eval_cold_start_dataset)
eval_ws = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_eval_warm_start_dataset)
test = pd.concat([test_cs, test_ws], ignore_index=True)
eval_ds = pd.concat([eval_cs, eval_ws], ignore_index=True)

### Create LightFM Dataset

In [46]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((question_id for question_id in questions.question_id.tolist()),
            (user_id for user_id in users.id.tolist()))

In [47]:
num_question, num_user = dataset.interactions_shape()
print('Num question: {}, num_user {}.'.format(num_question, num_user))

Num question: 100000, num_user 138698.


### Training with equal weights

Create interactions for each dataset to train and test model

In [48]:
train_interactions, _ = dataset.build_interactions(((row["question_id"], row["user_id"])
                                                      for index, row in train.iterrows()))
test_interactions, _ = dataset.build_interactions(((row["question_id"], row["user_id"])
                                                      for index, row in test.iterrows()))
test_ws_interactions, _ = dataset.build_interactions(((row["question_id"], row["user_id"])
                                                      for index, row in test_ws.iterrows()))
test_cs_interactions, _ = dataset.build_interactions(((row["question_id"], row["user_id"])
                                                      for index, row in test_cs.iterrows()))
eval_interactions, _ = dataset.build_interactions(((row["question_id"], row["user_id"])
                                                      for index, row in eval_ds.iterrows()))

#### Train model

In [49]:
NUM_THREADS=4
NUM_EPOCHS=10
K=5

In [50]:
from lightfm import LightFM


model_equal_weight = LightFM(loss='warp', random_state=random_state, no_components=32)

# output1, _ = track_model_metrics(model=model, 
#                                  train_interactions=train_interactions, 
#                                  test_interactions=eval_interactions, 
#                                  k=K,
#                                  no_epochs=NUM_EPOCHS, 
#                                  no_threads=NUM_THREADS)

In [51]:
model_equal_weight.fit(train_interactions, verbose=True, epochs=NUM_EPOCHS)

Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.05it/s]


<lightfm.lightfm.LightFM at 0x7fe8901bf8e0>

In [53]:
# Import the evaluation routines
from lightfm.evaluation import auc_score, precision_at_k

# Compute and print the AUC score
test_auc = auc_score(model_equal_weight, test_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

test_precisionk = precision_at_k(model_equal_weight, test_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_precisionk)

test_ws_auc = auc_score(model_equal_weight, test_ws_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test warm_start AUC: %s' % test_ws_auc)

test_ws_precisionk = precision_at_k(model_equal_weight, test_ws_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_ws_precisionk)

test_cs_auc = auc_score(model_equal_weight, test_cs_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test cold_start AUC: %s' % test_cs_auc)

test_cs_precisionk = precision_at_k(model_equal_weight, test_cs_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_cs_precisionk)

Collaborative filtering test AUC: 0.73596704
Collaborative filtering test precision at k: 0.0073099416
Collaborative filtering test warm_start AUC: 0.7329053
Collaborative filtering test precision at k: 0.0089845825
Collaborative filtering test cold_start AUC: 0.7390289
Collaborative filtering test precision at k: 0.0056353007


### Training with weights

In [54]:
train_interactions, train_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in train.iterrows()))
test_interactions, test_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test.iterrows()))
test_ws_interactions, test_ws_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test_ws.iterrows()))
test_cs_interactions, test_cs_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test_cs.iterrows()))
eval_interactions, eval_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in eval_ds.iterrows()))

#### Train model

In [55]:
NUM_THREADS=4
NUM_EPOCHS=10
K=5

In [56]:
from lightfm import LightFM


model_score_weight = LightFM(loss='warp', random_state=random_state, no_components=32)

# output2, _ = track_model_metrics(model=model, 
#                                  train_interactions=train_interactions, 
#                                  test_interactions=eval_interactions, 
#                                  k=K,
#                                  no_epochs=NUM_EPOCHS, 
#                                  no_threads=NUM_THREADS)

In [57]:
model_score_weight.fit(train_interactions, sample_weight=train_weights, verbose=True, epochs=NUM_EPOCHS)

Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.55it/s]


<lightfm.lightfm.LightFM at 0x7fe7e31efbe0>

In [58]:
from lightfm.evaluation import auc_score, precision_at_k

test_auc = auc_score(model_score_weight, test_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

test_precisionk = precision_at_k(model_score_weight, test_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_precisionk)

test_ws_auc = auc_score(model_score_weight, test_ws_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test warm_start AUC: %s' % test_ws_auc)

test_ws_precisionk = precision_at_k(model_score_weight, test_ws_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_ws_precisionk)

test_cs_auc = auc_score(model_score_weight, test_cs_interactions, train_interactions=train_interactions, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test cold_start AUC: %s' % test_cs_auc)

test_cs_precisionk = precision_at_k(model_score_weight, test_cs_interactions, train_interactions=train_interactions, k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision at k: %s' % test_cs_precisionk)

Collaborative filtering test AUC: 0.7310329
Collaborative filtering test precision at k: 0.0071770335
Collaborative filtering test warm_start AUC: 0.72916305
Collaborative filtering test precision at k: 0.008984583
Collaborative filtering test cold_start AUC: 0.73290265
Collaborative filtering test precision at k: 0.0053694844


## Hybrid model

In [1]:
from utils import SERIALIZED_DATA_ARTIFACT_NAMES, read_data, track_model_metrics, prepare_all_predictions
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random_state = 42

### Get Datasets

In [3]:
# Load datasets containing all users and questions interactions
answers = read_data("answers.json")
users = read_data("users.json")
questions = read_data("questions.json")

In [4]:
# Load train test datasets
users_features = pd.read_csv("../data/features/users_features.csv")
users_features.fillna("", inplace=True)
questions_features = pd.read_csv("../data/features/question_features.csv")
train = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_train_dataset)
test_cs = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_test_cold_start_dataset)
test_ws = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_test_warm_start_dataset)
eval_cs = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_eval_cold_start_dataset)
eval_ws = pd.read_csv("../"+SERIALIZED_DATA_ARTIFACT_NAMES.path_to_eval_warm_start_dataset)
test = pd.concat([test_cs, test_ws], ignore_index=True)
eval_ds = pd.concat([eval_cs, eval_ws], ignore_index=True)

  users_features = pd.read_csv("../data/features/users_features.csv")


In [5]:
questions_features.isna().sum()

question_id      0
tag1           121
tag2           168
tag3           166
tag4            93
tag5            69
dtype: int64

In [6]:
tags_column = ["tag1", "tag2", "tag3", "tag4", "tag5"]
num_col = ['reputation','up_votes','down_votes','views']

### Create LightFM Dataset

In [7]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((question_id for question_id in questions.question_id.tolist()),
            (user_id for user_id in users.id.tolist()))

In [8]:
num_question, num_user = dataset.interactions_shape()
print('Num question: {}, num_user {}.'.format(num_question, num_user))

Num question: 100000, num_user 138698.


Add information about users and questions

In [9]:

dataset.fit_partial(users=(question_id for question_id in questions_features.question_id.tolist()),
                    user_features=(pd.unique(questions_features[tags_column].values.ravel('K'))),
                    items=(users_id for users_id in users_features.id.tolist()),
                    item_features=(pd.unique(users_features[tags_column+num_col].values.ravel('K')))
                    )

### Build interactions, user features and question features

In [10]:
train_interactions, train_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in train.iterrows()))
test_interactions, test_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test.iterrows()))
test_ws_interactions, test_ws_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test_ws.iterrows()))
test_cs_interactions, test_cs_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in test_cs.iterrows()))
eval_interactions, eval_weights = dataset.build_interactions(((row["question_id"], row["user_id"], row["score"])
                                                      for index, row in eval_ds.iterrows()))

In [11]:
from typing import Optional, TypeVar

COOMatrix = TypeVar("COOMatrix")

def get_questions_features(data: pd.DataFrame, dataset: Dataset, tags_column: list[str]) -> COOMatrix:
    """Build questions features from lightfm dataset and dataframe

    Args:
        data: interactions to build
        dataset: light fm dataset

    Returns:
        COOMatrix: questions features matrix
    """
    columns = ["question_id"] + tags_column
    return dataset.build_user_features(((row[0], list(row[1:])) for row in data[columns].itertuples(index=False, name=None)))  # type: ignore


def get_users_features(data: pd.DataFrame, dataset: Dataset, tags_column: list[str]) -> COOMatrix:
    """Build users features from lightfm dataset and dataframe

    Args:
        data: interactions to build
        dataset: light fm dataset

    Returns:
        COOMatrix: users features matrix
    """
    columns = ["id"] + tags_column + num_col
    return dataset.build_item_features(((row[0], list(row[1:])) for row in data[columns].itertuples(index=False, name=None)))  # type: ignore

In [12]:
users_features_matrix = get_users_features(  # type: ignore
            data=users_features, dataset=dataset, tags_column=tags_column+num_col
        )

In [13]:
questions_features_matrix = get_questions_features(  # type: ignore
            data=questions_features, dataset=dataset, tags_column=tags_column
        )

### Training with weights

#### Train model

In [27]:
NUM_THREADS=4
NUM_EPOCHS=10
K=5

In [28]:
from lightfm import LightFM


model_hybrid = LightFM(loss='warp', random_state=random_state, no_components=32)

# output3, _ = track_model_metrics(model=model, 
#                                  train_interactions=train_interactions, 
#                                  test_interactions=eval_interactions, 
#                                  k=K,
#                                  no_epochs=NUM_EPOCHS, 
#                                  no_threads=NUM_THREADS)

In [29]:
model_hybrid.fit(train_interactions, sample_weight=train_weights, verbose=True, epochs=NUM_EPOCHS, item_features=users_features_matrix, user_features=questions_features_matrix)

Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.28it/s]


<lightfm.lightfm.LightFM at 0x7f594820cb80>

In [30]:
from lightfm.evaluation import auc_score, precision_at_k

test_auc = auc_score(model_hybrid, test_interactions, train_interactions=train_interactions, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)

test_precisionk = precision_at_k(model_hybrid, test_interactions, train_interactions=train_interactions, k=10, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test precision at k: %s' % test_precisionk)

test_ws_auc = auc_score(model_hybrid, test_ws_interactions, train_interactions=train_interactions, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test warm_start AUC: %s' % test_ws_auc)

test_ws_precisionk = precision_at_k(model_hybrid, test_ws_interactions, train_interactions=train_interactions, k=10, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test precision at k: %s' % test_ws_precisionk)

test_cs_auc = auc_score(model_hybrid, test_cs_interactions, train_interactions=train_interactions, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test cold_start AUC: %s' % test_cs_auc)

test_cs_precisionk = precision_at_k(model_hybrid, test_cs_interactions, train_interactions=train_interactions, k=10, item_features=users_features_matrix, user_features=questions_features_matrix, num_threads=NUM_THREADS).mean()
print('Hybrid test precision at k: %s' % test_cs_precisionk)

Hybrid test AUC: 0.8304192
Hybrid test precision at k: 0.007123871
Hybrid test warm_start AUC: 0.82303596
Hybrid test precision at k: 0.00909091
Hybrid test cold_start AUC: 0.83780235
Hybrid test precision at k: 0.005156832
