# Matrix Factorization using LightFM

In [1]:
!pip install lightfm



In [2]:
import lightfm
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split
import altair as alt



In [3]:
df_reviews = pd.read_json("yelp_dataset/yelp_academic_dataset_review.json", lines=True)
df_reviews = df_reviews[:1000]

In [4]:
df_reviews.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18


In [5]:
dataset = Dataset()
dataset.fit(df_reviews["user_id"], df_reviews["business_id"])
num_users, num_buz = dataset.interactions_shape()
print('Num users: {}, num_businesses {}.'.format(num_users, num_buz))
(interactions, weights) = dataset.build_interactions([x for x in zip(df_reviews["user_id"], df_reviews["business_id"])])

print(repr(interactions))

Num users: 994, num_businesses 823.
<994x823 sparse matrix of type '<class 'numpy.int32'>'
	with 1000 stored elements in COOrdinate format>


In [6]:
train, test = random_train_test_split(interactions=interactions, test_percentage=0.2, random_state=2)

In [7]:
def train_eval(train, test, emb_size, lr, epochs, l2, loss="logistic"):
    NUM_THREADS = 4
    model = LightFM(
        no_components=emb_size, 
        loss=loss,
        user_alpha=l2,
    )
    model.fit(
        interactions=train, 
        epochs=epochs, 
        num_threads=NUM_THREADS,
        verbose=True,
    )
    # train_precision = precision_at_k(model, train, k=10).mean()
    # test_precision = precision_at_k(model, test, k=10).mean()

    train_auc = auc_score(model, train).mean()
    # exclude cold-start
    test_auc = auc_score(model, test, train_interactions=train).mean()

    # print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    # print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 
    return {"emb_size": emb_size, "lr": lr, "l2": l2, "epochs": epochs, "train_auc": train_auc, "test_auc": test_auc}
    

In [8]:
%%time
lr = 0.05
epochs = 20
l2 = 0.002
metrics = []
for emb_size in range(100, 1200, 200):
    print(f"Embedding size: {emb_size}")
    metric_record = train_eval(train, test, emb_size, lr, epochs, l2)
    metrics.append(metric_record)
metrics = pd.DataFrame(data=metrics)

Embedding size: 100


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 582.25it/s]


Embedding size: 300


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 247.12it/s]


Embedding size: 500


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 148.42it/s]


Embedding size: 700


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 108.35it/s]


Embedding size: 900


Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 85.39it/s]


Embedding size: 1100


Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 69.87it/s]


CPU times: user 1.62 s, sys: 96.7 ms, total: 1.72 s
Wall time: 1.71 s


In [12]:
chart = alt.Chart(metrics[["emb_size", "train_auc", "test_auc"]].melt("emb_size")).mark_line().encode(
    x=alt.X("emb_size", title="Embedding Size"),
    y=alt.Y("value", title="auc"),
    color="variable",
)
chart

In [10]:
metrics[["emb_size", "train_auc", "test_auc"]]

Unnamed: 0,emb_size,train_auc,test_auc
0,100,0.911612,0.251132
1,300,0.850384,0.26202
2,500,0.787702,0.239447
3,700,0.749454,0.247768
4,900,0.73069,0.245755
5,1100,0.713905,0.258431
