# LightFm SandBox

Just a sandbox to play with ligth fm

In [1]:
import pandas as pd
import numpy as np

## Book Rating

In [2]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data():

    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")

    if not os.path.exists("data"):
        os.makedirs("data")

        _download(ratings_url, "data/data.zip")

    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )


def get_ratings():

    return get_data()[0]


def get_book_features():

    return get_data()[1]

In [3]:
import json
from itertools import islice

ratings, book_features = get_data()

In [4]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [5]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


In [9]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

In [10]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [12]:
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=((x['Book-Author'], x["Publisher"]) for x in get_book_features()))

In [13]:
(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [14]:
item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author'], x["Publisher"]])
                                              for x in get_book_features()))
print(repr(item_features))

ValueError: Feature Mark P. O. Morford not in feature mapping. Call fit first.

In [15]:
from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

<lightfm.lightfm.LightFM at 0x7f352013ea00>

In [17]:
model.get_item_representations()

(array([0.05689505, 0.01122606, 0.02425163, ..., 0.00012449, 0.00013582,
        0.01293525], dtype=float32),
 array([[-0.04431621,  0.01467769,  0.03732223, ..., -0.03474615,
         -0.0011189 , -0.04797392],
        [ 0.04997495,  0.04508774,  0.0341003 , ...,  0.00662619,
          0.04166448,  0.01836482],
        [-0.01720993,  0.03878503, -0.04898636, ..., -0.00958192,
         -0.05391732, -0.00313546],
        ...,
        [ 0.0014292 ,  0.04314879,  0.01599961, ...,  0.04944107,
          0.02679814,  0.0148116 ],
        [-0.01905122, -0.02311305,  0.01597524, ...,  0.04012644,
          0.01705031,  0.00819863],
        [-0.0225209 , -0.02254307,  0.00700371, ..., -0.00770362,
          0.02999308, -0.00057471]], dtype=float32))