In [1]:
import os
import zipfile
import csv

import requests

In [2]:
# helper that streams a remote file to disk
def _download(url: str, dest_path: str, timeout: float = 30.0) -> None:
    """Stream the resource at ``url`` into the local file ``dest_path``.

    The body is first written to ``dest_path + ".part"`` and only moved to
    ``dest_path`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address to fetch with an HTTP GET.
        dest_path: Path of the file to create (overwritten if it exists).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the caller forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    tmp_path = dest_path + ".part"

    # stream=True avoids loading the whole body into memory; the context
    # manager releases the connection even when raise_for_status() raises.
    with requests.get(url, stream=True, timeout=timeout) as req:
        req.raise_for_status()

        try:
            with open(tmp_path, "wb") as fd:
                for chunk in req.iter_content(chunk_size=2 ** 20):
                    fd.write(chunk)
        except BaseException:
            # Do not leave a half-written temporary file behind.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    # Publish the finished file under its final name in one step.
    os.replace(tmp_path, dest_path)

In [10]:
# function for downloading dataset and extracting
def get_data():
    """Return CSV readers over the ratings and book files of the BX dump.

    The zip archive is downloaded to ``data/data.zip`` when that file is not
    present yet; later calls reuse the local copy.

    Returns:
        A ``(ratings, books)`` tuple of ``csv.DictReader`` objects reading
        ``BX-Book-Ratings.csv`` and ``BX-Books.csv`` (``;``-delimited) from
        the archive. Both readers are single-pass iterators; call this
        function again to iterate from the start.
    """
    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
    archive_path = "data/data.zip"

    os.makedirs("data", exist_ok=True)

    # Decide on the archive itself rather than on the directory: a "data"
    # folder that exists without the zip (created by hand, or left over from
    # a failed run) must still trigger the download.
    if not os.path.exists(archive_path):
        _download(ratings_url, archive_path)

    with zipfile.ZipFile(archive_path) as archive:
        # The outermost iterable of a generator expression is evaluated
        # immediately, so both archive.open() calls happen inside this
        # with-block; the lines are decoded lazily as the readers are consumed.
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )

In [11]:
def get_ratings():
    """Return the ratings reader, i.e. the first element of ``get_data()``."""
    ratings_reader, _books_reader = get_data()
    return ratings_reader

def get_book_features():
    """Return the book reader, i.e. the second element of ``get_data()``."""
    _ratings_reader, books_reader = get_data()
    return books_reader

In [22]:
import json
from itertools import islice

# Unpack the two csv.DictReader objects returned by get_data(). Each one is a
# single-pass iterator: rows that have been read are not produced again.
ratings, book_features = get_data()

In [17]:
# Open a fresh reader so this cell always prints the first two rating rows,
# however often (or in whatever order) the notebook cells are re-run. A
# shared DictReader is single-pass and would advance on every execution.
for line in islice(get_ratings(), 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "96357",
    "ISBN": "0399501487",
    "Book-Rating": "8"
}
{
    "User-ID": "96357",
    "ISBN": "0425092917",
    "Book-Rating": "8"
}


In [18]:
# Open a fresh reader so this cell always prints the first book row,
# however often (or in whatever order) the notebook cells are re-run. A
# shared DictReader is single-pass and would advance on every execution.
for line in islice(get_book_features(), 1):
    print(json.dumps(line, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


In [27]:
from lightfm.data import Dataset

dataset = Dataset()
# Two independent passes over the ratings file: the first generator yields
# the user ids, the second the item ids (ISBNs).
dataset.fit(
    users=(row['User-ID'] for row in get_ratings()),
    items=(row['ISBN'] for row in get_ratings()),
)

In [28]:
# interactions_shape() is unpacked as (number of users, number of items)
# for the ids the dataset has been fitted on so far.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [29]:
# Extend the existing mappings with the book metadata: every ISBN listed in
# the book file is registered as an item and every author name as an item
# feature. Each generator reads the file through its own fresh reader.
dataset.fit_partial(
    items=(book['ISBN'] for book in get_book_features()),
    item_features=(book['Book-Author'] for book in get_book_features()),
)

In [30]:
# Only (user id, ISBN) pairs are passed on; the Book-Rating column is not
# forwarded to build_interactions here.
interactions, weights = dataset.build_interactions(
    (row['User-ID'], row['ISBN']) for row in get_ratings()
)

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [31]:
# Each book contributes one (ISBN, [author]) pair: the author name is the
# single feature attached to that item.
item_features = dataset.build_item_features(
    (book['ISBN'], [book['Book-Author']]) for book in get_book_features()
)
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [32]:
from lightfm import LightFM

# Fix the seed so that Restart & Run All trains the same model every time;
# without random_state the initialisation and sampling differ between runs.
model = LightFM(loss='bpr', random_state=42)
# Train on the interaction matrix, with the author features attached to items.
model.fit(interactions, item_features=item_features)

<lightfm.lightfm.LightFM at 0x2940e3bd488>