In [1]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str) -> None:
    """Stream the resource at ``url`` into the file ``dest_path``.

    The body is fetched in 1 MiB chunks so the whole payload is never held in
    memory. Data is first written to ``dest_path + ".part"`` and only moved
    onto ``dest_path`` once the transfer finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address to fetch with an HTTP GET.
        dest_path: Path of the file to create or overwrite.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or when the server
            stays silent for longer than the timeout.
    """
    partial_path = dest_path + ".part"

    # The context manager releases the connection even if writing fails; the
    # timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as req:
        req.raise_for_status()

        with open(partial_path, "wb") as fd:
            for chunk in req.iter_content(chunk_size=2 ** 20):
                fd.write(chunk)

    # Publish the file under its final name only after a complete transfer.
    os.replace(partial_path, dest_path)


def get_data():
    """Return CSV readers over the Book-Crossing ratings and book tables.

    The BX CSV dump is stored at ``data/data.zip``. It is downloaded only when
    no valid zip archive is present at that path, so repeated calls (for
    example via ``get_ratings`` and ``get_book_features``) reuse the local copy
    instead of fetching the archive again.

    Returns:
        A tuple ``(ratings, books)`` of ``csv.DictReader`` objects reading
        ``BX-Book-Ratings.csv`` and ``BX-Books.csv`` from the archive, both
        ``;``-delimited. Lines are decoded as UTF-8 and undecodable bytes are
        dropped. Each reader wraps a generator and can be iterated only once;
        call this function again for a fresh pass.
    """
    ratings_url = ("http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip")
    archive_path = "data/data.zip"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("data", exist_ok=True)

    # is_zipfile is False for a missing file and for one without a zip
    # end-of-archive record, so an absent or truncated copy is fetched again.
    if not zipfile.is_zipfile(archive_path):
        _download(ratings_url, archive_path)

    def _decoded_lines(raw_lines):
        # Yield text lines from a binary member stream, ignoring bad bytes.
        return (raw.decode("utf-8", "ignore") for raw in raw_lines)

    with zipfile.ZipFile(archive_path) as archive:
        # Both member streams are opened while the archive is open; the
        # returned readers are consumed lazily by the caller afterwards.
        # NOTE(review): reading after the ``with`` exits relies on zipfile
        # keeping the underlying file alive for already-opened members.
        return (
            csv.DictReader(
                _decoded_lines(archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                _decoded_lines(archive.open("BX-Books.csv")),
                delimiter=";",
            ),
        )


def get_ratings():
    """Return a fresh reader over the ratings table (first item of ``get_data()``)."""
    ratings_reader, _books_reader = get_data()
    return ratings_reader


def get_book_features():
    """Return a fresh reader over the books table (second item of ``get_data()``)."""
    _ratings_reader, books_reader = get_data()
    return books_reader

In [2]:
import json
from itertools import islice

# get_data() returns two csv.DictReader objects backed by generators, so each
# of `ratings` and `book_features` can be iterated only once.
ratings, book_features = get_data()

In [3]:
# Preview the first two rating records as pretty-printed JSON.
# islice pulls from the single-pass `ratings` reader, so these rows are consumed from it.
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [4]:
from lightfm.data import Dataset

# Register every user id and every ISBN that appears in the ratings table.
# get_ratings() is called once per argument because each reader is single-pass.
dataset = Dataset()
dataset.fit(
    (rating['User-ID'] for rating in get_ratings()),
    (rating['ISBN'] for rating in get_ratings()),
)

In [5]:
# Report how many distinct users and items the dataset has registered so far.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [6]:
# Extend the existing mappings with the ISBNs listed in the books table and
# register each book's author as an item feature.
dataset.fit_partial(
    items=(book['ISBN'] for book in get_book_features()),
    item_features=(book['Book-Author'] for book in get_book_features()),
)

In [7]:
# Build the interaction and weight matrices from one (user id, ISBN) pair per rating row.
interactions, weights = dataset.build_interactions(
    (rating['User-ID'], rating['ISBN']) for rating in get_ratings()
)

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [10]:
# Print the stored entries of the interaction matrix as "(row, col)  value" lines.
print(interactions)

  (0, 0)	1
  (1, 1)	1
  (2, 2)	1
  (3, 3)	1
  (3, 4)	1
  (4, 5)	1
  (5, 6)	1
  (6, 7)	1
  (7, 8)	1
  (8, 9)	1
  (9, 10)	1
  (9, 11)	1
  (9, 12)	1
  (9, 13)	1
  (9, 14)	1
  (9, 15)	1
  (10, 16)	1
  (10, 17)	1
  (10, 18)	1
  (10, 19)	1
  (10, 20)	1
  (10, 21)	1
  (10, 22)	1
  (10, 23)	1
  (11, 24)	1
  :	:
  (105276, 109199)	1
  (105276, 255393)	1
  (105276, 340549)	1
  (105277, 146362)	1
  (105278, 50527)	1
  (105278, 34599)	1
  (105278, 3111)	1
  (105278, 50534)	1
  (105278, 56824)	1
  (105278, 12047)	1
  (105278, 26864)	1
  (105278, 20167)	1
  (105278, 46617)	1
  (105278, 8545)	1
  (105278, 195285)	1
  (105278, 340550)	1
  (105278, 8851)	1
  (105278, 2104)	1
  (105278, 340551)	1
  (105278, 284679)	1
  (105278, 226347)	1
  (105279, 7295)	1
  (105280, 12065)	1
  (105281, 78598)	1
  (105282, 340552)	1


In [13]:
import numpy as np
from scipy.sparse import coo_matrix

# Toy COO (triplet) matrix: stored entry k has value data[k] at (row[k], col[k]).
row, col, data = map(
    np.array,
    (
        [0, 3, 1, 0],  # row index of each stored entry
        [0, 3, 1, 2],  # column index of each stored entry
        [4, 0, 7, 9],  # values; the 0 at (3, 3) is given explicitly
    ),
)
sp_matrix = coo_matrix((data, (row, col)), shape=(4, 4))

In [15]:
# Show the matrix summary, its stored entries, and the value at a position
# that was never set (row 1, column 2).
print(
    repr(sp_matrix),
    sp_matrix,
    sp_matrix.todense()[1, 2],
    sep="\n",
)

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in COOrdinate format>
  (0, 0)	4
  (3, 3)	0
  (1, 1)	7
  (0, 2)	9
0


In [5]:
import csv
import json
from itertools import islice

# Copy data/aux_data_headers_2.csv to names.csv, keeping only the User-ID,
# ISBN and Book-Rating columns and dropping every row whose rating is "0".
fnames = ['User-ID', 'ISBN', 'Book-Rating']

# Both files are managed by `with` so the output is flushed and closed even if
# a row fails; newline="" is what the csv module requires for files it reads
# or writes. The input is opened first so a missing source file does not
# truncate an existing names.csv.
with open("data/aux_data_headers_2.csv", newline="") as src, \
        open('names.csv', 'w', newline="") as f:
    # Write real ';'-delimited columns instead of hand-joined strings, so values
    # containing ';', ',' or quotes are quoted per field. Input columns other
    # than the three above are ignored.
    writer = csv.DictWriter(f, fieldnames=fnames, delimiter=";", extrasaction="ignore")
    writer.writeheader()

    for x in csv.DictReader(src, delimiter=";"):
        if x['Book-Rating'] != "0":
            writer.writerow(x)