In [1]:
from copy import deepcopy
from itertools import combinations
import pickle
import typing as tp
from zipfile import ZipFile

from lightfm import LightFM
from lightfm.data import Dataset as LFMDataset
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.preprocessing import normalize
from transliterate import translit

In [4]:
books = pd.read_csv('csv/Books.csv')
ratings = pd.read_csv('csv/Ratings.csv')
users = pd.read_csv('csv/Users.csv')

  books = pd.read_csv('csv/Books.csv')


In [5]:
df = pd.merge(ratings, books[['ISBN', 'Book-Title', 'Book-Author']], on=['ISBN'])
df = pd.merge(df, users['User-ID'], on=['User-ID'])

In [6]:
map = {'User-ID' : 'id', 'ISBN' : 'book_id', 'Book-Rating' : 'rating', 'Book-Title' : 'title', 'Book-Author' : 'author', 'Year-Of-Publication' : 'year', 'Location' : 'location', 'Age' : 'age'}
df = df.rename(columns=map)

In [None]:
titles = [
    "Crime and Punishment",
    "Brothers Karamazov",
    "Idiot"
]
author = [
    'Fyodor M. Dostoevsky' for j in range(3)
]

ratings = [5, 5, 5]
avatar_interactions_dostoevsky = pd.DataFrame({"user_id": "avatar_dostoevsky", "Book-Title": titles, "Book-Author" : author, "rating": ratings})
avatar_interactions_dostoevsky = avatar_interactions_dostoevsky.merge(books[['ISBN', 'Book-Title', 'Book-Author']], on=['Book-Title', 'Book-Author'])


In [9]:
df.to_csv("data.csv")

In [None]:
x = list(np.unique(df.id.values))
train_ids = list(np.random.choice(x, size=92106-18421, replace=False))
test_ids = list(set(x) - set(train_ids))

In [None]:
train = df[df.id.isin(train_ids)]
test = df[df.id.isin(test_ids)]

In [282]:
lfm_dataset = LFMDataset()
lfm_dataset.fit(
    users=train["id"].values,
    items=train["book_id"].values,
)

train_matrix, _ = lfm_dataset.build_interactions(zip(*train[["id", "book_id", "rating"]].values.T))

In [284]:
lfm_model = LightFM(
    learning_rate=0.01, 
    loss='warp', 
    no_components=32,
    random_state=42
)
lfm_model.fit(
    interactions=train_matrix, 
    epochs=15,
    num_threads=20
)

<lightfm.lightfm.LightFM at 0x7f8c9efad300>

In [287]:
user_vectors = lfm_model.user_embeddings
item_vectors = lfm_model.item_embeddings
id_item_mapping = {k: v for k, v in lfm_dataset._item_id_mapping.items()}

In [None]:
from annoy import AnnoyIndex
from scipy.spatial.distance import cdist

In [None]:
f = 32
t = AnnoyIndex(f, 'angular')

for idx, item in enumerate(item_vectors):
    t.add_item(idx, item)

t.build(10)

True

In [None]:
id_item_mapping[0]

'034545104X'

In [None]:
book_mapping = {k : v for k, v in zip(train.book_id, train.title)}

In [None]:
[book_mapping[y] for y in [id_item_mapping[x] for x in t.get_nns_by_vector(user_vectors[1], 10)]]

['Midnight in the Garden of Good and Evil',
 'The Sparrow',
 'While I Was Gone',
 'Under the Banner of Heaven : A Story of Violent Faith',
 'Sushi for Beginners : A Novel (Keyes, Marian)',
 'Griffin &amp; Sabine: An Extraordinary Correspondence',
 "A Patchwork Planet (Ballantine Reader's Circle)",
 'The Flanders Panel',
 'All the Pretty Horses (The Border Trilogy, Vol 1)',
 "The Color of Water: A Black Man's Tribute to His White Mother"]

In [None]:
user_vector = user_vectors[1]
closest = t.get_nns_by_vector(
    user_vector,
    100,
    -1,
    include_distances=False
)
closest

[17956,
 8,
 1413,
 18262,
 35098,
 1361,
 10194,
 19148,
 2762,
 67132,
 12911,
 17076,
 19298,
 543,
 765,
 2806,
 19183,
 68206,
 115,
 1015,
 19305,
 16809,
 4,
 18213,
 1019,
 2760,
 31230,
 16712,
 2648,
 1164,
 836,
 2850,
 5,
 10215,
 17667,
 149,
 1104,
 565,
 1012,
 547,
 782,
 1027,
 9025,
 724,
 16977,
 19186,
 690,
 645,
 11299,
 694,
 21792,
 804,
 8928,
 30322,
 2759,
 1903,
 763,
 10240,
 2772,
 18239,
 18009,
 12896,
 37176,
 1071,
 10200,
 27226,
 130,
 11978,
 693,
 38218,
 845,
 1031,
 1026,
 861,
 12637,
 702,
 18364,
 2268,
 544,
 18498,
 42315,
 12653,
 564,
 1206,
 17462,
 30306,
 490,
 537,
 18086,
 18398,
 716,
 33310,
 2766,
 2812,
 61,
 18815,
 31245,
 728,
 39335,
 16923]

In [228]:
from __future__ import annotations

from itertools import islice
from typing import Callable, Dict, Hashable, Literal, Sequence, Tuple

import numpy as np
from annoy import AnnoyIndex
from numpy.typing import NDArray



class AnnoyRecommender:
    def __init__(
            self,
            item_vectors: NDArray[np.float32],
            user_vectors: NDArray[np.float32],
            user_id_user_index_id_mapping: Dict[Hashable, int],
            item_id_item_index_id_mapping: Dict[Hashable, int],
            top_k: int,
            dim: int,
            metric: Literal['angular', 'euclidian', 'manhattan', 'hamming', 'dot'] = 'dot',
            n_trees: int = 10,
            n_jobs: int = -1,
            search_k: int = -1,
            n_neighbors: int = 500
    ):
        self.item_vectors = item_vectors
        self.user_vectors = user_vectors
        self.user_to_num = user_id_user_index_id_mapping
        self.item_to_num = item_id_item_index_id_mapping
        self.num_to_user = {v: k for k, v in user_id_user_index_id_mapping.items()}
        self.num_to_item = {v: k for k, v in item_id_item_index_id_mapping.items()}
        self.top_k = top_k
        self.dim = dim
        self.metric = metric
        self.n_trees = n_trees
        self.n_jobs = n_jobs
        self.search_k = search_k
        self.n_neighbors = n_neighbors


    def fit(self) -> AnnoyRecommender:
        self._build()
        return self

    def _build(self) -> None:
        index = AnnoyIndex(f=self.dim, metric=self.metric)
        for idx, vector in enumerate(self.item_vectors):
            index.add_item(idx, vector)
        index.build(n_trees=self.n_trees, n_jobs=self.n_jobs)
        self.index = index

    def recommend_single_user(
            self, user_id : Hashable, item_whitelist: Sequence[Hashable]
    ) -> Sequence[Hashable]:
        id, item_ids = self.user_to_num[user_id], [
            self.item_to_num[item] for item in item_whitelist
        ]

        user_vector = self.user_vectors[id]

        if len(item_whitelist) == 0:
            item_ids = list(self.item_to_num.values())

        closest = self._get_similar(user_vector=user_vector)
        closest = self._get_filtered_top(
            candidates=closest, allowed_items=item_ids
        )

        recs = [self.num_to_item[item] for item in closest]
        
        return recs
    
    def _get_similar(
            self, user_vector: NDArray[np.float32]
        ) -> Sequence[int]:
        nearest_neighbours = self.index.get_nns_by_vector(
            user_vector, 
            self.n_neighbors,
            self.search_k,
            include_distances=False,
        )

        return nearest_neighbours

    def _get_filtered_top(
            self, candidates: Sequence[int], allowed_items: Sequence[int]
    ) -> Sequence[int]:
        allowed_items_set = set(allowed_items)
        return list(
            islice(
                (cand for cand in candidates if cand in allowed_items_set), self.top_k
            )
        )

In [288]:
ann = AnnoyRecommender(
    item_vectors=item_vectors,
    user_vectors=user_vectors,
    user_id_user_index_id_mapping=lfm_dataset._user_id_mapping,
    item_id_item_index_id_mapping=id_item_mapping,
    top_k=25,
    dim=32,
    metric='angular',
    n_trees=50,
    n_neighbors=20
)

In [289]:
ann.fit()

<__main__.AnnoyRecommender at 0x7f8c9efacfd0>

In [231]:
item_id_mapping = {k : v for v, k in id_item_mapping.items()}

In [None]:
train

Unnamed: 0,id,book_id,rating,title,author
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells
...,...,...,...,...,...
1031131,276442,2862749796,7,Le Huit,Katherine Neville
1031132,276618,3788097000,5,Ludwig Marum: Briefe aus dem Konzentrationslag...,Ludwig Marum
1031133,276647,0553571001,0,Christmas With Anne and Other Holiday Stories:...,L. M. Montgomery
1031134,276647,0689822294,10,Heaven (Coretta Scott King Author Award Winner),Angela Johnson


In [278]:
nice_train = train[train.rating > 5]

In [279]:
nice_train

Unnamed: 0,id,book_id,rating,title,author
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells
7,2313,0345348036,9,The Princess Bride: S Morgenstern's Classic Ta...,WILLIAM GOLDMAN
8,2313,0553278223,7,The Martian Chronicles,RAY BRADBURY
...,...,...,...,...,...
1031129,276442,2070498492,6,Le NÃ?Â?ud Gordien,Bernard Schlink
1031130,276442,2264032960,6,L'Apprenti du diable,Ellis Peters
1031131,276442,2862749796,7,Le Huit,Katherine Neville
1031134,276647,0689822294,10,Heaven (Coretta Scott King Author Award Winner),Angela Johnson


In [255]:
train[.apply(lambda x: 1 if x > 5 else 0) == 1]

  train[train[train.id==6543].rating.apply(lambda x: 1 if x > 5 else 0) == 1]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [291]:
id = 276647
books = []

In [241]:
recommendations = {}

users = list(np.unique(test.id))

In [290]:
train

Unnamed: 0,id,book_id,rating,title,author
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells
...,...,...,...,...,...
1031131,276442,2862749796,7,Le Huit,Katherine Neville
1031132,276618,3788097000,5,Ludwig Marum: Briefe aus dem Konzentrationslag...,Ludwig Marum
1031133,276647,0553571001,0,Christmas With Anne and Other Holiday Stories:...,L. M. Montgomery
1031134,276647,0689822294,10,Heaven (Coretta Scott King Author Award Winner),Angela Johnson


In [297]:
recs = ann.recommend_single_user(
    2313, books
)

[book_mapping[x] for x in recs]

['The Princess Diaries',
 'Eva Moves the Furniture: A Novel',
 'Cane River',
 'When the Wind Blows',
 'Little Altars Everywhere',
 'A Heartbreaking Work Of Staggering Genius : A Memoir Based on a True Story',
 'The River King',
 'The Straw Men',
 'Falling Angels',
 'Le Mariage',
 'Grand Avenue',
 'Princess in the Spotlight (The Princess Diaries, Vol. 2)',
 'Still Waters',
 "Postcards From Life's Little Instruction Book",
 'Big Stone Gap',
 "Don't Cry Now",
 "It's My F---ing Birthday : A Novel",
 'The Crimson Petal and the White',
 'The Switch',
 'The Most Wanted']

In [298]:
lfm_dataset = LFMDataset()
lfm_dataset.fit(
    users=test["id"].values,
    items=test["book_id"].values,
)

train_matrix, _ = lfm_dataset.build_interactions(zip(*test[["id", "book_id", "rating"]].values.T))

In [299]:
lfm_model = LightFM(
    learning_rate=0.01, 
    loss='warp', 
    no_components=32,
    random_state=42
)
lfm_model.fit(
    interactions=train_matrix, 
    epochs=15,
    num_threads=20
)

<lightfm.lightfm.LightFM at 0x7f8c9efad000>

In [300]:
user_vectors = lfm_model.user_embeddings
item_vectors = lfm_model.item_embeddings
id_item_mapping = {k: v for k, v in lfm_dataset._item_id_mapping.items()}

In [304]:
users = list(np.unique(test.id))
len(users)

18421

In [305]:
ans = pd.DataFrame(columns=['id', 'title'])
ans

Unnamed: 0,id,title
