In [55]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx
import json
from collections import Counter
from isbnlib import meta
from isbnlib.registry import bibformatters

In [56]:
#nainštalujte
#!pip install isbnlib

In [57]:
# df = pd.read_csv('data/rec-amz-Books.edges', header=None, names=['ID_reviewer', 'ISBN', 'Stars', 'Date'])
# display(df['ISBN'].value_counts())
# display(df['ID_reviewer'].value_counts())

ISBN
0439023483    21398
030758836X    19867
0439023513    14114
0385537859    12973
0007444117    12629
              ...  
0821510401        1
0821508717        1
1939084261        1
1939084768        1
1453758569        1
Name: count, Length: 2330066, dtype: int64

ID_reviewer
A14OJS0VWMOSWO    43201
AFVQZQ8PW0L       28816
A2F6N60Z96CAJI     6121
A320TMDV6KCFU      5955
A2OJW07GQRNJUT     5443
                  ...  
A38P6AZHJE19JU        1
A2UOFPPO08D0W9        1
A1IMFOGM4ZOAY2        1
A3MD9VU1YCUS4Y        1
A8W4BR3HGGS3C         1
Name: count, Length: 8026324, dtype: int64

In [59]:
# np.ones((87085, 87085))

MemoryError: Unable to allocate 56.5 GiB for an array with shape (87085, 87085) and data type float64

In [58]:
# np.ones((10356390, 10356390))

MemoryError: Unable to allocate 780. TiB for an array with shape (10356390, 10356390) and data type float64

In [21]:
def subgraphByListOfValues(values: pd.Series, column: str, df: pd.DataFrame) -> pd.DataFrame:
    return df[np.isin(df[column], values)]

In [22]:
def randomSubgraph(n:int, column: str, df: pd.DataFrame, weighed: bool = True, seed: int = 3, minDeg: int= 10) -> pd.DataFrame:
    np.random.seed(seed)
    values, counts = np.unique(df[column], return_counts=True)
    values, counts = values[counts > minDeg],  counts[counts > minDeg] # nech nemame knihy s velmi malo hodnoteniami (resp uzivatelov co majo citaju)
    return subgraphByListOfValues(np.random.choice(values, n, replace=False, p=(counts / sum(counts)) if weighed else None), column, df)

In [30]:
class RecommendationEngine:
    def __init__(self, filename: str, seed: int=3, n: int = 200, col: str = "ISBN"):
        self.df = pd.read_csv(filename, header=None, names=['ID_reviewer', 'ISBN','Stars','Date'])
        self.subdf = randomSubgraph(n, col, self.df, seed=seed)
        self.SM = {True: None, False: None}
        self.SMC = {True: None, False: None}
        self.G: nx.classes.graph.Graph = nx.from_pandas_edgelist(self.subdf, source='ID_reviewer', target='ISBN', edge_attr='Stars')
        self.userOrder = self.filterNodes(True)
        self.bookOrder = self.filterNodes(False)

    def filterNodes(self, users: bool) -> list[str]:
        return list(filter(lambda x: (x[0] == 'A') if users else (x[0] != 'A'), self.G.nodes()))

    def similarityMatrix(self, users: bool, save: bool=False) -> pd.DataFrame:  # use wisely!
        if self.SM[users] is not None:
            return self.SM[users]
        userOrder = self.filterNodes(True)
        bookOrder = self.filterNodes(False)
        A = nx.algorithms.bipartite.matrix.biadjacency_matrix(self.G, userOrder, bookOrder).toarray()
        product = A @ A.T if users else A.T @ A
        np.fill_diagonal(product, 0)
        order = userOrder if users else bookOrder
        SM = pd.DataFrame(product, columns = order)
        SM.index = order
        if save:
            self.SM[users] = SM
        return SM

    def similarityMatrixCosine(self, users: bool, save: bool=False) -> np.ndarray:  # use wisely!
        # filterList = self.filterNodes(users)
        if self.SMC[users] is not None:
            return self.SMC[users]
        SM = self.similarityMatrix(users)
        norms = (np.linalg.norm(SM, axis=1))
        SMC = SM / norms[:, None] / norms[None, :]
        np.nan_to_num(SMC, copy=False)
        if save:
            self.SMC[users] = SMC
        return SMC

    def similarityVectorForUser(self, user:str, userOrder:list[str] = None, cosine:bool = True):
        if userOrder is None:
            userOrder = self.filterNodes(True)
        u = userOrder.index(user)
        A = nx.algorithms.bipartite.matrix.biadjacency_matrix(self.G, userOrder).toarray()
        result = np.zeros((len(userOrder)))
        for r in range(len(userOrder)):
            result[r] = np.dot(A[r, :], A[u, :])
            if cosine:
                result[r] /= np.sqrt(np.dot(A[r, :], A[r, :]))
        if cosine:
            result /= np.sqrt(np.dot(A[u, :], A[u, :]))
        result[u] = 0
        return result, np.array(userOrder)

    def getNMostSimilarUsers(self, user:str, n:int, ignoreTotalMatch=True):
        sim, users = self.similarityVectorForUser(user, cosine=True)
        if ignoreTotalMatch:
            sim, users = sim[sim < 0.9999], users[sim < 0.9999]
        sim.argsort()
        return users[sim.argsort()[-n:]]

    def getSimilarBooksToABook(self, book):
        df = self.similarityMatrixCosine(False)
        return list(df[book][df[book] == df[book].max()].index)

    def getMetaFromISBN(self,isbn:str) -> json:
      SERVICE = "goob"
      format_json = bibformatters["json"]
      meta_data = (format_json(meta(isbn, SERVICE)))
      #print(meta_data)
      return meta_data

    def fromJsonToDict(self,json_file:json) -> dict:
      return json.loads(json_file)

    def getDf(self)->pd.DataFrame:
      return self.subdf

    def getBooksByUser(self,df:pd.DataFrame,id:str) ->list: #(list of ISBN for user)
      df = df.loc[df['ID_reviewer']==id, 'ISBN'].values
      return df

    def getBooks(self,df:pd.DataFrame,id1:str,id2:str,type_:str) ->list: #same - same books, diff - different books
      b1 = self.getBooksByUser(self.getDf(),id1)
      b2 = self.getBooksByUser(self.getDf(),id2)
      if type_ == "same":
        return list(set(b1).intersection(set(b2)))
      elif type_ == "diff1":
        return list(set(b2) - set(b1)) #vsetky knihy od id2, ktore id1 necital
      elif type_ == "diff2":
        return list(set(b1) - set(b2)) #vsetky knihy od id1, ktore id2 necital

    def getSimilarBooks(self,df:pd.DataFrame,id1:str,id2:str,type_:str) ->list:  #type ==(year, publisher)
        if type_ == "year": #odporucame id1 knihy od uzivatela id2 na zaklade roku vydania knihy
          books_s = self.getBooks(df,id1,id2,"same")
          #print(books_s)
          years = list()
          for i in range(len(books_s)): #zoberieme vseetky roky a pozrieme sa aky najviac prevlada -> zoberiem okolie (-5,5) a to odporucime
            pom = self.getMetaFromISBN(books_s[i])
            if pom != None: #nenaslo ISBN
              years.append(self.fromJsonToDict(pom)["year"])
          most_common_era = Counter(years).most_common(1) #rok,pocet_opakovani
          most_common_era = [int(most_common_era[0][0])-5,int(most_common_era[0][0])+5]
          #print(most_common_era)
          books_d = self.getBooks(df,id1,id2,"diff1")
          recommendation_by_year = list()
          for i in range(len(books_d)): #zistime info o vsetkych kniha id2 a vyberieme nazvy tych ktorych vydania su medzi most_common_era
            pom = self.getMetaFromISBN(books_d[i])
            if pom != None: #nenaslo ISBN
              if most_common_era[0] <= (int(self.fromJsonToDict(pom)["year"])) <= most_common_era[1]:
                title = self.fromJsonToDict(pom)["title"]
                year = self.fromJsonToDict(pom)["year"]
                authors = list()
                for i in range(len(self.fromJsonToDict(pom)["author"])):  #ak mame viac autorov
                  authors.append(self.fromJsonToDict(pom)["author"][i]['name'])
                recommendation_by_year.append([title,year, authors])
                #print(recommendation_by_year)
          if recommendation_by_year == []:
            recommendation_by_year = [("Nenasli sa ziadne knihy z rokov:",most_common_era[0],"-",most_common_era[1])]
          return recommendation_by_year

        elif type_ == "publisher": #odporucame id1 knihy od uzivatela id2, vyberame oblubeneho vydavatelstvo
           books_s = self.getBooks(df,id1,id2,"same")
           publishers = list()
           for i in range(len(books_s)): #zoberieme vseetky vydavatelstva -> most_common
              pom = self.getMetaFromISBN(books_s[i])
              if pom != None: #nenaslo ISBN
                publishers.append(self.fromJsonToDict(pom)["publisher"])
           most_common_publisher = Counter(publishers).most_common(1) #vydavatelstvo,pocet_opakovani
           print(most_common_publisher)
           books_d = self.getBooks(df,id1,id2,"diff1")
           recommendation_by_publisher = list()
           for i in range(len(books_d)): #zistime info o vsetkych kniha id2 a vyberieme nazvy tych ktorych vydania su od most_common_publishe
              pom = self.getMetaFromISBN(books_d[i])
              if pom != None: #nenaslo ISBN
                if most_common_publisher[0] == (self.fromJsonToDict(pom)["publisher"]):
                  title = self.fromJsonToDict(pom)["title"]
                  year = self.fromJsonToDict(pom)["year"]
                  authors = list()
                  for i in range(len(self.fromJsonToDict(pom)["author"])): #ak mame viac autorov
                    authors.append(self.fromJsonToDict(pom)["author"][i]['name'])
                  recommendation_by_publisher.append([title,year, authors])
                #print(recommendation_by_year)
           if recommendation_by_publisher == []:
              recommendation_by_publisher = ["Nenasli sa ziadne knihy od oblubeneho vydavatela"]
           return recommendation_by_publisher


In [31]:
re = RecommendationEngine('data/rec-amz-Books.edges', n=200, seed=76972)

In [49]:
print(f'number of users: {len(re.filterNodes(True))}')

number of users: 87085


In [54]:
randomUser = np.random.choice(re.filterNodes(True))
randomBook = np.random.choice(re.filterNodes(False))
print(f'3 most similar users to {randomUser}: {re.getNMostSimilarUsers(randomUser, 3)}')
print(f'most similar books to {randomBook}: {re.getSimilarBooksToABook(randomBook)}')# vela podobnych znamena, ze kniha si nie je podobna so ziadnou knihou, teda je izolovana

most similar books to 0061537934: ['1937387704']
3 most similar users to A3FI8DSH1SSBVU: ['A3H4CJRX3E9H4W' 'A3JQ1OHUQI4M58' 'A39L6AGSD5ODRA']
most similar books to 0974504874: ['0307476073']


ukazkovy outupt:
most similar books to 1858600073: ['0786407735']
ku knihe "They Walked With Jesus: Past Life Experience With Christ", to naslo "Genocide and Rescue in Wolyn: Recollections of the Ukrainian Nationalist Ethnic Cleansing Campaign Against the Poles During World War II", co myslim nie je uplne odveci odporucanie (na to aku malu podmnozinu knih mame)

pri hladani podobnych uzivatelov ignoreTotalMatch paraemter True ignoruje dokonale podobnych pouzivatelov, ty bu nemali ake ine knihy odporucit
pri podobnych knihach, uz chceme ja dokolane zhody

In [56]:
display(re.similarityMatrix(False, save=True))
# pre users je matica moc velka...n neodporucam to skusat, a ani pre vela knih, to dlho trava..., lebo je tem vela user-ov, ale za cca min to zvladne

Unnamed: 0,0060735457,0060757353,0061148520,0061240443,0061451835,0061950726,006204964X,0062094661,0062265423,0133109895,...,B00CQAPWFK,B00DDUYNXE,B00ED1QGXC,B00H0MFT2A,B00HHYCIIE,B00HRIIVI6,B00HUHUA2O,B00KXCR422,B00L9HIR4E,B00LAOB6PI
0060735457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060757353,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0061148520,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
0061240443,0,0,0,0,0,4,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0061451835,0,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00HRIIVI6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00HUHUA2O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00KXCR422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00L9HIR4E,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


pre cast niszie, novy seed asi dost upravil podmnozinu pouzivatelov a knih, ktora bola vybrana

In [57]:
meta_json_book = re.getMetaFromISBN("0000013714")
list_meta = re.fromJsonToDict(meta_json_book)
print(list_meta)
print(list_meta["publisher"])

{'type': 'book', 'title': 'Heavenly Highway Hymns: Shaped-Note Hymnal', 'author': [{'name': 'Stamps/Baxter'}], 'year': '1983', 'identifier': [{'type': 'ISBN', 'id': '9780000013712'}], 'publisher': 'Brentwood Benson'}
Brentwood Benson


In [58]:
print(re.getBooksByUser(re.getDf(),"A1KP12IVP6LAGL"))
books_users = re.getBooks(re.getDf(),"A1KP12IVP6LAGL","AG7R3MMF8QLDT","same")
print("same:" ,books_users)
books_users = re.getBooks(re.getDf(),"A1KP12IVP6LAGL","AG7R3MMF8QLDT","diff1")
print("diff1:" ,books_users)
books_users = re.getBooks(re.getDf(),"A1KP12IVP6LAGL","AG7R3MMF8QLDT","diff2")
print("diff2:" ,books_users)

[]
same: []
diff1: ['076792066X']
diff2: []


In [66]:
print("Recommendation by year: ")
books_recomendation = re.getSimilarBooks(re.getDf(),"A1KP12IVP6LAGL","AG7R3MMF8QLDT","year")
print(books_recomendation)
print("Recommendation by publisher: ")
books_recomendation = re.getSimilarBooks(re.getDf(),"A1KP12IVP6LAGL","AG7R3MMF8QLDT","publisher")
print(books_recomendation)

Recommendation by year: 


IndexError: list index out of range