In [74]:
import numpy as np
import pandas as pd
import networkx as nx
import json
from collections import Counter
#nainštalujte pip install isbnlib
from isbnlib import meta
from isbnlib.registry import bibformatters

In [75]:
# np.ones((87085, 87085))

In [76]:
# np.ones((10356390, 10356390))

In [77]:
def subgraphByListOfValues(values: pd.Series, column: str, df: pd.DataFrame) -> pd.DataFrame:
    return df[np.isin(df[column], values)]

In [78]:
def randomSubgraph(n:int, column: str, df: pd.DataFrame, weighed: bool = True, seed: int = 3, minDeg: int= 10) -> pd.DataFrame:
    np.random.seed(seed)
    values, counts = np.unique(df[column], return_counts=True)
    values, counts = values[counts > minDeg],  counts[counts > minDeg] # nech nemame knihy s velmi malo hodnoteniami (resp uzivatelov co majo citaju)
    return subgraphByListOfValues(np.random.choice(values, n, replace=False, p=(counts / sum(counts)) if weighed else None), column, df)

In [79]:
class RecommendationEngine:
    def __init__(self, filename: str, seed: int=3, n: int = 200, col: str = "ISBN"):
        self.df = pd.read_csv(filename, header=None, names=['ID_reviewer', 'ISBN','Stars','Date'])
        self.subdf = randomSubgraph(n, col, self.df, seed=seed)
        self.SM = {True: None, False: None}
        self.SMC = {True: None, False: None}
        self.G: nx.classes.graph.Graph = nx.from_pandas_edgelist(self.subdf, source='ID_reviewer', target='ISBN', edge_attr='Stars')
        self.userOrder = self.filterNodes(True)
        self.bookOrder = self.filterNodes(False)

    def filterNodes(self, users: bool) -> list[str]:
        return list(filter(lambda x: (x[0] == 'A') if users else (x[0] != 'A'), self.G.nodes()))

    def similarityMatrix(self, users: bool, save: bool=False) -> pd.DataFrame:  # use wisely!
        if self.SM[users] is not None:
            return self.SM[users]
        userOrder = self.filterNodes(True)
        bookOrder = self.filterNodes(False)
        A = nx.algorithms.bipartite.matrix.biadjacency_matrix(self.G, userOrder, bookOrder).toarray()
        product = A @ A.T if users else A.T @ A
        np.fill_diagonal(product, 0)
        order = userOrder if users else bookOrder
        SM = pd.DataFrame(product, columns = order)
        SM.index = order
        if save:
            self.SM[users] = SM
        return SM

    def similarityMatrixCosine(self, users: bool, save: bool=False) -> np.ndarray:  # use wisely!
        if self.SMC[users] is not None:
            return self.SMC[users]
        SM = self.similarityMatrix(users)
        norms = (np.linalg.norm(SM, axis=1))
        SMC = SM / norms[:, None] / norms[None, :]
        np.nan_to_num(SMC, copy=False)
        if save:
            self.SMC[users] = SMC
        return SMC

    def similarityVectorForUser(self, user:str, userOrder:list[str] = None, cosine:bool = True):
        if userOrder is None:
            userOrder = self.filterNodes(True)
        u = userOrder.index(user)
        A = nx.algorithms.bipartite.matrix.biadjacency_matrix(self.G, userOrder, weight='Stars').toarray()
        result = np.zeros((len(userOrder)))
        for r in range(len(userOrder)):
            result[r] = np.dot(A[r, :], A[u, :])
            if cosine:
                result[r] /= np.sqrt(np.dot(A[r, :], A[r, :]))
        if cosine:
            result /= np.sqrt(np.dot(A[u, :], A[u, :]))
        result[u] = 0
        return result, np.array(userOrder)

    def getNMostSimilarUsers(self, user:str, n:int, ignoreTotalMatch=True):
        sim, users = self.similarityVectorForUser(user, cosine=True)
        if ignoreTotalMatch:
            sim, users = sim[sim < 0.9999], users[sim < 0.9999]
        sim.argsort()
        return users[sim.argsort()[-n:]]

    def getSimilarBooksToABook(self, book):
        df = self.similarityMatrixCosine(False)
        return list(df[book][df[book] == df[book].max()].index)

    def getMetaFromISBN(self,isbn:str) -> json:
      SERVICE = "goob"
      format_json = bibformatters["json"]
      meta_data = (format_json(meta(isbn, SERVICE)))
      #print(meta_data)
      return meta_data

    def fromJsonToDict(self,json_file:json) -> dict:
      return json.loads(json_file)

    def getDf(self)->pd.DataFrame:
      return self.subdf

    def getBooksByUser(self,df:pd.DataFrame, id:str) ->list: #(list of ISBN for user)
      # df = df.loc[df['ID_reviewer']==id, 'ISBN'].values
      # return df
        return list(self.G.neighbors(id))

    def getBooks(self,df:pd.DataFrame,id1:str,id2:str,type_:str) ->list: #same - same books, diff - different books
      b1 = self.getBooksByUser(self.getDf(),id1)
      b2 = self.getBooksByUser(self.getDf(),id2)
      if type_ == "same":
        return list(set(b1).intersection(set(b2)))
      elif type_ == "diff1":
        return list(set(b2) - set(b1)) #vsetky knihy od id2, ktore id1 necital
      elif type_ == "diff2":
        return list(set(b1) - set(b2)) #vsetky knihy od id1, ktore id2 necital

    def getSimilarBooks(self,df:pd.DataFrame,id1:str,id2:str,type_:str) ->list:  #type ==(year, publisher)
        if type_ == "year": #odporucame id1 knihy od uzivatela id2 na zaklade roku vydania knihy
          books_s = self.getBooks(df,id1,id2,"same")
          #print(books_s)
          years = list()
          for i in range(len(books_s)): #zoberieme vseetky roky a pozrieme sa aky najviac prevlada -> zoberiem okolie (-5,5) a to odporucime
            pom = self.getMetaFromISBN(books_s[i])
            if pom != None: #nenaslo ISBN
              years.append(self.fromJsonToDict(pom)["year"])
          most_common_era = Counter(years).most_common(1) #rok,pocet_opakovani
          most_common_era = [int(most_common_era[0][0])-5,int(most_common_era[0][0])+5]
          #print(most_common_era)
          books_d = self.getBooks(df,id1,id2,"diff1")
          recommendation_by_year = list()
          for i in range(len(books_d)): #zistime info o vsetkych kniha id2 a vyberieme nazvy tych ktorych vydania su medzi most_common_era
            pom = self.getMetaFromISBN(books_d[i])
            if pom != None: #nenaslo ISBN
              if most_common_era[0] <= (int(self.fromJsonToDict(pom)["year"])) <= most_common_era[1]:
                title = self.fromJsonToDict(pom)["title"]
                year = self.fromJsonToDict(pom)["year"]
                authors = list()
                for i in range(len(self.fromJsonToDict(pom)["author"])):  #ak mame viac autorov
                  authors.append(self.fromJsonToDict(pom)["author"][i]['name'])
                recommendation_by_year.append([title,year, authors])
                #print(recommendation_by_year)
          if recommendation_by_year == []:
            recommendation_by_year = [("Nenasli sa ziadne knihy z rokov:",most_common_era[0],"-",most_common_era[1])]
          return recommendation_by_year

        elif type_ == "publisher": #odporucame id1 knihy od uzivatela id2, vyberame oblubeneho vydavatelstvo
           books_s = self.getBooks(df,id1,id2,"same")
           publishers = list()
           for i in range(len(books_s)): #zoberieme vseetky vydavatelstva -> most_common
              pom = self.getMetaFromISBN(books_s[i])
              if pom != None: #nenaslo ISBN
                publishers.append(self.fromJsonToDict(pom)["publisher"])
           most_common_publisher = Counter(publishers).most_common(1) #vydavatelstvo,pocet_opakovani
           print(most_common_publisher)
           books_d = self.getBooks(df,id1,id2,"diff1")
           recommendation_by_publisher = list()
           for i in range(len(books_d)): #zistime info o vsetkych kniha id2 a vyberieme nazvy tych ktorych vydania su od most_common_publishe
              pom = self.getMetaFromISBN(books_d[i])
              if pom != None: #nenaslo ISBN
                if most_common_publisher[0] == (self.fromJsonToDict(pom)["publisher"]):
                  title = self.fromJsonToDict(pom)["title"]
                  year = self.fromJsonToDict(pom)["year"]
                  authors = list()
                  for i in range(len(self.fromJsonToDict(pom)["author"])): #ak mame viac autorov
                    authors.append(self.fromJsonToDict(pom)["author"][i]['name'])
                  recommendation_by_publisher.append([title,year, authors])
                #print(recommendation_by_year)
           if recommendation_by_publisher == []:
              recommendation_by_publisher = ["Nenasli sa ziadne knihy od oblubeneho vydavatela"]
           return recommendation_by_publisher


In [80]:
re = RecommendationEngine('data/rec-amz-Books.edges', n=200, seed=76972)

In [81]:
print(f'number of users: {len(re.filterNodes(True))}')

number of users: 87085


['0060987103',
 '0061537934',
 '0307476073',
 '0316098329',
 '0575081384',
 '0788749730',
 '1439142009']

In [83]:
randomUser = np.random.choice(re.filterNodes(True))
randomBook = np.random.choice(re.filterNodes(False))
print(f'3 most similar users to {randomUser}: {re.getNMostSimilarUsers(randomUser, 3)}')
print(f'most similar books to {randomBook}: {re.getSimilarBooksToABook(randomBook)}')# vela podobnych znamena, ze kniha si nie je podobna so ziadnou knihou, teda je izolovana

3 most similar users to A2BWZNAJCDKQ96: ['A1FTBXHM3DJTEA' 'ARSDJ482Z8MGV' 'AI3AUM07TTPE1']
most similar books to 1477448616: ['B00DP8R3P4']


ukazkovy outupt:
most similar books to 1858600073: ['0786407735']
ku knihe "They Walked With Jesus: Past Life Experience With Christ", to naslo "Genocide and Rescue in Wolyn: Recollections of the Ukrainian Nationalist Ethnic Cleansing Campaign Against the Poles During World War II", co myslim nie je uplne odveci odporucanie (na to aku malu podmnozinu knih mame)

pri hladani podobnych uzivatelov ignoreTotalMatch paraemter True ignoruje dokonale podobnych pouzivatelov, ty bu nemali ake ine knihy odporucit
pri podobnych knihach, uz chceme ja dokolane zhody

In [84]:
display(re.similarityMatrix(False, save=True))
# pre users je matica moc velka...n neodporucam to skusat, a ani pre vela knih, to dlho trava..., lebo je tem vela user-ov, ale za cca min to zvladne

Unnamed: 0,006057531X,0060593237,0060937750,0060987103,0061120618,0061537934,0061774804,0061950726,0062022326,0071445137,...,B00C2L7N4G,B00CK8CKZS,B00DP8R3P4,B00GJ371PE,B00H39Y6ZQ,B00HCTPAXE,B00HG3COP8,B00KSBQI84,B00L6HH1O4,B00L7H0RXA
006057531X,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
0060593237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060937750,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060987103,0,0,1,0,0,23,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
0061120618,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00HCTPAXE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00HG3COP8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00KSBQI84,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00L6HH1O4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


pre cast niszie, novy seed asi dost upravil podmnozinu pouzivatelov a knih, ktora bola vybrana

In [85]:
meta_json_book = re.getMetaFromISBN("0061537934")
list_meta = re.fromJsonToDict(meta_json_book)
print(list_meta)
print(list_meta["publisher"])

print(re.fromJsonToDict(re.getMetaFromISBN(re.getSimilarBooksToABook("0061537934")[0])))

{'type': 'book', 'title': 'The Art Of Racing In The Rain', 'author': [{'name': 'Garth Stein'}], 'year': '2008', 'identifier': [{'type': 'ISBN', 'id': '9780061537936'}], 'publisher': 'Harper'}
Harper
{'type': 'book', 'title': 'The Eradication Dilemma', 'author': [{'name': 'William Wilkerson'}], 'year': '2011', 'identifier': [{'type': 'ISBN', 'id': '9781937387709'}], 'publisher': ''}


In [91]:
user = np.random.choice(list(map(lambda x: x[0], filter(lambda x: x[1] > 5, list(re.G.degree(re.filterNodes(True)))))))
print(re.getBooksByUser(re.getDf(),user))
# a, b, c = re.getNMostSimilarUsers(user, 3)
for u in re.getNMostSimilarUsers(user, 3):
    print(u)
    books_users = re.getBooks(re.getDf(),user,u,"same")
    print("same:" ,books_users)
    books_users = re.getBooks(re.getDf(),user,u,"diff1")
    print("diff1:" ,books_users)
    books_users = re.getBooks(re.getDf(),user,u,"diff2")
    print("diff2:" ,books_users)

['0061774804', '0061950726', '0316153990', '0345534034', '0385346433', '1433527073', '1439142009', '1477805060']
AYMSUHS3ZJTBJ
same: ['0061950726', '0345534034']
diff1: []
diff2: ['0316153990', '1433527073', '0061774804', '0385346433', '1439142009', '1477805060']
AZPNGUGH55MTO
same: ['0061950726', '1477805060']
diff1: ['1940026016']
diff2: ['0316153990', '0345534034', '1433527073', '0061774804', '0385346433', '1439142009']
A1WXJ1I42UOQ2V
same: ['0061950726', '0345534034']
diff1: []
diff2: ['0316153990', '1433527073', '0061774804', '0385346433', '1439142009', '1477805060']


In [87]:
print("Recommendation by year: ")
books_recomendation = re.getSimilarBooks(re.getDf(),"A2BWZNAJCDKQ96","A1FTBXHM3DJTEA","year")
print(books_recomendation)
print("Recommendation by publisher: ")
books_recomendation = re.getSimilarBooks(re.getDf(),"A2BWZNAJCDKQ96","A1FTBXHM3DJTEA","publisher")
print(books_recomendation)

Recommendation by year: 


IndexError: list index out of range

In [88]:
list(re.G.neighbors('1410419592'))

['A1DW24U3K62NY1',
 'A59XHO5Z2FJP4',
 'AM3BITQCZ3S4A',
 'A1420LW29L8NII',
 'A1ICRF8W0W0MY6',
 'A22WZLKAL6RPOH',
 'A2NAKEDQ5NUFF9',
 'A1A2YTFX2XC4O2',
 'A3JQ8HXEGPT64X',
 'A3TYDZG3KN5AML',
 'AD0WUBKBO21KK',
 'A3PURLPGPO4L1V',
 'A2T4I7BNNKFI8P',
 'A1WB2HH8N6UTMC',
 'AS5CK4SWL6ER5',
 'AOLUXXHMQ3N33',
 'A1PXV5ODSL6L27',
 'A1DHS99SF9F67M',
 'A1W64PMDWFG9JZ',
 'A3PVRA7GWG8QVN',
 'A3QC4SG7NA4ULG',
 'A24S9RAYBEQ2I3',
 'A8E6Y3LTPX9XO',
 'A2PN65B6BSTIYZ',
 'A3PM1RLHWQZCHG',
 'A15Q8SNK4ZRJNF',
 'A7QCIXEBG4Y9J',
 'ASPABROV8R7M5',
 'A2YO1YX0N7V4S1',
 'A2WBM4EMTRM1A',
 'ANUWZ013U3HJV',
 'A1AZGNMWEBMFC5',
 'A2YSR1GDPOLRU4',
 'A24S2WV5OPGXW6',
 'AHSNV2I27XTD9',
 'A1QXWM5P0CT45',
 'A2F6N60Z96CAJI',
 'A2EP2UJS9U0O6D',
 'A1UA86VEHA6BV3',
 'A3EP6YVAYS9YM3',
 'A3L9A0ALVMOZQO',
 'A1CBNUBPZPWH5D',
 'A8Z0MJUYOC75E',
 'A15ANBKUY4JSPD',
 'ADGSV25O4HTAB',
 'A24LFO50Q4F7YI',
 'A325JING4PW8FE',
 'A1SRN4FYFUIQOO',
 'A136XBSBZPZKLD',
 'A2N3URJ69JQYCM',
 'A3W2AUV3LLPI86',
 'AVDQOLXXHMUMM',
 'A1IL6W1NK05UW9',
 'A8T93X