In [1]:
import pandas as pd
import freeman as fm
import networkx as nx
import statsmodels.api as sm

from unidecode import unidecode
from itertools import combinations
from collections import defaultdict
from hashlib import md5

In [None]:
# Read the Goodreads export, discard the columns this notebook never reads,
# and keep only the rows in which every remaining field is present.
unused_columns = ["url", "bookImage", "bookAuthors", "bookDesc", "bookPages", "recommendations"]
data = (
    pd.read_csv("Goodreads_BestBooksEver_1-10000.csv")
    .drop(columns=unused_columns)
    .dropna()
    .reset_index(drop=True)
)
data.head()

In [None]:
# Genres arrive as one "|"-separated string per book; store them as a list of pieces
data["bookGenres"] = data["bookGenres"].map(lambda raw: str(raw).split("|"))

# ISBN as int
data["bookISBN"] = data["bookISBN"].map(int)

# Title as a readable string: stringify, strip double quotes, transliterate with unidecode
data["bookTitle"] = data["bookTitle"].map(lambda title: unidecode(str(title).replace("\"", "")))

# bookRating as float
data["bookRating"] = data["bookRating"].map(float)

# Counts are cast to int and then divided by their column total,
# so each column ends up holding shares that add up to 1
rating_counts = data["ratingCount"].map(int)
sum_ratings = rating_counts.sum()
data["ratingCount"] = rating_counts / sum_ratings

review_counts = data["reviewCount"].map(int)
sum_reviews = review_counts.sum()
data["reviewCount"] = review_counts / sum_reviews

In [None]:
# Preview the first rows after the type conversions and count normalisation
data.head()

In [None]:
# Number of top-voted genres that identify a book's genre profile
nGenres = 8

# Selecting only the books that have at least N genres.
# .copy() makes the filtered frame independent of the original one, so the
# columns added to `data` afterwards do not trigger SettingWithCopyWarning.
data = data[data["bookGenres"].map(len) >= nGenres].copy()

# Transforming list to a dict
def make_dic(glist):
    """Turn a list of ``"genre/votes"`` strings into a dict ordered by votes.

    Parameters
    ----------
    glist : iterable of str
        Entries such as ``"Fantasy/1,234"``. Thousands separators (",") in
        the vote count are removed before conversion.

    Returns
    -------
    dict
        ``{genre: votes}`` with integer votes, ordered from the most to the
        least voted genre. Entries with no "/" separator or with an empty
        vote count are skipped.

    Raises
    ------
    ValueError
        If a non-empty vote count is not a valid integer.
    """
    dic = {}

    for entry in glist:
        # Split on the LAST slash so a genre name that itself contains "/"
        # stays intact and the trailing piece is always the vote count.
        genre, sep, votes = entry.rpartition("/")

        votes = votes.replace(",", "").strip()
        if not sep or votes == "":
            # No separator or no vote count: nothing usable in this entry
            continue

        dic[genre] = int(votes)

    # sorted() is stable, so genres with equal votes keep their input order
    return dict(sorted(dic.items(), key=lambda item: item[1], reverse=True))

# One {genre: votes} dict per book, built from its list of genre strings
data["dictGenres"] = data["bookGenres"].apply(make_dic)

# Since we need to check the highestVotedGenres, we can transform them to a hash for easier use
def make_hash(d, n=None):
    """Return the MD5 hex digest identifying the first ``n`` keys of ``d``.

    The keys are taken in the dict's own order, converted with ``str`` and
    joined with ``", "``; the UTF-8 encoding of that string is hashed. The
    digest therefore depends on both which keys come first and their order.

    Parameters
    ----------
    d : dict
        Mapping whose leading keys are hashed.
    n : int, optional
        How many leading keys to use. Defaults to the module-level
        ``nGenres`` so existing ``make_hash(d)`` calls behave as before.

    Returns
    -------
    str
        32-character hexadecimal MD5 digest.
    """
    if n is None:
        n = nGenres

    leading_keys = list(d)[:n]
    joined = ", ".join(str(key) for key in leading_keys)

    return md5(joined.encode('utf-8')).hexdigest()

# dictGenres is already ordered by votes, so its leading keys are the highest voted genres
data["highestVotedGenres"] = data["dictGenres"].apply(make_hash)

# Total number of votes across all of a book's genres
data["genreVotes"] = data["dictGenres"].apply(lambda votes_by_genre: sum(votes_by_genre.values()))

In [None]:
# Keeping only columns we will use
kept_columns = [
    "bookTitle",
    "bookRating",
    "ratingCount",
    "reviewCount",
    "bookISBN",
    "highestVotedGenres",
    "genreVotes",
]

# Rows identical in every kept column collapse to their first occurrence
network = data[kept_columns].drop_duplicates()
network.head()

In [None]:
# Column-oriented copy of the frame: {column name: list of that column's values}
dic = network.to_dict("list")

def list_duplicates(seq):
    """Find the items of ``seq`` that occur more than once.

    Returns a generator of ``(item, positions)`` pairs, where ``positions``
    is the list of indices at which ``item`` appears, in ascending order.
    Items that appear only once are left out.
    """
    positions = {}

    for position, value in enumerate(seq):
        positions.setdefault(value, []).append(position)

    return ((value, where) for value, where in positions.items() if len(where) >= 2)

# Row positions of every repeated ISBN / repeated title, with keys in sorted order
idups = {isbn: rows for isbn, rows in sorted(list_duplicates(dic["bookISBN"]))}
bdups = {title: rows for title, rows in sorted(list_duplicates(dic["bookTitle"]))}

In [None]:
# Collapse repeated books into a single row each.
#
# A row whose ISBN is repeated is kept only the first time that ISBN is seen,
# and its genreVotes become the sum over every row with that ISBN. Otherwise,
# a row whose title is repeated is kept only the first time that title is
# seen, with genreVotes summed over every row with that title. All other rows
# are copied unchanged. The remaining columns always come from the kept row.

# Columns copied as-is from the kept row (genreVotes is handled separately)
passthrough_cols = [
    "bookTitle",
    "bookRating",
    "ratingCount",
    "reviewCount",
    "bookISBN",
    "highestVotedGenres",
]

new_dic = {col: [] for col in passthrough_cols}
new_dic["genreVotes"] = []

# Mirrors of new_dic["bookISBN"] / new_dic["bookTitle"]: set membership keeps
# the "already emitted?" checks O(1) instead of scanning a growing list.
seen_isbns = set()
seen_titles = set()

for i, (isbn, name) in enumerate(zip(dic["bookISBN"], dic["bookTitle"])):
    if isbn in idups:
        if isbn in seen_isbns:
            continue
        votes = sum(dic["genreVotes"][index] for index in idups[isbn])

    elif name in bdups:
        if name in seen_titles:
            continue
        votes = sum(dic["genreVotes"][index] for index in bdups[name])

    else:
        votes = dic["genreVotes"][i]

    for col in passthrough_cols:
        new_dic[col].append(dic[col][i])
    new_dic["genreVotes"].append(votes)

    seen_isbns.add(isbn)
    seen_titles.add(name)

network = pd.DataFrame.from_dict(new_dic)
network.head()

In [None]:
# Summary statistics of the raw (not yet normalised) genre vote totals
network["genreVotes"].describe()

In [None]:
# Minimum total of genre votes a book needs to stay in the network
MIN_GENRE_VOTES = 2500

# Cutting off books with fewer genre votes than the threshold.
# .copy() keeps the assignments below from writing into a slice of the
# previous frame (which would raise SettingWithCopyWarning).
network = network[network["genreVotes"] >= MIN_GENRE_VOTES].copy()

# Now we normalize
# Didnt do it beforehand to get a better describe on the genreVotes column
sum_genres = network["genreVotes"].sum()
network["genreVotes"] = network["genreVotes"] / sum_genres

# Resetting index
network = network.reset_index(drop=True)

# Popularity column: sum of the three normalised shares
network["Popularity"] = network["ratingCount"] + network["reviewCount"] + network["genreVotes"]

# Dropping unused columns
network = network.drop(columns=["ratingCount", "reviewCount", "genreVotes"])

In [None]:
# Preview the frame after the vote cut-off, with the Popularity column in place
network.head()

In [None]:
# For every distinct top-genres hash, collect the ISBNs of the books that share it
genres = network["highestVotedGenres"].unique()

rede = {
    genre_hash: list(network.loc[network["highestVotedGenres"] == genre_hash, "bookISBN"])
    for genre_hash in genres
}

In [None]:
# Getting all possible combinations: every pair of books inside the same genre-hash group
# https://stackoverflow.com/questions/464864/how-to-get-all-possible-combinations-of-a-list-s-elements
combs = [pair for isbns in rede.values() for pair in combinations(isbns, 2)]

In [None]:
# One edge per pair of books that share a genre hash
edges = list(combs)

# Every ISBN that appears in at least one edge. A set comprehension gives the
# unique values directly; the previous `not in books` scans over a list were
# quadratic and their ordering was discarded by set() anyway.
books = list({isbn for pair in edges for isbn in pair})

print(len(books))
print(len(edges))

In [None]:
# ISBN -> title lookup, built once so that writing a node does not have to
# filter the whole DataFrame. setdefault keeps the first title seen for an ISBN.
title_by_isbn = {}
for isbn, title in zip(network["bookISBN"], network["bookTitle"]):
    title_by_isbn.setdefault(isbn, title)

# Write the undirected graph as GML: one node per book (id = ISBN,
# label = title) and one edge per pair in `edges`.
# Plain "w" is enough: the file is only written here, never read back.
with open("books_genres.gml", "w") as file:
    file.write("graph [\n")
    file.write("    directed 0\n")

    for isbn in books:
        title = title_by_isbn[isbn]

        file.write("    node [\n")
        file.write(f"        id {isbn}\n")
        file.write(f'        label "{title}"\n')
        file.write("    ]\n")

    for source, target in edges:
        file.write("    edge [\n")
        file.write(f"        source {source}\n")
        file.write(f"        target {target}\n")
        file.write("    ]\n")

    file.write("]\n")

In [None]:
# Load books_genres.gml with freeman
g = fm.load("books_genres.gml")
# Set two display attributes on every node.
# NOTE(review): presumably 'labpos' = 'hover' shows the label on mouse-over and
# 'size' is the drawn node size — confirm against freeman's documentation.
for n in g.nodes:
    g.nodes[n]['labpos'] = 'hover'
    g.nodes[n]['size'] = 5
# NOTE(review): 'kamada_kawai' looks like the name of a layout algorithm used to
# position the nodes before drawing — confirm against freeman's documentation.
g.move('kamada_kawai')
g.draw()

In [None]:
def make_prox(book, dprox):
    """Return the value stored for ``book`` in ``dprox``, or None when it has no entry."""
    return dprox.get(book)
    
# Closeness centrality of every node of the graph, as a {node: value} mapping
prox = nx.closeness_centrality(g)

# Attach each book's centrality; books without an entry in `prox` get None
# and are removed by the dropna() that follows.
# NOTE(review): this relies on the graph's node keys being the ISBNs — confirm.
network["Proximity"] = network["bookISBN"].map(lambda isbn: make_prox(isbn, prox))
reg_data = network.dropna().reset_index(drop=True)

# Response and the two alternative sets of regressors
Y = reg_data["Popularity"]
X1 = reg_data[["bookRating", "Proximity"]]
X2 = reg_data["Proximity"]

In [None]:
# Ordinary least squares of Popularity on bookRating and Proximity.
# NOTE(review): X1 carries no constant column, so this fit has no intercept —
# confirm that is intended.
ols1 = sm.OLS(Y, X1)
model1 = ols1.fit()
p1 = model1.predict(X1)

model1.summary()

In [None]:
# Ordinary least squares of Popularity on Proximity alone.
# NOTE(review): X2 carries no constant column, so this fit has no intercept —
# confirm that is intended.
model2 = sm.OLS(Y, X2).fit()
# Predict with model2 itself: model1 was fitted on two regressors
# (bookRating, Proximity) and cannot be used to score the single-column X2.
p2 = model2.predict(X2)

model2.summary()