In [18]:
import pandas as pd
import freeman as fm
from unidecode import unidecode
from itertools import combinations
from collections import defaultdict, Counter

In [19]:
# Load the Goodreads export, discard the columns this analysis never reads,
# then keep only fully populated rows and renumber them from zero.
data = (
    pd.read_csv("Goodreads_BestBooksEver_1-10000.csv")
    .drop(columns=["url", "bookImage", "bookAuthors", "bookDesc", "bookPages", "recommendations"])
    .dropna(how="any")
    .reset_index(drop=True)
)

In [20]:
# Genres arrive as one "|"-separated string per book; turn each into a list.
data["bookGenres"] = data["bookGenres"].map(lambda raw: str(raw).split("|"))
data["bookISBN"] = data["bookISBN"].map(int)
# Titles: force to str, strip double quotes, then transliterate to ASCII.
data["bookTitle"] = data["bookTitle"].map(lambda raw: unidecode(str(raw).replace("\"", "")))
data["bookRating"] = data["bookRating"].map(float)
data["ratingCount"] = data["ratingCount"].map(int)
data["reviewCount"] = data["reviewCount"].map(int)

In [21]:
def make_dic(glist):
    """Turn a list of "Genre/votes" strings into a {genre: votes} dict.

    Parameters
    ----------
    glist : iterable of str
        Entries such as "Young Adult/31,498". The vote count may contain
        thousands separators and may be empty ("Poetry/").

    Returns
    -------
    dict
        Genre name -> vote count (int), ordered from most to least voted.
        Entries with an empty or missing vote count are stored as 0.
        Blank entries are ignored.
    """
    dic = {}
    for gc in glist:
        if not gc:
            # A blank entry carries no genre at all; skip it instead of failing.
            continue

        parts = gc.split("/")
        genre = parts[0]
        # An entry without "/" has no vote field; treat it like an empty count
        # rather than raising IndexError.
        votes = parts[1] if len(parts) > 1 else ""

        votes = votes.replace(",", "")
        dic[genre] = int(votes) if votes != "" else 0

    # sorted() is stable, so genres with equal votes keep their input order.
    return dict(sorted(dic.items(), key=lambda item: item[1], reverse=True))

def getHighestGenres(d):
    """Return the genre names (the keys of ``d``) as a frozenset.

    Note: despite the name, no filtering by vote count happens here;
    every key of the mapping ends up in the result.
    """
    return frozenset(d.keys())

# Per-book genre dictionary, the set of its genre names, and the total votes.
data["dictGenres"] = data["bookGenres"].map(make_dic)
data["highestVotedGenres"] = data["dictGenres"].map(getHighestGenres)
data["genreVotes"] = data["dictGenres"].map(lambda votes_by_genre: sum(votes_by_genre.values()))
# Keep only books that were tagged with at least 10 distinct genres.
data = data[data["highestVotedGenres"].map(len).ge(10)]
data

Unnamed: 0,bookTitle,bookRating,ratingCount,reviewCount,bookGenres,bookISBN,dictGenres,highestVotedGenres,genreVotes
0,The Hunger Games,4.32,6717635,176054,"[Young Adult/31,498, Fiction/17,878, Science F...",9780439023481,"{'Young Adult': 31498, 'Fiction': 17878, 'Scie...","(Fiction, Fantasy, Science Fiction-Dystopia, Y...",103407
1,Twilight,3.61,5231000,107619,"[Young Adult/19,982, Fantasy/19,312, Romance/1...",9780316015844,"{'Young Adult': 19982, 'Fantasy': 19312, 'Roma...","(Fiction, Fantasy, Young Adult, Romance, Paran...",80856
2,The Book Thief,4.38,1954165,117307,"[Historical-Historical Fiction/20,161, Fiction...",9780375831003,"{'Historical-Historical Fiction': 20161, 'Fict...","(Fiction, War-World War II, World War II-Holoc...",59070
3,Animal Farm,3.96,2926888,61574,"[Classics/34,158, Fiction/19,552, Science Fict...",9780452284241,"{'Classics': 34158, 'Fiction': 19552, 'Science...","(Fiction, Fantasy, Science Fiction-Dystopia, L...",73590
4,The Chronicles of Narnia,4.26,548649,10743,"[Fantasy/11,823, Classics/4,396, Fiction/3,903...",9780066238500,"{'Fantasy': 11823, 'Classics': 4396, 'Fiction'...","(Fiction, Fantasy, Childrens-Middle Grade, You...",26376
...,...,...,...,...,...,...,...,...,...
8462,Civil War: A Marvel Comics Event,4.08,39098,1811,"[Sequential Art-Comics/1,898, Sequential Art-G...",9780785121794,"{'Sequential Art-Comics': 1898, 'Sequential Ar...","(Fiction, Superheroes-Marvel, Fantasy, Graphic...",4324
8463,Peter the Great: His Life and World,4.09,18746,807,"[History/1,022, Biography/718, Nonfiction/386,...",9781842121160,"{'History': 1022, 'Biography': 718, 'Nonfictio...","(History-Russian History, History, Nonfiction,...",2660
8464,"Owl at Home (I Can Read, Level 2)",4.20,6402,332,"[Childrens/108, Childrens-Picture Books/103, F...",9780064440349,"{'Childrens': 108, 'Childrens-Picture Books': ...","(Fiction, Short Stories, Animals, Childrens-Ch...",351
8465,The People in the Trees,3.71,20385,2952,"[Fiction/794, Historical-Historical Fiction/28...",9780385536776,"{'Fiction': 794, 'Historical-Historical Fictio...","(Fiction, Fantasy, Contemporary, Magical Reali...",1732


In [22]:
# Project onto the columns used to build the network and discard rows that are
# exact repeats across all of them (the last occurrence of each is kept).
network = data.loc[
    :,
    ["bookTitle", "bookRating", "ratingCount", "reviewCount", "bookISBN", "highestVotedGenres", "genreVotes"],
].drop_duplicates(keep='last')

In [23]:
# Column-oriented snapshot: {column name: list of values in row order}.
dic = network.to_dict(orient="list")

def list_duplicates(seq):
    """Yield ``(value, positions)`` for every value occurring more than once.

    ``positions`` is the list of indices in ``seq`` where the value appears,
    in ascending order. The result is a lazy generator.
    """
    positions = {}
    for position, value in enumerate(seq):
        positions.setdefault(value, []).append(position)
    return (
        (value, where)
        for value, where in positions.items()
        if len(where) >= 2
    )

# Repeated ISBNs / titles mapped to the row positions where they occur,
# with keys inserted in ascending order.
idups = {isbn: rows for isbn, rows in sorted(list_duplicates(dic["bookISBN"]))}
bdups = {title: rows for title, rows in sorted(list_duplicates(dic["bookTitle"]))}

In [24]:
# Collapse repeated books into a single row each.
#
# A row is a repeat when its ISBN occurs more than once (idups) or, failing
# that, when its title occurs more than once (bdups). The first occurrence
# supplies title, rating, counts, ISBN and genres; genreVotes becomes the sum
# over every row position listed for that ISBN (or title).
#
# NOTE(review): when the repeated rows carry the same genre tally the sum
# multiplies it (the rendered tables show The Hunger Games going from 103407
# to 206814) -- confirm that summing, rather than taking one value, is intended.
new_dic = {
    "bookTitle": [],
    "bookRating": [],
    "ratingCount": [],
    "reviewCount": [],
    "bookISBN": [],
    "highestVotedGenres": [],
    "genreVotes": []
}

# Mirrors of new_dic["bookISBN"] / new_dic["bookTitle"], kept as sets so the
# "already emitted?" checks do not rescan a growing list on every row.
seen_isbns = set()
seen_titles = set()

for i, (isbn, name) in enumerate(zip(dic["bookISBN"], dic["bookTitle"])):
    if isbn in idups:
        if isbn in seen_isbns:
            continue
        votes = sum(dic["genreVotes"][index] for index in idups[isbn])
    elif name in bdups:
        if name in seen_titles:
            continue
        votes = sum(dic["genreVotes"][index] for index in bdups[name])
    else:
        votes = dic["genreVotes"][i]

    new_dic["bookTitle"].append(name)
    new_dic["bookRating"].append(dic["bookRating"][i])
    new_dic["ratingCount"].append(dic["ratingCount"][i])
    new_dic["reviewCount"].append(dic["reviewCount"][i])
    new_dic["bookISBN"].append(isbn)
    new_dic["highestVotedGenres"].append(tuple(dic["highestVotedGenres"][i]))
    new_dic["genreVotes"].append(votes)

    seen_isbns.add(isbn)
    seen_titles.add(name)

network = pd.DataFrame.from_dict(new_dic)
network

Unnamed: 0,bookTitle,bookRating,ratingCount,reviewCount,bookISBN,highestVotedGenres,genreVotes
0,The Hunger Games,4.32,6717635,176054,9780439023481,"(Fiction, Fantasy, Science Fiction-Dystopia, Y...",206814
1,Twilight,3.61,5231000,107619,9780316015844,"(Fiction, Fantasy, Young Adult, Romance, Paran...",85219
2,The Book Thief,4.38,1954165,117307,9780375831003,"(Fiction, War-World War II, World War II-Holoc...",177210
3,Animal Farm,3.96,2926888,61574,9780452284241,"(Fiction, Fantasy, Science Fiction-Dystopia, L...",294360
4,The Chronicles of Narnia,4.26,548649,10743,9780066238500,"(Fiction, Fantasy, Childrens-Middle Grade, You...",26376
...,...,...,...,...,...,...,...
7836,Civil War: A Marvel Comics Event,4.08,39098,1811,9780785121794,"(Fiction, Superheroes-Marvel, Fantasy, Graphic...",4324
7837,Peter the Great: His Life and World,4.09,18746,807,9781842121160,"(History-Russian History, History, Nonfiction,...",2660
7838,"Owl at Home (I Can Read, Level 2)",4.20,6402,332,9780064440349,"(Fiction, Short Stories, Animals, Childrens-Ch...",351
7839,The People in the Trees,3.71,20385,2952,9780385536776,"(Fiction, Fantasy, Contemporary, Magical Reali...",1732


In [25]:
# Summary statistics (count, mean, std, quartiles, extremes) of the per-book
# genre vote totals.
network["genreVotes"].describe()

count      7841.000000
mean       4930.010203
std       11755.443468
min          37.000000
25%         949.000000
50%        2111.000000
75%        4743.000000
max      313044.000000
Name: genreVotes, dtype: float64

In [27]:
# Minimum total genre votes a book needs to stay in the network.
MIN_GENRE_VOTES = 2500

# Built as one non-mutating chain: the original assigned a new column on a
# filtered slice (which can raise SettingWithCopyWarning) after an in-place
# reset_index. The resulting frame is the same.
network = (
    network[network["genreVotes"] >= MIN_GENRE_VOTES]
    .reset_index(drop=True)
    # Popularity is the plain row-wise mean of the three count columns.
    .assign(
        Popularity=lambda frame: frame[["ratingCount", "reviewCount", "genreVotes"]].mean(axis=1)
    )
    .drop(columns=["ratingCount", "reviewCount", "genreVotes"])
)

In [29]:
# Display the current network table.
network

Unnamed: 0,bookTitle,bookRating,bookISBN,highestVotedGenres,Popularity
0,The Hunger Games,4.32,9780439023481,"(Fiction, Fantasy, Science Fiction-Dystopia, Y...",2.366834e+06
1,Twilight,3.61,9780316015844,"(Fiction, Fantasy, Young Adult, Romance, Paran...",1.807946e+06
2,The Book Thief,4.38,9780375831003,"(Fiction, War-World War II, World War II-Holoc...",7.495607e+05
3,Animal Farm,3.96,9780452284241,"(Fiction, Fantasy, Science Fiction-Dystopia, L...",1.094274e+06
4,The Chronicles of Narnia,4.26,9780066238500,"(Fiction, Fantasy, Childrens-Middle Grade, You...",1.952560e+05
...,...,...,...,...,...
3473,Match Me If You Can,4.13,9780060734565,"(Fiction, Womens Fiction-Chick Lit, Romance-Co...",1.622067e+04
3474,"Scott Pilgrim, Volume 5: Scott Pilgrim vs. the...",4.33,9781934964101,"(Fiction, Fantasy, Young Adult, Graphic Novels...",1.296433e+04
3475,Pompeii,3.82,9780812974614,"(Fiction, Roman, Cultural-Italy, Thriller, His...",1.436900e+04
3476,Civil War: A Marvel Comics Event,4.08,9780785121794,"(Fiction, Superheroes-Marvel, Fantasy, Graphic...",1.507767e+04


In [30]:
# Group ISBNs by their set of genres.
#
# The genre column holds tuples that were produced from frozensets, and the
# iteration order of two equal sets is not guaranteed to match. Using the raw
# tuple as the key could therefore put books with the very same genres into
# different groups; keying by frozenset makes the grouping order-independent.
rede1 = {}

for genres, isbn in zip(network["highestVotedGenres"], network["bookISBN"]):
    rede1.setdefault(frozenset(genres), []).append(isbn)

# Build every unordered pair of books inside each genre group
# https://stackoverflow.com/questions/464864/how-to-get-all-possible-combinations-of-a-list-s-elements
combs1 = []

for isbns in rede1.values():
    combs1.extend(combinations(isbns, 2))

len(combs1)

1920

In [31]:
# One edge per pair, and the distinct ISBNs that take part in any pair.
edges = [(pair[0], pair[1]) for pair in combs1]
books = list({isbn for edge in edges for isbn in edge})

print(len(books))
print(len(edges))

1119
1920


In [None]:
# Resolve titles once up front instead of filtering the DataFrame for every
# node. setdefault keeps the first title seen for an ISBN, matching the
# previous "first matching row" lookup.
title_by_isbn = {}
for isbn, title in zip(network["bookISBN"], network["bookTitle"]):
    title_by_isbn.setdefault(isbn, title)

# Write an undirected graph in GML: one node per book (id = ISBN,
# label = title) and one edge per pair in `edges`.
with open("books_genres.gml", "w") as file:
    file.write("graph [\n")
    file.write("    directed 0\n")

    for isbn in books:
        title = title_by_isbn[isbn]
        file.write(
            "    node [\n"
            f"        id {isbn}\n"
            f'        label "{title}"\n'
            "    ]\n"
        )

    for source, target in edges:
        file.write(
            "    edge [\n"
            f"        source {source}\n"
            f"        target {target}\n"
            "    ]\n"
        )

    file.write("]\n")

In [None]:
# Load the GML file written above with freeman and give every node the same
# label placement and size before laying out and drawing the graph.
g = fm.load("books_genres.gml")
# g.label_nodes()
for node in g.nodes:
    attributes = g.nodes[node]
    attributes['labpos'] = 'hover'  # presumably shows the label on mouse-over; see freeman docs
    attributes['size'] = 5
g.move('kamada_kawai')
g.draw()

In [None]:
# Coreness code adapted from https://github.com/isnotinvain/nodens/blob/master/coreness.py

from scipy import optimize
# pearsonr is imported from the public scipy.stats namespace;
# scipy.stats.stats is a deprecated private module path.
from scipy.stats import pearsonr
import numpy
import networkx as nx

def core_correlation(A, C):
    """Correlate an adjacency matrix with the pattern implied by coreness scores.

    Parameters
    ----------
    A : array-like, shape (n, n)
        Adjacency matrix (ndarray, numpy.matrix or nested lists).
    C : array-like, length n
        One coreness score per node.

    Returns
    -------
    The result of ``scipy.stats.pearsonr`` between the flattened ``A`` and the
    flattened outer product ``C[i] * C[j]``; element ``[0]`` is the
    correlation coefficient.
    """
    scores = numpy.asarray(C).ravel()
    # Outer product gives the n x n matrix of C[i] * C[j] without going through
    # numpy.matrix, which NumPy no longer recommends.
    ideal = numpy.outer(scores, scores).ravel()
    observed = numpy.asarray(A).ravel()
    return pearsonr(observed, ideal)

def _core_fitness(C, *args):
    """Objective for the minimiser: the negated core correlation.

    ``args[0]`` is the adjacency matrix; any further extra arguments are
    ignored. Negating turns "maximise the correlation" into a minimisation.
    """
    adjacency = args[0]
    correlation = core_correlation(adjacency, C)[0]
    return correlation * -1.0

def get_optimization(G, seed=None):
    """Search for per-node coreness scores in [0, 1] that best fit the graph.

    Parameters
    ----------
    G : graph
        Graph accepted by ``networkx.to_numpy_array``.
    seed : int, optional
        Seed for the random starting point. With the default ``None`` the
        global ``numpy.random`` state is used, as before.

    Returns
    -------
    The raw return value of ``scipy.optimize.fmin_l_bfgs_b``; element ``[0]``
    holds the optimised scores and element ``[1]`` the minimised objective
    (the negated correlation).
    """
    # to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array returns the
    # same adjacency values as a plain ndarray.
    A = nx.to_numpy_array(G)
    node_count = len(A)

    # need a starting point for the optimizer, for now using a random starting point.
    # can we do better? Is it important? Maybe use constraint or centrality?
    rng = numpy.random if seed is None else numpy.random.RandomState(seed)
    initialC = rng.rand(node_count)

    # run an L-BFGS-B optimizer that maximises the correlation between the
    # candidate coreness scores and the adjacency matrix (gradient is
    # approximated numerically, each score bounded to [0, 1])
    best = optimize.fmin_l_bfgs_b(
        _core_fitness,
        initialC,
        args=(A, None),
        approx_grad=True,
        bounds=[(0.0, 1.0)] * node_count,
    )

    return best

# Run the optimiser on the loaded graph; `best` is the raw fmin_l_bfgs_b result.
best = get_optimization(g)

In [None]:
def coreness(G, best, return_correlation=False):
    """Map each node of ``G`` to its optimised coreness score.

    Parameters
    ----------
    G : graph
        Iterable over nodes that also exposes ``G.nodes()``.
    best : sequence
        Optimiser result; ``best[0]`` holds one score per node in the order of
        ``G.nodes()`` and ``best[1]`` the minimised (negated) correlation.
    return_correlation : bool, default False
        When true, also return the correlation with its sign restored.

    Returns
    -------
    dict, or (dict, float) when ``return_correlation`` is true.
    """
    # Compute each node's position once instead of rebuilding the node list and
    # scanning it for every node. setdefault keeps the first position, exactly
    # like list.index did.
    position = {}
    for idx, node in enumerate(G.nodes()):
        position.setdefault(node, idx)

    scores = best[0]
    part = {node: scores[position[node]] for node in G}

    # return correlation to ideal if return_correlation is set
    if return_correlation:
        return part, best[1] * -1.0
    return part

# Node -> coreness score for the loaded graph, taken from the optimiser result.
cent = coreness(g, best)