In [2]:
import pandas as pd
import nltk
import gensim
from gensim import downloader
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from scipy.spatial import distance
import re
import numpy as np

# Pretrained 'word2vec-google-news-300' embeddings fetched through gensim's downloader.
# The variable is called glove_vectors although the model loaded is word2vec; the
# name is kept because later cells look it up.
glove_vectors = downloader.load('word2vec-google-news-300')
# English stop words held in a set so membership checks during token filtering are cheap.
stop_words = set(stopwords.words('english'))

In [3]:
def Compute_ISBN_to_Title(df_merged):
    """Build a lookup table from ISBN to book title.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Frame containing at least the columns "ISBN" and "Book-Title".

    Returns
    -------
    dict
        Maps each value of "ISBN" to the "Book-Title" found in the same row.
        When an ISBN occurs in several rows, the last such row wins.
    """
    # Pair the two columns positionally rather than looking every row up by its
    # index label: a label lookup returns a Series (not a scalar) when the index
    # contains duplicates, which then fails as an unhashable dictionary key.
    return dict(zip(df_merged["ISBN"], df_merged["Book-Title"]))

def merge_book_and_summary(df_summary, df_books):
    """Inner-join summaries with book metadata on title and trim the result.

    Parameters
    ----------
    df_summary : pandas.DataFrame
        Summary table; joined on its "Book-Title" column.
    df_books : pandas.DataFrame
        Book metadata table; joined on its "Book-Title" column.

    Returns
    -------
    pandas.DataFrame
        The joined rows with the code, publisher, image-URL and duplicated
        author/year columns removed, and without any row whose title is
        'Deathstalker Rebellion'.
    """
    unwanted_columns = [
        'Code1', 'Code2', 'Code3', 'Publisher', 'Year-Of-Publication_x',
        'Image-URL-S', 'Image-URL-M', 'Image-URL-L',
        'Book-Author_y', 'Year-Of-Publication_y',
    ]
    joined = df_summary.merge(df_books, how='inner', on=['Book-Title'])
    trimmed = joined.drop(columns=unwanted_columns)
    # This single title is excluded explicitly; the reason is not recorded here.
    keep_row = trimmed['Book-Title'] != 'Deathstalker Rebellion'
    return trimmed.loc[keep_row]

def compute_Book2Vec(df_merged, vectors=None, stopword_set=None):
    """Embed every book summary as the mean of its word vectors.

    For each summary, digits are stripped, the text is split into runs of word
    characters (which also discards punctuation), and stop words are removed
    case-insensitively. The remaining tokens are looked up in ``vectors`` and
    averaged. Tokens missing from ``vectors`` contribute a zero vector but
    still count towards the average.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Frame with a "Book-Summary" column.
    vectors : mapping from token to vector, optional
        Anything supporting ``vectors[token]`` that raises ``KeyError`` for
        unknown tokens. Defaults to the module-level ``glove_vectors``.
    stopword_set : container of str, optional
        Lower-case words to discard. Defaults to the module-level ``stop_words``.

    Returns
    -------
    list of numpy.ndarray
        One vector per row of ``df_merged``, in row order. A summary that is
        missing (not a string) or that has no tokens left after filtering
        yields an all-zero vector instead of NaN.
    """
    if vectors is None:
        vectors = glove_vectors
    if stopword_set is None:
        stopword_set = stop_words

    # Use the model's own dimensionality when it exposes one; plain mappings
    # fall back to 300, the size of the 'word2vec-google-news-300' model.
    dim = getattr(vectors, 'vector_size', 300)

    # Compiled once instead of rebuilding a tokenizer for every row. Finding all
    # r'\w+' matches yields the same tokens as nltk's RegexpTokenizer(r'\w+').
    digit_pattern = re.compile(r'\d+')
    word_pattern = re.compile(r'\w+')

    summary_list = []
    for summary in df_merged['Book-Summary']:
        if not isinstance(summary, str):
            # Missing values (e.g. NaN from read_csv) are treated as empty text.
            summary = ''
        tokens = word_pattern.findall(digit_pattern.sub('', summary))
        kept_tokens = [tok for tok in tokens if tok.lower() not in stopword_set]

        total = np.zeros(dim)
        for tok in kept_tokens:
            try:
                total += vectors[tok]
            except KeyError:
                # Out-of-vocabulary token: adds nothing to the sum, but remains
                # in the divisor below so results match the previous behavior.
                pass

        if kept_tokens:
            total /= len(kept_tokens)
        # With no tokens the zero vector is kept as-is, avoiding a 0/0 division.
        summary_list.append(total)

    return summary_list

# Disabled one-off preprocessing, kept as a bare string literal so it never runs.
# If executed it would read Books.csv and the tab-separated booksummaries.txt, name
# the summary columns, join the two tables with merge_book_and_summary, and write
# the result to Books_Summary_merged.csv.
'''
df_books = pd.read_csv("Books.csv")
df_summary = pd.read_csv('booksummaries.txt', sep = '\t')
df_summary.columns = ['Code1', 'Code2', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Code3', 'Book-Summary']
df_merged = merge_book_and_summary(df_summary, df_books)
df_merged.to_csv('Books_Summary_merged.csv')
'''

# Load the summaries frame and embed every "Book-Summary" with compute_Book2Vec;
# summary_list holds one vector per row, in row order.
# NOTE(review): the file name suggests this CSV already carries BERT-based columns
# produced elsewhere - confirm its provenance.
df_summary = pd.read_csv('book-summary-group-bert.csv')
summary_list = compute_Book2Vec(df_summary)
# Left disabled: df_merged is only created inside the string-literal block above,
# so ISBN_to_Title is not built by this cell.
#ISBN_to_Title = Compute_ISBN_to_Title(df_merged)

In [4]:
# Preview only the first few vectors; echoing the whole list prints every
# 300-element array and floods the notebook output.
summary_list[:3]

[array([ 4.51489369e-02,  6.01137093e-02, -1.26928838e-02,  4.32012652e-02,
        -1.83839328e-02,  9.67633259e-03,  1.06974692e-03, -7.39854382e-02,
         6.42520406e-02,  6.98175608e-02,  1.48870585e-02, -9.51987041e-02,
        -1.20863728e-02,  7.46699335e-02, -8.84843836e-02,  4.44471298e-02,
         3.73266638e-02,  1.14315298e-01,  2.88187684e-02, -7.15396214e-02,
         3.24258497e-02,  3.88648417e-02,  2.77610137e-02,  7.89003097e-03,
         3.32174512e-02, -1.00668825e-01, -5.01051160e-02,  6.38020877e-02,
         2.82911809e-02,  9.84927794e-04,  5.53220156e-05, -2.52107708e-02,
        -3.94478816e-02,  2.42501277e-02, -1.44681882e-02, -3.16978821e-02,
         4.90595651e-02,  1.17621851e-02, -1.06265096e-02,  5.85754084e-02,
         7.49068924e-02, -5.43385167e-02,  8.88797067e-02, -1.39013333e-02,
        -9.81881056e-03, -3.25640977e-02, -6.87814071e-02, -2.16237798e-02,
         4.11763216e-03,  2.53305646e-02, -2.47913192e-02,  6.88307216e-02,
        -1.7

In [15]:
# Inspect the loaded frame before post-processing; note the leftover "Unnamed: *"
# columns alongside the title, author, summary, ISBN, group and vector columns.
df_summary

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Book-Title,Book-Author_x,Book-Summary,ISBN,group,vector
0,0,0,A Clockwork Orange,Anthony Burgess,"Alex, a teenager living in near-future Englan...",345316413X,23,"[-0.29636892676353455, 0.8470680117607117, 0.7..."
1,5,5,The Plague,Albert Camus,The text of The Plague is divided into five p...,0075536498,47,"[0.2956414222717285, 1.319756031036377, 0.3554..."
2,7,7,An Enquiry Concerning Human Understanding,David Hume,The argument of the Enquiry proceeds by a ser...,0872202291,5,"[1.2887966632843018, 1.2676935195922852, 1.117..."
3,8,8,All Quiet on the Western Front,Erich Maria Remarque,"The book tells the story of Paul Bäumer, a Ge...",0449213943,8,"[0.2628381550312042, 1.452850103378296, 1.0604..."
4,12,12,A Wizard of Earthsea,Ursula K. Le Guin,"Ged is a young boy on Gont, one of the larger...",0689317204,32,"[-0.12000640481710434, 1.7298859357833862, 0.7..."
...,...,...,...,...,...,...,...,...
5845,13775,13776,Gates of Paradise,V. C. Andrews,"A novel about Annie Stonewall, the daughter o...",0671670646,14,"[0.37652456760406494, 1.5239266157150269, 0.91..."
5846,13777,13778,Shelter,Harlan Coben,"After Mickey Bolitar moves in with his uncle,...",0385313896,11,"[-0.22981029748916626, 1.2231945991516113, 1.8..."
5847,13780,13781,Remote Control,Andy McNab,The series follows the character of Nick Ston...,0345428056,43,"[0.23090682923793793, 0.9125347137451172, 0.59..."
5848,13784,13785,The Simpsons: A Complete Guide to Our Favorite...,Matt Groening,"{| class=""wikitable"" |- !Seasons covered !Boo...",0060952520,3,"[0.0948098748922348, 1.4162523746490479, 0.936..."


In [22]:
#Process dataframe
# Remove two of the leftover "Unnamed" columns, relabel the existing "vector" column
# as "BERT", and attach the word2vec summary vectors as a new column.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone and a plain drop would raise KeyError. The rename and the column assignment
# are already safe to repeat.
df_summary = (
    df_summary
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
    .rename(columns={"vector": "BERT"})
)
# summary_list must hold exactly one vector per row, in the same row order.
df_summary["word2vec"] = summary_list
df_summary

Unnamed: 0,Book-Title,Book-Author_x,Book-Summary,ISBN,group,BERT,word2vec
0,A Clockwork Orange,Anthony Burgess,"Alex, a teenager living in near-future Englan...",345316413X,23,"[-0.29636892676353455, 0.8470680117607117, 0.7...","[0.045148936920943204, 0.060113709325094175, -..."
1,The Plague,Albert Camus,The text of The Plague is divided into five p...,0075536498,47,"[0.2956414222717285, 1.319756031036377, 0.3554...","[0.05818041191726434, 0.0684539834006888, 0.00..."
2,An Enquiry Concerning Human Understanding,David Hume,The argument of the Enquiry proceeds by a ser...,0872202291,5,"[1.2887966632843018, 1.2676935195922852, 1.117...","[0.06308863513119571, 0.024078914002105073, 0...."
3,All Quiet on the Western Front,Erich Maria Remarque,"The book tells the story of Paul Bäumer, a Ge...",0449213943,8,"[0.2628381550312042, 1.452850103378296, 1.0604...","[0.08981435439165901, 0.08089733888758695, 0.0..."
4,A Wizard of Earthsea,Ursula K. Le Guin,"Ged is a young boy on Gont, one of the larger...",0689317204,32,"[-0.12000640481710434, 1.7298859357833862, 0.7...","[0.07931866853133492, 0.07550479104553444, -0...."
...,...,...,...,...,...,...,...
5845,Gates of Paradise,V. C. Andrews,"A novel about Annie Stonewall, the daughter o...",0671670646,14,"[0.37652456760406494, 1.5239266157150269, 0.91...","[0.02671554240774601, -0.012985878802360372, -..."
5846,Shelter,Harlan Coben,"After Mickey Bolitar moves in with his uncle,...",0385313896,11,"[-0.22981029748916626, 1.2231945991516113, 1.8...","[0.07016823508522728, 0.04757967862215909, -0...."
5847,Remote Control,Andy McNab,The series follows the character of Nick Ston...,0345428056,43,"[0.23090682923793793, 0.9125347137451172, 0.59...","[0.04248046875, 0.0719638400607639, -0.0039070..."
5848,The Simpsons: A Complete Guide to Our Favorite...,Matt Groening,"{| class=""wikitable"" |- !Seasons covered !Boo...",0060952520,3,"[0.0948098748922348, 1.4162523746490479, 0.936...","[-0.021460793235085228, -0.04207264293323864, ..."


In [24]:
# Persist the combined frame. index=False stops pandas from writing the row index as
# an extra unnamed column, which otherwise comes back as "Unnamed: 0" on the next
# read_csv and piles up across save/load round-trips.
# NOTE(review): array-valued cells are written as their string representation and
# must be parsed back after loading - confirm that downstream readers handle this.
df_summary.to_csv("book-vec-group.csv", index=False)

In [104]:
def get_mostSimilar_title(title, summary_dict, books_distance):
    """Return the title of the book closest to ``title``.

    Parameters
    ----------
    title : hashable
        Key of ``summary_dict`` identifying the query book.
    summary_dict : dict
        Its keys, in iteration order, label the rows and columns of
        ``books_distance``.
    books_distance : indexable
        ``books_distance[i]`` must give the distances from book ``i`` to every
        book, in the same order as the keys of ``summary_dict``.
        NOTE(review): presumably a square distance matrix built elsewhere - confirm.

    Returns
    -------
    The key of the nearest *other* book. The query book itself and NaN distances
    are never selected, unless no other candidate exists, in which case the
    first key is returned.

    Raises
    ------
    ValueError
        If ``title`` is not a key of ``summary_dict``.
    """
    titles = list(summary_dict)
    query_idx = titles.index(title)

    # Work on a float copy so the caller's distance data is never modified.
    row = np.array(books_distance[query_idx], dtype=float)
    # A book is at distance 0 from itself, so an unmasked argmin would simply hand
    # the query back. Masking is harmless if the diagonal was already excluded.
    row[query_idx] = np.inf
    # argmin treats NaN as the minimum; push undefined distances to the back.
    row[np.isnan(row)] = np.inf

    return titles[int(np.argmin(row))]

def get_mostSimilar_isbn(isbn, summary_dict, books_distance, ISBN_to_Title):
    """Find the most similar book for a query given by ISBN.

    The ISBN is translated to its title through ``ISBN_to_Title`` and the search
    is delegated to ``get_mostSimilar_title`` with the same ``summary_dict`` and
    ``books_distance``. A ``KeyError`` is raised if ``isbn`` is not a key of
    ``ISBN_to_Title``.
    """
    return get_mostSimilar_title(ISBN_to_Title[isbn], summary_dict, books_distance)

# Example query: the book most similar to ISBN 345316413X.
# NOTE(review): summary_dict, books_distance and ISBN_to_Title are not defined in any
# cell visible here (the ISBN_to_Title assignment earlier is commented out), so this
# call appears to depend on leftover kernel state - confirm it survives Restart & Run All.
get_mostSimilar_isbn("345316413X", summary_dict, books_distance, ISBN_to_Title)

'Little Boy Blue'