In [1]:
import json
import os

import datetime
from math import log10, sqrt
import string
from collections import namedtuple, defaultdict, Counter

from load import PrepareBooks
from txtai.embeddings import Embeddings

from IPython.display import display, HTML
import re

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     D:\Users\26101742\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the book records through the project-local PrepareBooks helper.
obj = PrepareBooks()
obj.loadBooks("data/book_best.jl")

# `books` is looked up by Goodreads URL in later cells (e.g. display_summary).
books = obj.books
# NOTE(review): getDescription() appears to return one descriptive string per
# book (see the preview below) — confirm against load.py.
data = obj.getDescription()
# Preview the first three descriptions.
data[:3]

['Harry Potter and the Goblet of Fire (Harry Potter, #4) by J.K. Rowling, Jim Kay, Mary GrandPré. Places are Scotland, United Kingdom, Hogwarts School of Witchcraft and Wizardry.             The characters are Cho Chang, Molly Weasley, Hermione Granger, Neville Longbottom, Arthur Weasley, Sybil Trelawney, George Weasley, Ginny Weasley, Harry Potter, Dudley Dursley, Severus Snape, Alastor Moody, Draco Malfoy, Cedric Diggory, Rita Skeeter, Bartemius Crouch, Jr., Bill Weasley, Sirius Black, Petunia Dursley, Lord Voldemort, Percy Weasley, Fred Weasley, Viktor Krum, Albus Dumbledore, Vernon Dursley, Rubeus Hagrid, Minerva McGonagall, Cornelius Fudge, Ron Weasley, Bellatrix Lestrange, Bartemius Crouch, Charlie Weasley, Ludo Bagman, Fleur Delacour. The genres are Fiction, Adventure, Fantasy, Young Adult, Science Fiction Fantasy, Classics, Middle Grade, Magic, Childrens, Audiobook.             Summary: It is the summer holidays and soon Harry Potter will be starting his fourth year at Hogwarts

In [3]:
# Load the review data from disk.
# NOTE(review): `reviews` is not referenced by any later cell visible here — confirm it is still needed.
reviews = obj.loadReviews("data/reviews.json")

In [4]:
def display_summary(url, show_abstract=False, show_id=True, extra_text=''):
    """
    Render a short summary card for one book through IPython's rich display.

    The card contains the title as a bold link to `url`, a line with the
    publication date and the author list (truncated after 20 entries), and
    optionally the book's summary and a caller-supplied extra line.

    Parameters
    ----------
    url : str
        Key into the global `books` mapping; also used as the link target.
    show_abstract : bool, default False
        When true, append the book's "summary" field under an "Abstract" label.
    show_id : bool, default True
        Currently unused; kept so existing callers keep working.
    extra_text : str, default ''
        Extra HTML appended as its own line when non-empty.

    Raises
    ------
    KeyError
        If `url` is not in `books` or the record lacks a field read here.
    """
    max_authors = 20  # longer author lists are cut off with a trailing ', ...'
    book = books[url]

    # NOTE(review): title, authors and summary are interpolated into the HTML
    # without escaping, so any markup present in the data is rendered as-is.
    lines = [f'<strong><a href="{url}">{book["title"]}</a></strong>']

    author_list = book["author"]
    authors = ', '.join(author_list[:max_authors])
    if len(author_list) > max_authors:
        authors += ', ...'

    # publishDate is treated as epoch milliseconds and converted in the local
    # timezone. Only positive plain ints are accepted; anything else is "NA".
    date = "NA"
    publish_date = book["publishDate"]
    if type(publish_date) is int and publish_date > 0:
        try:
            date = datetime.datetime.fromtimestamp(publish_date / 1000).date().strftime('%d-%m-%Y')
        except (OverflowError, OSError, ValueError):
            # fromtimestamp() rejects values outside the platform's supported
            # range; show "NA" instead of aborting the whole render.
            date = "NA"

    lines.append(date + '. ' + authors)
    if show_abstract:
        lines.append(f'<small><strong>Abstract:</strong> <em>{book["summary"]}</em></small>')
    if extra_text != '':
        lines.append(extra_text)
    display(HTML('<br>'.join(lines)))

# Smoke test: render the card for one known book, including its summary.
display_summary('https://www.goodreads.com/book/show/6.Harry_Potter_and_the_Goblet_of_Fire', show_abstract=True)


In [9]:
def smarter_tokenize_and_preprocess(strings):
    """
    Split a text into tokens with NLTK's word_tokenize and stem each token.

    Returns the list of stems in token order, produced by a fresh
    EnglishStemmer. Tokens are not filtered: whatever the tokenizer emits
    is stemmed and returned.
    """
    stem = EnglishStemmer().stem
    stemmed_tokens = []
    for token in word_tokenize(strings):
        stemmed_tokens.append(stem(token))
    return stemmed_tokens

# Quick check of the tokenizer/stemmer on a sample sentence.
print(smarter_tokenize_and_preprocess('''Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\nThanks.'''))

['good', 'muffin', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'pleas', 'buy', 'me', 'two', 'of', 'them', '.', 'thank', '.']


### Create an Inverted Index

In [10]:
# Inverted index: stemmed term -> list of book ids (URLs) whose title or
# summary contains that term.
smarter_index = defaultdict(list)

# Every book id is indexed; narrow this list to index only a subset.
subset_of_ids = list(books.keys())

# Ids are visited in sorted order so each postings list comes out sorted,
# which the merge-based AND query relies on.
for book_id in sorted(subset_of_ids):
    book = books[book_id]
    # A set, so each book appears at most once per term.
    term_set = set(smarter_tokenize_and_preprocess(book["title"]))
    term_set.update(smarter_tokenize_and_preprocess(book["summary"]))
    for term in term_set:
        smarter_index[term].append(book_id)

In [11]:
# Smarter and_query based on the smarter tokenize and preprocess functions
def and_merge(sorted_list1, sorted_list2):
    """
    Intersect two sorted postings lists.

    Both inputs must be sorted in ascending order. Neither input is modified.
    The result is a new list with the items present in both, in ascending
    order.

    Uses a two-pointer scan, so the cost is linear in the combined length
    (repeated ``pop(0)`` would make it quadratic).
    """
    # Index directly into lists/tuples; materialise any other iterable once.
    list1 = sorted_list1 if isinstance(sorted_list1, (list, tuple)) else list(sorted_list1)
    list2 = sorted_list2 if isinstance(sorted_list2, (list, tuple)) else list(sorted_list2)

    merged_list = []
    i = j = 0
    len1, len2 = len(list1), len(list2)
    while i < len1 and j < len2:
        left, right = list1[i], list2[j]
        if left < right:
            i += 1
        elif left > right:
            j += 1
        else:
            merged_list.append(left)
            i += 1
            j += 1
    return merged_list

def smarter_and_query(query_string):
    """
    Return the ids of all books that contain every term of the query.

    The query is tokenized and stemmed with smarter_tokenize_and_preprocess,
    and the postings lists of the resulting terms are intersected with
    and_merge.

    Returns a new list (never a reference into the index). The list is empty
    when the query yields no tokens or any term is missing from the index.
    """
    query_words = smarter_tokenize_and_preprocess(query_string)
    if not query_words:
        # Nothing to search for; previously this raised IndexError.
        return []

    # .get() rather than [] so unknown terms do not insert empty postings
    # lists into the defaultdict index; list() so callers get a copy.
    and_list = list(smarter_index.get(query_words[0], ()))
    for term in query_words[1:]:
        if not and_list:
            # The intersection is already empty; further merges cannot change that.
            break
        and_list = and_merge(and_list, smarter_index.get(term, ()))
    return and_list


In [30]:
# Boolean AND search: show up to five books containing every query term.
text = 'Heart of Istanbul'
postings = smarter_and_query(text)
for book_url in postings[:5]:  # `book_url` rather than `id`, which shadows the builtin
    display_summary(book_url, show_abstract=True)

Another important method to improve our search results is to rank them, which can be done by calculating a score for each document based on the matching terms from the query. One such scoring method is *tf-idf*, which comes with several variants, as explained in the lecture slides.

In order to quickly calculate the scores for a term/document combination, we'll need quick access to a couple of things:

- tf(t,d): How often does a term occur in a document
- df(t): In how many documents does a term occur
- num_documents: The number of documents in our index

In [14]:
# Per-document term counts: tf_matrix[doc_id][term] -> number of occurrences
# of the stemmed term in the book's title plus summary.
tf_matrix = defaultdict(Counter)

for doc_id in books.keys():
    title_tokens = smarter_tokenize_and_preprocess(books[doc_id]["title"])
    summary_tokens = smarter_tokenize_and_preprocess(books[doc_id]["summary"])
    tf_matrix[doc_id] = Counter(title_tokens + summary_tokens)

def tf(t, d):
    """
    Term frequency: how often term `t` occurs in document `d`, as a float.

    Returns 0.0 when the document or the term is unknown. The lookup never
    adds entries to `tf_matrix`.
    """
    doc_counts = tf_matrix.get(d)
    if doc_counts is None:
        return 0.0
    # Counter returns 0 for a missing term without inserting it.
    return float(doc_counts[t])

def df(t):
    """
    Document frequency: the number of indexed documents containing term `t`,
    as a float (0.0 for an unknown term).

    Uses .get() so that asking about an unknown term does not insert an empty
    postings list into the defaultdict index.
    """
    return float(len(smarter_index.get(t, ())))

# Total number of books, stored as a float; used as the corpus size in idf().
num_documents = float(len(books))

In [15]:
# Sanity check of the three tf-idf building blocks on one term/document pair.
print(tf('ghana', "https://www.goodreads.com/book/show/25322449-radio-silence"))
print(df('ghana'))
print(num_documents)

0.0
2.0
10000.0


In [16]:
def idf(t):
    """
    Inverse document frequency of term `t` with add-one smoothing:
    log10((num_documents + 1) / (df(t) + 1)).
    """
    corpus_size_plus_one = num_documents + 1
    doc_freq_plus_one = df(t) + 1
    return log10(corpus_size_plus_one / doc_freq_plus_one)

def tfidf(t, d):
    """Return the tf-idf weight of term `t` in document `d`: tf(t, d) * idf(t)."""
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [31]:
def query_ntn_nn(query_string):
    """
    Rank the books that contain every query term by their summed tf-idf.

    The candidate set comes from smarter_and_query (boolean AND); the
    candidates are then scored with score_ntn_nnn.

    Returns a list of (doc_id, score) pairs sorted by descending score. The
    list is empty when the query yields no tokens.
    """
    query_list = smarter_tokenize_and_preprocess(query_string)
    if not query_list:
        # An empty query has no candidates; avoid indexing into an empty
        # token list further down the call chain.
        return []
    docs = smarter_and_query(query_string)
    return score_ntn_nnn(query_list, docs)

def score_ntn_nnn(t, d):
    """
    Score each document in `d` by the sum of tfidf(term, doc) over the query
    terms in `t` (no length normalisation).

    Returns a list of (doc_id, score) pairs ordered by descending score.
    """
    totals = defaultdict(float)
    for query_term in t:
        for doc_id in d:
            totals[doc_id] = totals[doc_id] + tfidf(query_term, doc_id)
    ranked = list(totals.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [33]:
# Rank the AND-matches by summed tf-idf and show the top K with their scores.
results = query_ntn_nn("Heart of Istanbul")
K = 5
for book_url, score in results[:K]:  # `book_url` rather than `id`, which shadows the builtin
    display_summary(book_url, show_abstract=True)
    print(f'Score: {score}')

Score: 19.63915705325775


Score: 4.618603945802349


In [35]:
# Euclidean length of every document's tf-idf vector, computed once up front
# so that query-time normalisation is a dictionary lookup.
# The values are floats, so the default for a missing key is float as well.
tfidf_length_values = defaultdict(float)

for doc_id in books.keys():
    squared_sum = 0.0
    for term in tf_matrix[doc_id].keys():
        squared_sum += tfidf(term, doc_id) ** 2
    tfidf_length_values[doc_id] = sqrt(squared_sum)

def tfidf_length(d):
    """Return the precomputed tf-idf vector length stored for document `d`."""
    length = tfidf_length_values[d]
    return length

In [36]:
def query_ntc_ntc(query_string):
    """
    Tokenize and stem the query, then rank all books with score_ntc_ntc.

    Returns whatever score_ntc_ntc returns: (doc_id, score) pairs sorted by
    descending score. Showing the top K is left to the caller.
    """
    return score_ntc_ntc(smarter_tokenize_and_preprocess(query_string))

def score_ntc_ntc(terms):
    """
    Score every book by the summed tf-idf of the query terms, divided by the
    length of the book's tf-idf vector.

    Books whose vector length is zero are left out of the result. The query
    side is not normalised here.

    Returns a list of (doc_id, score) pairs sorted by descending score.
    """
    # idf depends only on the term, so compute it once per query term instead
    # of once per (term, document) pair.
    term_idfs = [(term, idf(term)) for term in terms]

    scores = {}
    for doc_id in books.keys():
        length = tfidf_length(doc_id)
        if length == 0:
            # A zero-length vector cannot be normalised; skip the document.
            continue
        score = 0
        for term, term_idf in term_idfs:
            score += tf(term, doc_id) * term_idf
        scores[doc_id] = score / length

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [37]:
# Rank all books with the length-normalised tf-idf score and show the top K.
results = query_ntc_ntc("happiness of young adults")
K = 5
for book_url, score in results[:K]:  # `book_url` rather than `id`, which shadows the builtin
    display_summary(book_url, show_abstract=True)
    print(f'Score: {score}')

Score: 0.6712128742651122


Score: 0.32138166651723676


Score: 0.31369733479331924


Score: 0.28224058651428807


Score: 0.2753803916681648


In [41]:
embeddings = Embeddings()

# Load the saved model
embeddings.load("models")

# Semantic search: the five closest entries for the query.
res = embeddings.search("Heart of Istanbul", 5)

# Each hit is read as (id, similarity); keep both as parallel lists.
index = [hit[0] for hit in res]
similarities = [hit[1] for hit in res]

# NOTE(review): assumes the ids returned by search() are integer positions in
# the iteration order of `books` — confirm against how the model was built.
book_urls = list(books.keys())  # built once instead of once per hit

for position, similarity in zip(index, similarities):
    url = book_urls[position]
    display_summary(url, show_abstract=True)
    print(f'Similarity Score: {similarity}')

Similarity Score: 0.4954030513763428


Similarity Score: 0.42500853538513184


Similarity Score: 0.416370153427124


Similarity Score: 0.3933972716331482


Similarity Score: 0.39217865467071533
