## 0.1. Import Packages

In [1]:
# Run cell
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import csv
import re
import numpy as np
import pandas as pd
import json
from scipy import spatial
import spacy
import collections
import heapq
from IPython.display import HTML
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
import webbrowser
from langdetect import detect
from multiprocessing import Pool
#import multi_processing_functions
import dask.dataframe as ddf

In [2]:
# Run cell
from __future__ import print_function

In [3]:
# Run cell
from dask.distributed import Client
client = Client()
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:61152  Dashboard: http://127.0.0.1:61155/status,Cluster  Workers: 4  Cores: 4  Memory: 8.50 GB


## 0.2. Useful Functions (read, write, etc.)

We will define a few basic functions that will be used throughout the whole notebook

In [4]:
# Run cell
def write_file(filename, content):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding='utf-8' ) as f:
        f.write(str(content))

In [5]:
# Run cell
def write_tsv(filename, content):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wt", newline='', encoding='utf-8' ) as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(content)

In [6]:
# Run cell
def read_tsv(filename, type_='utf-8'):
    # cp850
    with open(filename, encoding = type_) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for data in reader:
            return data

In [7]:
# Run cell
def write_json(file_name, content):
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'w') as outfile:
        json.dump(content, outfile, sort_keys=True, indent=4)

In [8]:
# Run cell
def jsonKeys2int(x):
    if isinstance(x, dict):
            return {int(k):v for k,v in x.items()}
    return x

In [9]:
# Run cell
def read_json(file_name):
    with open(file_name) as json_file:
        data_dict = json.load(json_file, object_hook=jsonKeys2int)
        return data_dict

In [10]:
# Run cell
def read_json_simple(file_name):
    with open(file_name) as json_file:
        data_dict = json.load(json_file)
        return data_dict

In [11]:
# Run cell
def tsv_files_to_df(path, destiny='data/tsv_files/tsv_files.tsv'):
    """  
    Convert all tsv files in the given path into a dataframe, which will also be saved as a .tsv file
    """
    tsv_files = os.listdir(path)
    list_tsv = []
    for filename in tsv_files:
        d_id = int(re.findall(r'\d+', filename)[0])
        filename = path + filename
        book_data = read_tsv(filename)
        book_data.insert(0,d_id)
        list_tsv.append(book_data)
    df = pd.DataFrame(list_tsv)
        
    df.to_csv(destiny, sep='\t')
    return df

## 1. Data collection

### 1.1. Get the list of books

In [13]:
# Initalize web browser for crawling
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

[WDM] - Getting latest mozilla release info for v0.28.0
[WDM] - Trying to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-win64.zip
[WDM] - Driver has been saved in cache [C:\Users\ADMIN\.wdm\drivers\geckodriver\win64\v0.28.0]


In [14]:
def crawl_urls(href):
    """
    Given a specific url (href) crawl through the different list items and return the page url
    """
    driver.get(href)
    time.sleep(5)
    
    page_soup = BeautifulSoup(driver.page_source, features="lxml")
    links = page_soup.find_all('a',{'class': 'bookTitle'}, itemprop="url")
    
    urls = []
    # Loop over all links in the href page
    for link in links:
        url = link.get('href')
        url = 'https://www.goodreads.com' + url
        urls.append(url)
    
    urls = '\n'.join(urls)+'\n'
    
    return urls

Apply previous function over all books that we are interested in downloading

In [None]:
url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever?page='
path = 'data/book_urls.txt'

urls = ''

for i in range(0,30000):
    href = url+str(i+1)
    urls += crawl_urls(href)

write_file(path, urls)

### 1.2. Crawl books

Given a set of links (generated by the previous function) download their html individually

In [None]:
def scrap_book(href):
    driver.get(href)       
    time.sleep(5)
    return driver.page_source

In [None]:
filename = 'data/book_urls.txt'

book_urls = open(filename, 'r')
for url in book_urls:
    page_number = int((count-1)/100)+1
    html = scrap_book(url)
    path = 'data/page_'+str(page_number)+'/article_'+str(count)+'.html'
    write_file(path, html)

### 1.3 Parse downloaded pages

Given all the downloaded htmls, parse them individually

In [15]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [16]:
def parse_html_in_folder(path):
    for html_file in os.listdir(path):
        print(html_file)
        if os.path.exists('data/final_tsv_files/' + re.findall(r'\d+', html_file)[0] + '.tsv'):
            print('not parsed')
            continue
        else:
            with open(path + '/' + html_file, encoding='utf8') as infile:
                print('parsing...')
                soup = BeautifulSoup(infile, features="lxml")
                try:
                    Plot = ' '.join([remove_html_tags(str(c)) for c in soup.find_all('div', id="description")[0].contents[3].contents ])
                except Exception:
                    if not soup.find_all('div', id="description"):
                        Plot = ''
                    else:
                        Plot = ' '.join([remove_html_tags(str(c)) for c in soup.find_all('div', id="description")[0].contents[1].contents ])
                if Plot:
                    if detect(Plot) != 'en':
                        print('Article removed:', html_file)
                        continue
                try:
                    bookTitle = soup.find_all('h1')[0].contents[0].replace('\n', '').strip()
                except:
                    print('Wrong html file')
                    continue
                bookSeries = soup.find_all('h2', id='bookSeries')[0].text.replace('\n', '').strip()
                bookAuthors = ', '.join([soup.find_all('span', itemprop='name')[i].contents[0] for i in range(
                    len(soup.find_all('span', itemprop='name')))])
                ratingValue = soup.find_all('span', itemprop='ratingValue')[0].contents[0].replace('\n', '').strip()
                ratingCount = soup.find_all('meta', itemprop="ratingCount")[0]['content']
                reviewCount = soup.find_all('meta', itemprop="reviewCount")[0]['content']
                try:
                    NumberofPages = re.findall(r'\d+', soup.find_all('span', itemprop="numberOfPages")[0].contents[0])[0]
                except:
                    if not soup.find_all('span', itemprop="bookFormat"):
                        NumberofPages = ''
                    else:
                        NumberofPages = 0
                try:
                    temp_date = soup.find_all('div', id='details')[0].find_all('div', {"class": "row"})[1].text.split('\n')[
                        2].split()
                    if not temp_date:
                        temp_date = soup.find_all('div', id='details')[0].find_all('nobr', {"class": "greyText"})[0].contents[0].split('\n')[1].split()[-3:]
                except:
                    try:
                        temp_date = soup.find_all('div', id='details')[0].find_all('div', {"class": "row"})[0].contents[0].split('\n')[
                            2].split()
                    except:
                        temp_date = ''
                PublishingDate = ' '.join(temp_date)
                characters = []
                settings = []
                for i in range(1, len(soup.find_all('div', id="bookDataBox")[0].find_all('a'))):
                    if re.match(r'/characters/', soup.find_all('div', id="bookDataBox")[0].find_all('a')[i].attrs['href']):
                        characters.append(soup.find_all('div', id="bookDataBox")[0].find_all('a')[i].text)
                    elif re.match(r'/places/', soup.find_all('div', id="bookDataBox")[0].find_all('a')[i].attrs['href']):
                        settings.append(soup.find_all('div', id="bookDataBox")[0].find_all('a')[i].text)
                characters = ', '.join(characters)
                settings = ', '.join(settings)
                url = soup.find_all('link', rel='canonical')[0].attrs['href']

                final_list = [bookTitle, bookSeries, bookAuthors, ratingValue, ratingCount, reviewCount,
                              Plot, NumberofPages, PublishingDate, characters, settings, url]

                filename = 'data/final_tsv_files/' + re.findall(r'\d+', html_file)[0] + '.tsv'

                write_tsv(filename, final_list)


Apply function over all folders containing the downloaded html files

In [None]:
# None parallel approach
for i in range(0,301):
    print(i)
    parse_html_in_folder('../data_html/page_' + str(i))

In [None]:
# Parallel approach
if __name__ == '__main__':
    with Pool(8) as p:
        print(p.map(multi_processing_functions.parse_html_in_folder, 
                    ['../data_html/' + i for i in os.listdir('../data_html')]))

In [None]:
tsv_files_to_df(path='data/final_tsv_files/', destiny='data/tsv_files/final_tsv_files.tsv')

In [225]:
df_dirty.to_csv('data/tsv_files/final_tsv_files.tsv', sep='\t', index=False)

## 2. Search Engine

### 2.0. Pre-process of information

In [23]:
# Run cell
stopwords = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r"\w+")
nlp = spacy.load("en_core_web_sm")

In [24]:
# Run cell
def remove_stop_words(text):
    """
    This allow us to identify stop word in english and remove them. We are also removing character with single length (e.g. "s")
    """    
    word_tokens = word_tokenize(text)
    filtered_sentence = [w.lower() for w in word_tokens if w.lower() not in stopwords and not(len(w) == 1 and w.isalpha())]

    text = ' '.join(filtered_sentence)
    return text

In [25]:
# Run cell
def remove_punctuation(text): 
    """
    Remove puntuation from input string
    """
    text = tokenizer.tokenize(text)
    clean_punctuation = ' '.join(text)
    return clean_punctuation

In [26]:
# Run cell
def remove_stemming(text):
    """
    Apply stemming procedure over input text
    """
    ps = PorterStemmer()
    words = word_tokenize(text)
    stem_sentence=[]
    
    for w in words:
        stem_sentence.append(ps.stem(w))

    text = " ".join(stem_sentence)
    return text 

In [27]:
# Run cell
def remove_lemma(text):
    """
    Apply lemmanization procedure over input text
    """
    doc = nlp(text)
    lemma = []
    for token in doc:
        lemma.append(token.lemma_)
    text = ' '.join(lemma)
    return text

In [28]:
# Run cell
def parse_pulishing_date(publishingDate):
    """
    Only keep last 4 digits of publishing Date (Year of publication)
    """
    return publishingDate[-4:]

In [29]:
# Run cell
def global_pre_process(text):
    """ 
    Function to process everything at once 
    """
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = remove_lemma(text)
    # This makes sure that we also remove strange letters that have not been removed with the previous packages 
    # (e.g. arabic letters)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text).strip()
    return text

In [30]:
# Run cell
def clean_text(df):
    """
    Take in a Dataframe, and clean based on the previously defined functions (each column is cleaned individually)
    """
    df['BookTitle'] = df.BookTitle.map(global_pre_process)
    df['BookSeries'] = df.BookSeries.map(global_pre_process)
    df['BookAuthors'] = df.BookAuthors.map(global_pre_process)
    df['Plot'] = df.Plot.map(global_pre_process)
    df['Characters'] = df.Characters.map(global_pre_process)
    df['PublishingDate'] = df.PublishingDate.map(parse_pulishing_date)
    return df

Read data frame which still has not been pre-processed

In [12]:
# Run cell
df_dirty = pd.read_csv('data/tsv_files/final_tsv_files.tsv', sep='\t', keep_default_na=False)

In [73]:
df_dirty.head(5)

Unnamed: 0,BookID,BookTitle,BookSeries,BookAuthors,RatingValue,RatingCount,ReviewCount,Plot,NumberofPages,PublishingDate,Characters,Settings,Url
0,1,The Hunger Games,(The Hunger Games #1),Suzanne Collins,4.33,6409198,172562,"Could you survive on your own in the wild, wit...",374,September 14th 2008,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...","District 12, Panem, Capitol, Panem, Panem",https://www.goodreads.com/book/show/2767052-th...
1,10,The Fault in Our Stars,,John Green,4.2,3572895,155821,Despite the tumor-shrinking medical miracle th...,313,January 10th 2012,"Augustus Waters, Isaac","Indianapolis, Indiana, Amsterdam",https://www.goodreads.com/book/show/11870085-t...
2,100,A Prayer for Owen Meany,,John Irving,4.23,286642,13845,"Eleven-year-old Owen Meany, playing in a Littl...",637,1990,John Wheelwright,"Gravesend, New Hampshire, Toronto, Ontario",https://www.goodreads.com/book/show/4473.A_Pra...
3,1000,Helter Skelter: The True Story of the Manson M...,,"Vincent Bugliosi, Curt Gentry",4.04,126139,4019,"Prosecuting attorney in the Manson trial, Vinc...",689,December 17th 2001,,,https://www.goodreads.com/book/show/105992.Hel...
4,10000,"Henry and June: From ""A Journal of Love"": The ...","(From ""A Journal of Love"" #1)",Anaïs Nin,3.89,10581,624,"Taken from the original, uncensored journals o...",304,October 29th 1990,"Henry Miller, Anaïs Nin",Paris,https://www.goodreads.com/book/show/11038.Henr...


Apply clean_text function over the "dirty" dataframe using dask to optimize running time

In [74]:
dask_dataframe = ddf.from_pandas(df_dirty, npartitions=20)

In [75]:
%%time
df_clean = dask_dataframe.map_partitions(clean_text, meta=df_dirty).compute()

Wall time: 9min 30s


In [76]:
df_clean

Unnamed: 0,BookID,BookTitle,BookSeries,BookAuthors,RatingValue,RatingCount,ReviewCount,Plot,NumberofPages,PublishingDate,Characters,Settings,Url
0,1,hunger game,hunger game 1,suzanne collins,4.33,6409198,172562,could survive wild every one make sure live se...,374,2008,katniss everdeen peeta mellark cato hunger gam...,"District 12, Panem, Capitol, Panem, Panem",https://www.goodreads.com/book/show/2767052-th...
1,10,fault star,,john green,4.20,3572895,155821,despite tumor shrink medical miracle buy year ...,313,2012,augustus water isaac,"Indianapolis, Indiana, Amsterdam",https://www.goodreads.com/book/show/11870085-t...
2,100,prayer owen meany,,john irving,4.23,286642,13845,eleven year old owen meany play little league ...,637,1990,john wheelwright,"Gravesend, New Hampshire, Toronto, Ontario",https://www.goodreads.com/book/show/4473.A_Pra...
3,1000,helter skelter true story manson murder,,vincent bugliosi curt gentry,4.04,126139,4019,prosecute attorney manson trial vincent buglio...,689,2001,,,https://www.goodreads.com/book/show/105992.Hel...
4,10000,henry june journal love unexpurgate diary ana ...,journal love 1,ana s nin,3.89,10581,624,take original uncensored journal ana s nin hen...,304,1990,henry miller ana s nin,Paris,https://www.goodreads.com/book/show/11038.Henr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27146,9993,catch true story real fake,,frank abagnale stan redding,4.05,51450,2478,stole every nickel blow fine thread luxurious ...,224,2003,sean riley,,https://www.goodreads.com/book/show/138269.Cat...
27147,9995,rake,lesson love 1,suzanne enoch,3.86,7694,369,three determined young lady vow give three lon...,375,2002,greydon brakenridge duke wycliffe georgina hal...,,https://www.goodreads.com/book/show/823583.The...
27148,9996,manfred,,lord byron,3.81,1856,109,manfred contain supernatural element keep popu...,84,2009,abbot st maurice manfre manfred herman manfre ...,,https://www.goodreads.com/book/show/3730956-ma...
27149,9997,world representation vol 1,world representation 1,arthur schopenhauer judith norman payne alista...,4.19,8415,192,arthur schopenhauer die welt als wille und vor...,534,1966,,,https://www.goodreads.com/book/show/19506.The_...


In [79]:
df_clean.to_csv('data/clean_tsv_files/clean_final.tsv', sep='\t', index=False)

### 2.1. Conjunctive query

In [13]:
# Run cell
# Read Clean tsv file
df_clean = pd.read_csv('data/clean_tsv_files/clean_final.tsv', sep='\t', keep_default_na=False)

#### 2.1.1. Create your index!

In [81]:
def get_vocabulary_inverted_index(df, columns):
    """
    This function returns a dictionary with all the words in the dataframe (and specifically the provided columns) 
    and its inverted index
    Example:
    vocabulary_dict = {'river': 1, 'game': 2, 'friend': 3, ...}
    inverted_index = {1: [1, 4, 7], 2: [3, 6, 9], 3: [2, 7, 8]} where the list contains the documents in which 
    the word 1 (river) appears in
    
    df: Clean Dataframe
    columns: Columns over which the vocabulary and inverted index dictionaries will be generated
    """
    vocabulary = {}
    count = 1
    inverted_index = {}
    for index, row in df.iterrows():
        d_id = row['BookID']
        if isinstance(columns, list):
            text = (' '.join([row[i] for i in columns])).split(' ')
        else:
            raise('Column must be a list')
            
        for word in text:
            if word not in vocabulary: 
                vocabulary[word] = count
                inverted_index[count] = [d_id]
                count +=1
            else:
                key = vocabulary[word]
                if d_id not in inverted_index[key]:
                     inverted_index[key].append(d_id)
    return vocabulary, inverted_index

In [82]:
%%time
vocabulary, inverted_index = get_vocabulary_inverted_index(df_clean, columns=['Plot'])

Wall time: 37.2 s


Save dictionaries as json files for future use

In [83]:
write_json('data/inverted_index.json', inverted_index)
write_json('data/vocabulary_dict.json', vocabulary)

#### 2.1.2. Execute the query

Functionality to get intersection of documents in which query appears

In [14]:
# Run cell
def get_pointer_values(pointer, index_list):
    """ Based on a set of pointer values get the documents """
    values = []
    for i in range(len(pointer)):
        values.append(index_list[i][pointer[i]])
    return values

In [15]:
# Run cell
def update_pointer(values, pointer):
    """ Given the values, compute the minimum and update the pointer accordingly based on their minimum """
    mins = np.where(values == np.min(values))[0]
    for i in range(0, len(mins)):
        pointer[mins[i]] = pointer[mins[i]] + 1 
    return pointer

In [16]:
# Run cell
def query_function(query, index, vocabulary):
    """ 
    Given a query find the documents in which these appear based on the index 
    query: query string
    index: inverted index as dictionary
    vocabulary: vocabulary dictionary
    
    """
    
    # Pre-process query 
    query = global_pre_process(query)
    
    # Query to list of strings
    query_list = query.split()
    
    # Map strings to integer based on dict
    try:
        integer_list = [vocabulary[i] for i in query_list]
    except:
        return []
    
    # Start to look for the intersection of the query in the index
    total_query_documents = [sorted(index[i]) for i in integer_list]
    
    # Generate a list with the pointer values
    pointers = np.full(len(total_query_documents), 0)
    values = np.full(len(total_query_documents), 0)
    
    # List where intersection documents will be stored
    intersection = []

    # Compute the document in which the search should stop
    max_list = np.array([max(total_query_documents[i]) for i in range(len(total_query_documents))])

    try:
        # Loop over all elements stopping at the minimum between all documents
        while np.any(values != max_list):
            # Get the documents based on the pointer
            values = get_pointer_values(pointer = pointers, 
                                        index_list = total_query_documents)
            # If all values are equal we have found a match and all the pointer values are increased by one
            if len(set(values)) == 1:
                intersection.append(values[0])
                pointers += 1
            # If all values are not equal increase the values of the minimum pointers
            else:
                pointers = update_pointer(values, pointers)
    except:
        intersection = sorted(list(set.intersection(*map(set,total_query_documents))))
    
    assert intersection == sorted(list(set.intersection(*map(set,total_query_documents)))), 'Algorithm is not returning same result as python implementation'
    
    return intersection

In [17]:
# Run cell
def path_to_image_html(path):
    return '<img src="'+ path + '" style=max-height:124px;"/>'

In [18]:
# Run cell
def show_results(book_ids, df):
    """
    Get relevant information which will be shown in the final dataframe for the books in book_ids
    df: This dataframe should not be pre-processed
    book_ids: list of books
    """
    output = df[df['BookID'].isin(book_ids)][['BookTitle', 'Plot', 'Url']]
    return output

In [19]:
# Run cell
def search_engine_1(query, inverted_index, vocabulary, df):
    """
    Basic search engine which returns all books with the provided query
    query: Query of user
    df: This dataframe should not be pre-processed
    """
    query_results = query_function(query, inverted_index, vocabulary)
    if len(query_results) == 0:
        print('There are no results for the search')
    else:
        output = show_results(query_results, df)
        output = HTML(output.to_html(escape=False,
                                     formatters=dict(column_name_with_image_links=path_to_image_html)))
        return output

##### Read json files (vocabulary and inverted index)

In [20]:
# Run cell
inverted_index = read_json('data/inverted_index.json')
vocabulary = read_json_simple('data/vocabulary_dict.json')

#### Run First Search Engine

In [21]:
# Run cell
input_query = 'heart break'

In [31]:
# Run cell
search = search_engine_1(input_query, inverted_index, vocabulary, df_dirty)
search

Unnamed: 0,BookTitle,Plot,Url
15,The Meursault Investigation,"He was the brother of “the Arab” killed by the infamous Meursault, the antihero of Camus’s classic novel. Seventy years after that event, Harun, who has lived since childhood in the shadow of his sibling’s memory, refuses to let him remain anonymous: he gives his brother a story and a name—Musa—and describes the events that led to Musa’s casual murder on a dazzlingly sunny beach. In a bar in Oran, night after night, he ruminates on his solitude, on his broken heart, on his anger with men desperate for a god, and on his disarray when faced with a country that has so disappointed him. A stranger among his own people, he wants to be granted, finally, the right to die. The Stranger is of course central to Daoud’s story, in which he both endorses and criticizes one of the most famous novels in the world. A worthy complement to its great predecessor, The Meursault Investigation is not only a profound meditation on Arab identity and the disastrous effects of colonialism in Algeria, but also a stunning work of literature in its own right, told in a unique and affecting voice.",https://www.goodreads.com/book/show/25263557-the-meursault-investigation
112,Rules of Attraction,"Carlos Fuentes doesn't want any part of the life his older brother, Alex, has laid out for him in Boulder, Colorado. He wants to keep living on the edge, and carve his own path-just like Alex did. Unfortunately, his ties to a Mexican gang aren't easy to break, and he soon finds himself being set up by a drug lord. When Alex arranges for Carlos to live with his former professor and his family to keep him from being sent to jail, Carlos feels completely out of place. He's even more thrown by his strong feelings for the professor's daughter, Kiara, who is nothing like the girls he's usually drawn to. But Carlos and Kiara soon discover that in matters of the heart, the rules of attraction overpower the social differences that conspire to keep them apart. As the danger grows for Carlos, he's shocked to discover that it's this seemingly All-American family who can save him. But is he willing to endanger their safety for a chance at the kind of life he's never even dreamed possible?",https://www.goodreads.com/book/show/7137775-rules-of-attraction
229,Black Rose,"A Harper has always lived at Harper House, the centuries-old mansion just outside of Memphis. And for as long as anyone alive remembers, the ghostly Harper Bride has walked the halls, singing lullabies at night... At forty-seven, Rosalind Harper is a woman whose experiences have made her strong enough to bend without breaking--and weather any storm. A widow with three grown sons, she survived a disastrous second marriage and built her In The Garden nursery from the ground up. Through the years, In The Garden has become more than just a thriving business--it is a symbol of hope and independence to Roz, and to the two women she shares it with. Newlywed Stella and new mother Hayley are the sisters of her heart, and together the three of them are the future of In The Garden. But now the future is under attack, and Roz knows they can't fight this battle alone. Hired to investigate Roz's Harper ancestors, Dr. Mitchell Carnegie finds himself just as intrigued with Roz herself. And as they being to unravel the puzzle of the Harper Bride's identity, Roz is shocked to find herself falling for the fascinating genealogist. Now it is a desperate race to discover the truth before the unpredictable apparition lashes out at the one woman who can help her rest in peace...",https://www.goodreads.com/book/show/86321.Black_Rose
254,Whatever Life Throws at You,"Life loves a good curveball… Seventeen-year-old Annie Lucas's life is completely upended the moment her dad returns to the major leagues as the new pitching coach for the Kansas City Royals. Now she's living in Missouri (too cold) , attending an all-girls school (no boys) , and navigating the strange world of professional sports. But Annie has dreams of her own—most of which involve placing first at every track meet…and one starring the Royals' super-hot rookie pitcher. But nineteen-year-old Jason Brody is completely, utterly, and totally off-limits. Besides, her dad would kill them both several times over. Not to mention Brody has something of a past, and his fan club is filled with C-cupped models, not smart-mouthed high school “brats” who can run the pants off every player on the team. Annie has enough on her plate without taking their friendship to the next level. The last thing she should be doing is falling in love. But baseball isn't just a game. It's life. And sometimes, it can break your heart…",https://www.goodreads.com/book/show/20757528-whatever-life-throws-at-you
427,Fear of Falling,"I can’t remember the last time I felt completely safe. Security seemed more like a luxury to me, reserved for those who were fortunate enough to have picture perfect childhoods. For those who didn’t bear the ugly scars that keep me bound in constant, debilitating fear. I’ve run from that fear my entire life. But when I met him, for once, I couldn’t run anymore. He scared the hell out of me in a way that excited every fiber of my being. It wasn’t the tattoos or the piercings. It wasn’t the warmth that seemed to radiate from his frame and blanket me whenever he was near. It was just…him. The scary beautiful man that threatened to alter 23 years of routine and rituals, and make me face my crippling fear. My name is Kami and I am constantly afraid. And the thing that scares me the most is the very thing I want. “Don’t worry,” he smiled, pulling me into the hard warmth of his chest. “I’ve got you. I’ll always catch you when you fall.”And just like that, Blaine had staked his claim on the untouched part of me that no living soul had ever moved. He had captured every fear, every reservation, and crushed them in the palm of his inked hand. Author Disclaimer: Abuse is real- verbal, emotional, physical and sexual. It takes place all around us; it doesn’t discriminate against race or gender, wealth or poverty. It affects us all- those of us who’ve had to live through it, or watch it happen, even those of us who’ve only heard about it. We are all affected. We are all forever changed. This is not a story for the faint of heart; this is the story of one woman’s very real struggle through a world against her, the people who hurt her, her real life demons and the people who showed her that every gray sky, no matter how dark, has a sun waiting to break through. ***Inspired by true events***",https://www.goodreads.com/book/show/17254226-fear-of-falling
468,It Gets Worse: A Collection of Essays,"THE INSTANT #1 NEW YORK TIMES BESTSELLER New York Times bestselling author, director, actor, and YouTube superstar Shane Dawson returns with another highly entertaining and uproariously funny essay collection, chronicling a mix of real life moments both extraordinary and mortifying, yet always full of heart. Shane Dawson shared some of his best and worst experiences in I Hate Myselfie, the critically acclaimed book that secured his place as a gifted humorist and keen observer of millennial culture. Fans felt as though they knew him after devouring the New York Times , Publishers Weekly, Los Angeles Times , and Wall Street Journal bestseller. They were right… almost. In this new collection of original personal essays, Shane goes even deeper, sharing never-before-revealed stories from his life, giving readers a no-holds-barred look at moments both bizarre and relatable, from cult-like Christian after-school activities, dressing in drag, and losing his virginity, to hiring a psychic, clashes with celebrities, and coming to terms with his bisexuality. Every step of the way, Shane maintains his signature brand of humor, proving that even the toughest breaks can be funny when you learn to laugh at yourself. This is Let's Pretend This Never Happened and Running With Scissors for the millennial generation: an inspiring, intelligent, and brutally honest collection of true stories by a YouTube sensation-turned one of the freshest new voices out there.",https://www.goodreads.com/book/show/27274328-it-gets-worse
531,Genius Squad,"Now that the Axis Institute for World Domination has been blown up; the founder, Dr. Phineas Darkkon, has died; and Prosper English (who enrolled Cadel in the first place) is in jail for myriad offenses, Cadel Piggott has round-the-clock surveillance so he'll be safe until he testifies against Prosper English. But nobody seems to want Cadel. Not Fiona, his social worker; not Saul Greeniaus, the detective assigned to protect him. When he is approached by the head of Genius Squad--a group formed to investigate GenoME, one of Darkkon's pet projects--Cadel is dubious Genius Squad can offer him a real home and all the technology his heart desires. But why can't he bring himself to tell Saul what the group is really up to? And how can Genius Squad protect Cadel once Prosper English breaks out of jail?",https://www.goodreads.com/book/show/2175645.Genius_Squad
598,Promised,"Livy notices him the moment he walks into the coffee shop. He's heart-stoppingly stunning, with a blue-eyed gaze so piercing she's almost too distracted to take his order. When he walks out the door, she thinks she'll never see him again. Then she finds the note he left on his napkin . . . signed M . All he wants is one night to worship her. No feelings, no commitment, nothing but pleasure. Every defense mechanism Livy has adopted during her solitary life is at risk of being obliterated by this confounding man. He's obnoxious but well-mannered. He's a gentleman but aloof. He's passionate but emotionless. Yet the fascination is so powerful, Livy can't deny him... or herself. M awakens something in Livy, something deep and addictive that she never knew existed -- and that she fears only he can satisfy. But she senses that behind the fast cars, fancy suits, and posh apartment, he's aching inside. To have him, body and soul, she'll have to brave his dark secrets. Delving into his world and breaking down his defenses become her obsession - an obsession that could shatter her heart beyond repair.. Releases 7/17 in UK and 8/5 in US.",https://www.goodreads.com/book/show/21795430-promised
634,Acts of Faith,"They met as children, innocents from two different worlds. And from that moment their lives were fated to be forever entwined. Timothy : Abandoned at birth, he finds a home--and a dazzling career--within the Catholic Church. But the vows he takes cannot protect him from one soul-igniting passion. Daniel : The scholarly son of a great rabbi, he is destined to follow in his father's footsteps. And destined to break his father's heart. Deborah : She was raised to be docile and dutiful--the perfect rabbi's wife--but love will lead her to rebellion. And into world's the patriarch would never dare imagine. Reaching across more than a quarter of a century, from the tough streets of Brooklyn to ultramodern Brasilia to an Israeli kibbutz, and radiating the splendor of two holy cities, Rome and Jerusalem, here is Erich Segal's most provocative and ambitious novel to date--the unforgettable story of three extraordinary lives...and one forbidden love.",https://www.goodreads.com/book/show/27594.Acts_of_Faith
646,Hardpressed,"In HARDPRESSED, the highly anticipated second book of the Hacker Series that began with Hardwired , Blake and Erica face threats that put both their love and their lives on the line. Despite Blake Landon's controlling ways, the young and wealthy hacker finally won the trust of the woman he loves. Internet entrepreneur Erica Hathaway broke down the walls that kept her from opening her heart and her business to Blake. Ready to start this new chapter in her life, Erica is determined not to let anything come between them, even if that means giving Blake back some of the control he craves in and out of the bedroom. But when demons from her past threaten their future, Erica makes a decision that could change their lives forever.",https://www.goodreads.com/book/show/25312405-hardpressed


### 2.2. Conjunctive query & Ranking score

#### 2.2.1. Inverted index

In [93]:
def vectorize_tfidf(df, vocabulary, inverted_index, json_name='tfidf.json', columns=['Plot']):
    '''
    Vecterize Plots
    This function, given a vocabulary dictionary, inverted index and pre-processed dataframe return a dictionary with 
    tfidf scores
    Example: {1: {1: 0.7, 5: 3.7}, 2: {3: 1.7, 6: 5.7}} Where 1 and 2 denote the book_id and 1, 5, 3, 6 denote the word.
    df: Pre-processed data frame (df_clean)
    param column: If a list is provided the score will be computed over several columns
    '''

    no_of_documents = len(df)
    
    # number of words in vacabulary
    no_of_words_in_vocab = len(vocabulary)
    
    tfidfDicts = {}
    
    for index, row in df.iterrows():
        d_id = row['BookID']

        if isinstance(columns, list):
            text = (' '.join([row[i] for i in columns])).split(' ')
        else:
            raise('Column must be a list')
            
        no_of_words_in_plot = len(text)
        # Create a vector
        tfDict = dict.fromkeys((i for i in range(1, no_of_words_in_vocab+1)), 0)
        
        
        for word in text:
            index = vocabulary[word]
            tfDict[index] +=1
        
        tfidfDict = {}
        
        for key, value in tfDict.items():
            if value != 0:
                
                no_of_documents_appeared = len(inverted_index[key])

                tfidf = (value/no_of_words_in_plot) * np.log(no_of_documents/no_of_documents_appeared)

                tfidfDict[key] = float('{:.4f}'.format(tfidf))
                        
        tfidfDicts[d_id] = tfidfDict
        
    documents = collections.OrderedDict(sorted(tfidfDicts.items()))
    write_json('data/' + json_name, documents)

In [94]:
%%time
# Generate tfidf based only on the 'Plot' of the books (vocabulary and inverted index have been generated only over PLot as 
# well) 
vectorize_tfidf(df_clean, vocabulary, inverted_index, json_name='tfidf.json', columns=['Plot'])

Wall time: 10min 49s


In [32]:
# Run cell
def get_cosine(doc, query):
    """
    Given two vectors, return a float which is the cosine similarity score
    doc: dictionary vector 
    query: dictionary vector
    """
    intersection = set(doc.keys()) & set(query.keys())
    numerator = sum([doc[x] * query[x] for x in intersection])

    sum1 = sum([doc[x] ** 2 for x in list(doc.keys())])
    sum2 = sum([query[x] ** 2 for x in list(query.keys())])
    denominator = np.sqrt(sum1) * np.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [33]:
# Run cell
def show_results_cosine_similarity(book_id, cosine_similarity, df):
    """
    Generate a tuple with the relevant information about a book that will be displayed in the search engine, 
    and the cosine similarity score
    df: Not pre-processed data set
    """
    data = df[df.BookID == book_id][['BookTitle', 'Plot', 'Url']].values.tolist()[0]
    output = (cosine_similarity, data)
    return output

In [34]:
# Run cell
def search_engine_2(query, inverted_index, vocabulary, tfidf_scores_dict, df, k = 10):
    """
    Cosine similarity search engine which returns all books with the provided query in order of highest cosine 
    value, and displaying top k books
    
    query: Query of user
    df: This dataframe should not be pre-processed
    """
    output = pd.DataFrame(columns=['BookTitle', 'Plot', 'Url', 'Similarity'])
    documents_with_query_words = query_function(query, inverted_index, vocabulary)
    queryed_documents_tfidf = {key: value for key, value in tfidf_scores_dict.items() if key in documents_with_query_words}
    heap_data = []
    
    if len(documents_with_query_words) == 0:
        print('There are no results for the search')
    else:
    
        # pre-process query
        query = global_pre_process(query)

        # vectorize query
        vector_query = {}
        for word in query.split(' '):
            index = vocabulary[word]
            vector_query[index] = 1

        for i in queryed_documents_tfidf.keys():
            similarity = get_cosine(queryed_documents_tfidf[i], vector_query)
            x = show_results_cosine_similarity(i, similarity, df)
            if len(heap_data) < k:
                heapq.heappush(heap_data, x)
            else:
                heapq.heappushpop(heap_data, x)

        for i in range(len(heap_data)):
            output = output.append(pd.Series([heap_data[-(i+1)][1][0], heap_data[-(i+1)][1][1], 
                                              heap_data[-(i+1)][1][2], heap_data[-(i+1)][0]], 
                                             index=output.columns), ignore_index=True) 
        output = output.sort_values(by='Similarity', ascending=False)
        output = HTML(output.to_html(escape=False,
                                     formatters=dict(column_name_with_image_links=path_to_image_html)))
        return output

In [35]:
# Run cell
tfidfDicts = read_json('data/tfidf.json')

In [36]:
# Run cell
input_query = 'break heart'

In [37]:
# Run cell
search_engine_2(query = input_query, inverted_index = inverted_index, 
                vocabulary=vocabulary, tfidf_scores_dict=tfidfDicts,
                df = df_dirty, k = 10)

Unnamed: 0,BookTitle,Plot,Url,Similarity
2,Against All Odds,"Our lives shattered... Our hearts broken... Our souls torn to pieces... He was my world, my whole life. My reason for breathing. I had a perfect marriage, a baby on the way, and I felt fulfilled—almost invincible. Until the day life hit, leaving me broken, vulnerable, and alone. She was my life. My ray of hope on the cloudiest day. With her, I thought I had the ultimate safety. A love that would never hurt or betray me. I gave her my heart, my body, and my soul. Until she broke me, destroying every dream and illusion I had about life, love, and marriage. In our grief, we made a mistake. A mistake I'm not sure we can come back from.",https://www.goodreads.com/book/show/18803442-against-all-odds,0.293084
4,The Knight of the Rose,"Book no longer in print. Refer now to the combined Tears of the Broken and The Knight of the Rose as ""Dark Secrets"". Sequel to the internationally successful vampire novel, Tears of the Broken. Love was only the beginning of her nightmares. When Ara discovered the existence of vampires, she was given the choice between a life as one of them, or a life without her true love. But fate has a funny way of making choices for you. After breaking the heart of the boy she loves with a truth he cannot bear, Ara will find herself in the arms of a predator who will steal her innocence and force the hand of fate. Will David Knight become her rescuer once again, or will he be too late?",https://www.goodreads.com/book/show/13570791-the-knight-of-the-rose,0.220297
0,Almost Broken,"Lauren Brooks fell in love with Cal Scott at 21, married him at 22 and had her heart broken at 23 when he walked out of their marriage. At 25, though raising his daughter on her own, Lauren was finally moving on with her life. Until, she learned the reason for Cal's abandonment, the walls she’d carefully built around her collapsed.. The day she meets Chris, all those feelings she thought she had bottled up come spilling out. She can’t afford to give into her heart’s desire again. Love nearly broke her once, and her daughter doesn't need two broken parents. Christopher Scott is in love, newly engaged to Jenna, who saw him through an illness he didn’t think he could survive. He’s finally settling into the life he’s always wanted, making plans he only dared to dream before now. Until, a woman named Lauren arrives on his doorstep. She’s intriguing, beautiful and, try as he might, he can’t stay away. The closer he gets to her, the more his rock-solid plans begin to crumble. All he knows is that Lauren is the missing piece to a puzzle he must solve. For him to put all the pieces in place, he’ll have to follow his heart, and that might cost him everything...",https://www.goodreads.com/book/show/22667507-almost-broken,0.21701
5,A Love Surrendered,"Orphaned in Iowa, Annie Kennedy moves to Boston to stay with her spinster aunt. She longs for romance to fill the void left by her parents' death. But when she falls hard for Steven O'Connor, the man who broke an engagement to her sister, Annie is worried. Will he break her heart too when he discovers who she really is?",https://www.goodreads.com/book/show/13498999-a-love-surrendered,0.216607
1,Broken for You,"National best seller and Today show Book Club selection, Broken for You is the story of two women in self-imposed exile whose lives are transformed when their paths intersect. Stephanie Kallos's debut novel is a work of infinite charm, wit and heart. It is also a glorious homage to the beauty of broken things. When we meet septuagenarian Margaret Hughes, she is living alone in a mansion in Seattle with only a massive collection of valuable antiques for company. Enter Wanda Schultz, a young woman with a broken heart who has come west to search for her wayward boyfriend. Both women are guarding dark secrets and have spent many years building up protective armor against the outside world. As their tentative friendship evolves, the armor begins to fall away and Margaret opens her house to the younger woman. This launches a series of unanticipated events, leading Margaret to discover a way to redeem her cursed past, and Wanda to learn the true purpose of her cross-country journey. Both funny and heartbreaking, Broken for You is a testament to the saving graces of surrogate families and shows how far the tiniest repair jobs can go in righting the world's wrongs.",https://www.goodreads.com/book/show/96702.Broken_for_You,0.205746
6,Bittersweet Moments,"Can the embers of an old life ignite the flames of a new love? Six years ago, Melisa Bergfeld’s husband died. As the grief of losing him tore into her, she lost his last gift to her—their unborn child—and her hopes and dreams turned to ashes. Left with a life she no longer wants, she seeks salvation in a homeless shelter. For a while, that’s more than enough. But when a fire breaks out, in walks the man who will try to save her life—if she’ll let him. Florian “Heat” Dane has left behind a trail of broken hearts in his wake, including pieces of his own. For all the girls he’s used to fill the hole in his heart, there has been just one he could never erase from his memories. But when Melisa married his best friend Scott Bergfeld, he knew she would never be his the way she’d been the one unforgettable night they spent together. Now that she’s back in his life, he will do anything to recapture her heart, even if it means giving away his own. Heat still has the power to ignite passion in Melisa, something she both desires and rejects. He’s a known heartbreaker, and if there is one thing Melisa doesn’t need, it’s another crack in her heart. But when he confesses his love for her, she fears her secrets from the past will surface. And she might be the one to break his heart this time.",https://www.goodreads.com/book/show/20895867-bittersweet-moments,0.193598
3,My Existence Craves Yours,"My Existence Craves Yours is about how one heart seeks out the other in love, as if you’re drought and they’re rain. It’s a story that contains true love, trauma of a broken heart, mental illness, imprisonment of one’s soul and lessons of life. Amna Dhanani says, “I went through my work trying to come up with a theme, I wrecked my brain for weeks until I saw a pattern for a story. I’ve arranged the poems in a way that each poem has a place in the flow, even though the order that I’ve made is fictional but I’ve not only felt but lived every word that is in here, some by me and some by others as I couldn’t stop myself from writing what my eyes saw, what my ears heard and what my heart felt through the pain of those around me. It often made my own existence suffer from their grief. After the story, I’ve shared bits and pieces about my suffering and survival, ending on The Words chapter.”",https://www.goodreads.com/book/show/42356004-my-existence-craves-yours,0.18073
7,Rock Hard,"An ultimatum can break your heart... Every night lead singer, Sed Lionheart whips thousands of women into a frenzy with his voice alone. But the stage is the only place Sed feels any passion since he lost Jessica... If you’re not willing to break all the rules... It shattered her heart, but law student Jessica broke off her engagement to Sed, determined to be successful on her own terms. But no other man can ever hold a candle to Sed... Then a chance meeting and tortuously close quarters lead to uncontrollable flares of passion and rediscovery of their unique penchant for public encounters. Now, in addition to the risk of mutual heartbreak every time they get together, they’re in danger of truly scandalous public exposure...",https://www.goodreads.com/book/show/9442157-rock-hard,0.179152
8,Half Hearts,"A promise broken Losing her family at a young age, and then broken promises from the man she’d loved all her life, Charlie McCarty rarely allows anyone to get close to her. Resolved to live her life without love and determined to become a top-notch Veterinarian, she begins her residency in Redfield. Fate, however, has a way of stepping in to change even the most obstinate set plans and forces Charlie to face her past, push the boundaries of her control and her heart to the brink of destruction. A passion fueled desire It started out as a celebration, a chance for Charlie to let her hair down and just let go of her firm control for just one evening, but meeting a sexy as hell cowboy—and his familiar best friend—ambush everything. With relentless determination, both cowboys set out to show her that she is everything they want to complete their lives. Charlie begins to dream, once again, for the future she thought lost to her years ago. A Journey of the Heart When a terrifying figure from the past steps into their fragile romance, is their love enough to overcome the horror about to be unleashed or will it leave them with hearts broken in half?",https://www.goodreads.com/book/show/10043433-half-hearts,0.178943
9,Coast,"One life-changing summer. One boy. The boy. The boy who offered me safe touches and heart-stopping smiles - smiles he shared with his son. We filled our days with porch-step kisses, filled our ears with laughter, filled our hearts with love. Deep, soul-aching, desperate love. But love is misleading. It's an invisible, fleeting moment. Somewhere between false adoration and pure hatred comes an emotion, a vulnerable need, a single desire. It lives within the ones who miss it, who crave it, who know better than to expect it. Love is relentless. Even when that love turns to hate, turns to loathing, turns to pain. Love should heal you. But it can also break you. Believe me, I know... Because I'm Becca Owens - a broken girl... ...And he's Josh Warden - the boy who broke me.",https://www.goodreads.com/book/show/30192405-coast,0.174375


## 3. Define a new score!

Based on the information we have at our disposal, we have decided to explore two different approaches for our final search engine:
1. Weighted average of the cosine_similarity, ratingValue, ratingCount and reviewCount. This is not the ideal scenario since weights are not justifiable without any data on the users search history (weights are completely based on human judgement). With more information on the user history we could continuously update the weights.

\begin{align}
score_{new} = \frac{CosineSimilarity \cdot \omega_{1} + ratingValue \cdot \omega_{2} + ratingCount \cdot \omega_{3} + reviewCount \cdot \omega_{4}}{max(ratingValue) + max(ratingCount) + max(reviewCount)}
\end{align}


2. Normalizing the variables ratingValue, ratingCount and reviewCount upon their maximum values of the queried book ids and multiply their sum against the cosine similarity value.

\begin{align}
score_{new} = CosineSimilarity \cdot \bigg(\frac{ratingValue}{max(ratingValue)} + \frac{ratingCount}{max(ratingCount)} + \frac{reviewCount}{max(reviewCount)} \bigg)
\end{align}

In [101]:
%%time
vocabulary_question3, inverted_index_question3 = get_vocabulary_inverted_index(df_clean,
                columns=['BookTitle', 'BookSeries', 'BookAuthors', 'Plot', 'PublishingDate', 'Characters'])

write_json('data/inverted_index_question3.json', inverted_index_question3)
write_json('data/vocabulary_dict_question3.json', vocabulary_question3)

vectorize_tfidf(df_clean, vocabulary_question3, 
                inverted_index_question3, json_name='tfidf_question3.json',
                columns=['BookTitle', 'BookSeries', 'BookAuthors', 'Plot', 'PublishingDate', 'Characters'])

Wall time: 19min 3s


In [38]:
# Run cell
def show_results_cosine_similarity_and_ratings(book_id, cosine_similarity, df):
    """
    Generate a tuple with the relevant information about a book that will be displayed in the search engine, 
    the cosine similarity score and relevant quantitative information about the book (rating, reviewcount, ratingcount)
    df: Not pre-processed data set
    """
    data = df[df.BookID == book_id][['BookTitle', 'Plot', 'Url']].values.tolist()[0]
    ratingValue = float(df[df.BookID == book_id][['RatingValue']].values[0])
    ratingCount = float(df[df.BookID == book_id][['RatingCount']].values[0])
    reviewCount = float(df[df.BookID == book_id][['ReviewCount']].values[0])
    output = (cosine_similarity, ratingValue, ratingCount, reviewCount, data)
    return output

In [39]:
# Run cell
inverted_index_question3 = read_json('data/inverted_index_question3.json')
vocabulary_question3 = read_json_simple('data/vocabulary_dict_question3.json')
tfidfDicts_question3 = read_json('data/tfidf_question3.json')

In [40]:
# Run cell
def search_engine_3(query, inverted_index, vocabulary, tfidf_scores_dict, df, k = 10,
                    new_score='cosine_normalizer', weights=None):
    """
    df: Not pre-processed data set!
    There are currently two alterantives for the computation of the new score:
    - cosine_normalizer: Normalize all quantitative values and multiply against cosine similiarity
    - weighted_average: Give weights to all features based on expert judgement (provide weights as list required!!)
    - weights: list with weights:
        weights[0]: weight for cosine_similiarity
        weights[1]: weight for ratingValue
        weights[2]: weight for ratingCount
        weights[3]: weight for reviewCount

    """
   
    output = pd.DataFrame(columns=['BookTitle', 'Plot', 'Url', 'Score'])
    documents_with_query_words = query_function(query, inverted_index, vocabulary)
    queryed_documents_tfidf = {key: value for key, value in tfidf_scores_dict.items() if key in documents_with_query_words}
    heap_data = []
    
    if len(documents_with_query_words) == 0:
        print('There are no results for the search')
    else:
        
        # pre-process query
        query = global_pre_process(query)

        # vectorize query
        vector_query = {}
        for word in query.split(' '):
            index = vocabulary[word]
            vector_query[index] = 1

        # Get max_ratingCount and max_reviewCount
        ratingValue_list = []
        ratingCount_list = []
        reviewCount_list = []
        ratings_df = df[df.BookID.isin(documents_with_query_words)]

        max_ratingValue = max(list(map(float, df.RatingValue)))
        max_ratingCount = max(list(map(int, df.RatingCount)))
        max_reviewCount = max(list(map(int, df.ReviewCount)))

        # Compute cosine over all intersected documents
        for i in queryed_documents_tfidf.keys():
            similarity = get_cosine(queryed_documents_tfidf[i], vector_query)
            temp = show_results_cosine_similarity_and_ratings(i, similarity, df)
            if new_score == 'weighted_average':
                # temp[0] = cosine_similarity
                # temp[1] = rating
                # temp[2] = ratingCount
                # temp[3] = reviewCount
                # temp[4] = Relevant book information ([booktitle, plot, url])
                score = (temp[0]*weights[0] + temp[1]*weights[1] + temp[2]*weights[2] + temp[3]*weights[3])/np.sum([max_ratingValue, 
                                                                                                                    max_ratingCount, 
                                                                                                                    max_reviewCount])
                x = (score, temp[4])
            elif new_score == 'cosine_normalizer':
                # temp[0] = cosine_similarity
                # temp[1] = rating
                # temp[2] = ratingCount
                # temp[3] = reviewCount
                # temp[4] = Relevant book information ([booktitle, plot, url])
                score = temp[0]*(temp[1]/max_ratingValue + temp[2]/max_ratingCount + temp[3]/max_reviewCount)
                x = (score, temp[4])
            else:
                raise('New score method is not implemented')

            if len(heap_data) < k:
                heapq.heappush(heap_data, x)
            else:
                heapq.heappushpop(heap_data, x)


        for i in range(len(heap_data)):
            output = output.append(pd.Series([heap_data[-(i+1)][1][0], heap_data[-(i+1)][1][1], 
                                              heap_data[-(i+1)][1][2], heap_data[-(i+1)][0]], 
                                             index=output.columns), ignore_index=True) 
        output = output.sort_values(by='Score', ascending=False)
        output = HTML(output.to_html(escape=False,
                                     formatters=dict(column_name_with_image_links=path_to_image_html)))
        return output

In [41]:
# Run cell
input_query = 'friends in love'

#### First try using the cosine_normalizer approach

In [None]:
# Run cell
search_engine_3(query = input_query, inverted_index = inverted_index_question3, 
                vocabulary=vocabulary_question3, tfidf_scores_dict=tfidfDicts_question3,
                df = df_dirty, k = 10,
                new_score='cosine_normalizer')

#### Second try using the weighted_average approach

In [None]:
# Run cell
search_engine_3(query = input_query, inverted_index = inverted_index_question3, 
                vocabulary=vocabulary_question3, tfidf_scores_dict=tfidfDicts_question3,
                df = df_dirty, k = 10,
                new_score='weighted_average', weights=[0.5, 0.2, 0.2, 0.1])

## 4. Make a nice visualization!

In [None]:
# Run cell
def get_book_series(df, num_series=20, series_to_include=['Harry Potter']):
    """ 
    Get first num_series Book Series based on the order of apperance (also including series in the list series_to_include)
    df: Not pre-processed data set!
    """
    bookSeries = {}
    for index, row in df.iterrows():
        d_id = row['BookID']
        book_data = row
        clean_series = re.sub(r'[^a-zA-Z0-9]', ' ', book_data['BookSeries']).split()
        series_name = re.sub(r'[^a-zA-Z]', ' ', book_data['BookSeries']).rstrip().lstrip()
        # If the book is part of a series and the series is one single book
        if (series_name != '') & (len([i for i in clean_series if bool(re.match(r'\d+', i))]) == 1):
            if series_name not in bookSeries:
                # Make sure we only take the first 20 series
                if (len(bookSeries.keys()) < num_series) | (series_name in series_to_include):
                    split_date = re.findall(r'\d+', book_data['PublishingDate'])
                    year = [i for i in split_date if len(i) == 4][0]
                    bookSeries[series_name] = [[' '.join(clean_series), year, book_data['NumberofPages'], book_data['Url']]]
            else:                
                split_date = re.findall(r'\d+', book_data['PublishingDate'])
                try:
                    year = [i for i in split_date if len(i) == 4][0]
                except:
                    year = book_data[8]
                bookSeries[series_name].append([' '.join(clean_series), year, book_data['NumberofPages'], book_data['Url']])

    return bookSeries

In [None]:
# Run cell
book_series = get_book_series(df_dirty)

In [None]:
# Run cell
from ipywidgets import interact, Dropdown

dropdown_bookSeries = Dropdown(options = list(book_series.keys()))

def plot_series(series_name, bookSeries_dict = book_series):
    publish_years = [i[1] for i in bookSeries_dict[series_name]]
    pages = [i[2] for i in bookSeries_dict[series_name]]
    df = pd.DataFrame(columns=['Year', 'Pages'])
    df['Year'] = [int(i) for i in publish_years]
    df['Pages'] = [int(i) for i in pages]
    df = df.sort_values(by='Year', ascending=True)
    df['Years Since Publishment'] = df.Year - df.Year.min()
    df['Pages of Book Series'] = df.Pages.cumsum()
    df.plot(x = 'Years Since Publishment', y = 'Pages of Book Series', title = series_name,
            figsize=(12,8))
    
@interact(series_name = dropdown_bookSeries)
def dropdown_series(series_name):
    plot_series(series_name)

## 5. Algorithmic Question

#### Write a recursive program that, given a string, computes the length of the subsequence of maximum length that is in alphabetical order. Try some examples. Are the examples of short strings correct? Can you find examples that your algorithm does not terminate in reasonable time?

#### Show that the running time of the algorithm is exponential.

#### Write a program that computes the length of the subsequence of maximum length, using dynamic programming.

In [114]:
def dynamic_function(X): 
    # X is a string
    m = len(X)
    
    # L is a list of the length of the longest sequence of characters in alphabetical order that terminates at the i-th character
    L = [0 for i in range(m)]
    
    for i in range(m): 
        if i == 0:
            L[0] = 1
            continue
        max_ = 0
        index = -1
        for j in range(i):
            if X[j] < X[i] and L[j] > max_:
                max_ = L[j]
                index = j
        if n != -1:
            L[i] = L[index] +1
        else:
            L[i] = L[i-1]
    return max(L)

#### What is its runtime complexity?