## 0.1. Import Packages

In [36]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import os
import csv
import re
import numpy as np
import pandas as pd
import json
from scipy import spatial
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
import heapq
from IPython.display import HTML
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager
import webbrowser
import datetime
from langdetect import detect
import io
from multiprocessing import Pool
import multi_processing_functions

In [37]:
from __future__ import print_function

## 0.2. Useful Functions (read, write, etc.)

In [38]:
def write_file(filename, content):
    """Write ``str(content)`` to ``filename`` as UTF-8, creating parent folders.

    Args:
        filename: Destination path. A bare file name without a directory
            part is accepted.
        content: Object to store; it is converted with ``str`` first.
    """
    parent = os.path.dirname(filename)
    # os.makedirs('') raises FileNotFoundError, so only create a folder when
    # the path really has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, "w", encoding='utf-8') as f:
        f.write(str(content))

In [39]:
def write_tsv(filename, content):
    """Write ``content`` as the single row of a tab-separated UTF-8 file.

    Args:
        filename: Destination path; missing parent folders are created. A
            bare file name without a directory part is accepted.
        content: Iterable of field values forming the row.
    """
    parent = os.path.dirname(filename)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare names.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # newline='' leaves line-ending handling to the csv writer.
    with open(filename, "wt", newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(content)

In [60]:
def read_tsv(filename, type_='utf-8'):
    """Return the first record of a tab-separated file as a list of strings.

    Args:
        filename: Path of the TSV file.
        type_: Text encoding used to decode the file (for example 'utf-8'
            or 'cp850').

    Returns:
        The fields of the first record, or ``None`` when the file is empty.
    """
    # newline='' hands the raw line endings to the csv reader, so line breaks
    # inside quoted fields come back exactly as they were written.
    with open(filename, encoding=type_, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        return next(reader, None)

In [41]:
def write_json(file_name, content):
    """Dump ``content`` to ``file_name`` as indented JSON with sorted keys.

    Args:
        file_name: Destination path; missing parent folders are created. A
            bare file name without a directory part is accepted.
        content: JSON-serialisable object.
    """
    parent = os.path.dirname(file_name)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare names.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file_name, 'w') as outfile:
        json.dump(content, outfile, sort_keys=True, indent=4)

In [42]:
def jsonKeys2int(x):
    """Return a copy of dict ``x`` whose keys are converted with ``int``.

    Anything that is not a dict is handed back untouched. ``int`` raises
    ``ValueError`` for a key that is not an integer literal.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = value
    return converted

In [43]:
def read_json(file_name):
    """Load ``file_name`` as JSON, passing every decoded object through ``jsonKeys2int``."""
    with open(file_name) as handle:
        return json.load(handle, object_hook=jsonKeys2int)

In [72]:
def merge_tsv_files(path, destiny='data/tsv_files/tsv_files.tsv'):
    """Merge the one-row TSV files found in ``path`` into a single table.

    The first number in each file name becomes the ``bookid`` column and is
    placed in front of the fields read from the file. Every file is expected
    to hold the nine fields named in the column list below; pandas raises
    ``ValueError`` when a row has a different length.

    Args:
        path: Folder containing the per-book TSV files.
        destiny: Path of the merged, tab-separated output file.
    """
    destiny_abs = os.path.abspath(destiny)
    list_tsv = []
    for i, filename in enumerate(os.listdir(path), start=1):
        if i % 1000 == 0:
            print(i)  # progress indicator
        full_path = os.path.join(path, filename)
        ids = re.findall(r'\d+', filename)
        # Skip entries that cannot be a book file: names without a number and
        # the merged output itself when it lives in the same folder (re-runs).
        if not ids or os.path.abspath(full_path) == destiny_abs:
            continue
        book_data = read_tsv(full_path)
        if book_data is None:  # empty file, nothing to merge
            continue
        list_tsv.append([ids[0]] + book_data)
    df = pd.DataFrame(list_tsv, columns=['bookid', 'bookTitle', 'bookSeries', 'bookAuthors', 'ratingValue',
                                         'ratingCount', 'reviewCount', 'Plot', 'PublishingDate',
                                         'characters'])
    df.to_csv(destiny, sep='\t')

## 1. Data collection

### 1.1. Get the list of books

### 1.2. Crawl books

In [None]:
# Start the Firefox WebDriver session used by the crawler; the geckodriver
# binary path is whatever GeckoDriverManager().install() returns.
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

In [None]:
def scrap_book(href):
    """Open ``href`` in the module-level ``driver`` and return the page source.

    Args:
        href: URL of the page. Surrounding whitespace, such as the trailing
            newline left when URLs are read line by line from a file, is
            removed before navigating.

    Returns:
        ``driver.page_source`` after the wait.
    """
    driver.get(href.strip())
    # Fixed pause before reading the source; presumably it lets the page
    # finish rendering and throttles the crawl.
    time.sleep(5)
    return driver.page_source

In [None]:
# Download the pages listed (one URL per line) in book_urls.txt. Pages are
# stored 100 per folder: data/page_<p>/article_<n>.html.
filename = 'data/book_urls.txt'

count = 1
# The context manager closes the URL file even if a download fails.
with open(filename, 'r') as book_urls:
    for url in book_urls:
        url = url.strip()  # drop the line's trailing newline
        # NOTE(review): always true as written; presumably a threshold that
        # is raised by hand to resume an interrupted crawl.
        if count > 0:
            print(url)
            page_number = (count - 1) // 100 + 1
            html = scrap_book(url)
            path = 'data/page_'+str(page_number)+'/article_'+str(count)+'.html'
            print(path)
            write_file(path, html)
        count += 1
        # Stop once 2000 lines have been handled.
        if count == 2001:
            break

### 1.3 Parse downloaded pages

In [None]:
# Lookup table from English month name to its calendar number (January = 1).
month_to_number = {
    month_name: month_index
    for month_index, month_name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

In [None]:
def remove_html_tags(text):
    """Strip every ``<...>`` tag from ``text`` and return the remainder."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [None]:
def parse_html_in_folder(path):
    """Parse every downloaded book page in ``path`` into a one-row TSV file.

    For each HTML file the following fields are extracted, in this order:
    title, series, authors, rating value, rating count, review count, plot,
    number of pages (or the book format when no page count is present),
    publishing date, characters, settings and canonical URL. The row is
    written to ``data/tsv_files/book<N>.tsv`` where ``<N>`` is the first
    number in the HTML file name. Pages whose plot is not detected as English
    are skipped.

    Args:
        path: Folder containing the HTML files.
    """
    for html_file in os.listdir(path):
        print(html_file)
        with open(path + '/' + html_file, encoding='utf8') as infile:
            soup = BeautifulSoup(infile, features="lxml")

        # Plot can be hidden: child 3 of the description holds the complete
        # text when it exists, otherwise child 1 is used.
        descriptions = soup.find_all('div', id="description")
        try:
            plot = ' '.join(remove_html_tags(str(c)) for c in descriptions[0].contents[3].contents)
        except Exception:
            if not descriptions:
                plot = ''
            else:
                plot = ' '.join(remove_html_tags(str(c)) for c in descriptions[0].contents[1].contents)
        if plot and detect(plot) != 'en':
            print('Article removed:', html_file)
            continue

        book_title = soup.find_all('h1')[0].contents[0].replace('\n', '').strip()
        book_series = soup.find_all('h2', id='bookSeries')[0].text.replace('\n', '').strip()
        book_authors = ', '.join(span.contents[0] for span in soup.find_all('span', itemprop='name'))
        rating_value = soup.find_all('span', itemprop='ratingValue')[0].contents[0].replace('\n', '').strip()
        rating_count = soup.find_all('meta', itemprop="ratingCount")[0]['content']
        review_count = soup.find_all('meta', itemprop="reviewCount")[0]['content']

        # Page count, falling back to the book format and then to ''.
        try:
            number_of_pages = re.findall(
                r'\d+', soup.find_all('span', itemprop="numberOfPages")[0].contents[0])[0]
        except Exception:
            book_format = soup.find_all('span', itemprop="bookFormat")
            number_of_pages = book_format[0].contents[0] if book_format else ''

        # The publishing date sits in different places depending on the page
        # layout; try each known location in turn.
        details = soup.find_all('div', id='details')
        try:
            temp_date = details[0].find_all('div', {"class": "row"})[1].text.split('\n')[2].split()
        except Exception:
            try:
                temp_date = details[0].find_all('div', {"class": "row"})[0].contents[0].split('\n')[2].split()
            except Exception:
                try:
                    temp_date = details[0].find_all('nobr', {"class": "greyText"})[0].contents[0].split('\n')[1].split()[-3:]
                except Exception:
                    temp_date = ''
        publishing_date = ' '.join(temp_date)

        # Characters and settings are told apart by the link target. The
        # first link of the data box is skipped, as in the original code.
        characters = []
        settings = []
        for link in soup.find_all('div', id="bookDataBox")[0].find_all('a')[1:]:
            href = link.attrs['href']
            if re.match(r'/characters/', href):
                characters.append(link.text)
            elif re.match(r'/places/', href):
                settings.append(link.text)
        characters = ', '.join(characters)
        settings = ', '.join(settings)
        url = soup.find_all('link', rel='canonical')[0].attrs['href']

        final_list = [book_title, book_series, book_authors, rating_value, rating_count, review_count,
                      plot, number_of_pages, publishing_date, characters, settings, url]

        filename = 'data/tsv_files/book' + re.findall(r'\d+', html_file)[0] + '.tsv'

        # The file defines write_tsv; the previously called write_tsv_file
        # does not exist here and raised NameError for every page.
        write_tsv(filename, final_list)


In [None]:
# Non-parallel approach: parse the page folders one after another.
for i in range(1,301):
    print(i)
    try:
        parse_html_in_folder('data/page_' + str(i))
    except Exception as error:
        # Carry on with the remaining folders, but show why this one failed
        # so that real bugs are not mistaken for missing folders.
        print('FOLDER NOT PARSED:', i, repr(error))
        

In [None]:
# Parallel approach
# Each sub-folder of ../data_html is handed to one of 8 worker processes. The
# worker function comes from the multi_processing_functions module, not from
# the parse_html_in_folder defined in this notebook.
if __name__ == '__main__':
    with Pool(8) as p:
        # p.map returns one result per folder; the list is printed once every
        # folder has been processed.
        print(p.map(multi_processing_functions.parse_html_in_folder, 
                    ['../data_html/' + i for i in os.listdir('../data_html')]))

## 2. Search Engine

### 2.0. Pre-process of information

In [64]:
def remove_stop_words(plot):
    """Lower-case ``plot`` and drop English stop words and one-character tokens.

    Returns the surviving tokens joined by single spaces.
    """
    english_stop_words = nltk.corpus.stopwords.words('english')
    kept = []
    for token in word_tokenize(plot):
        lowered = token.lower()
        if len(token) > 1 and lowered not in english_stop_words:
            kept.append(lowered)
    return ' '.join(kept)

In [65]:
def remove_punctuation(plot):
    """Keep only the runs of word characters in ``plot``, joined by single spaces."""
    word_runs = nltk.RegexpTokenizer(r"\w+").tokenize(plot)
    return ' '.join(word_runs)

In [66]:
def remove_stemming(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem."""
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(sentence))

In [67]:
# Loaded on first use and then reused: loading the model is far more expensive
# than lemmatising a single field.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load("en_core_web_sm")
    return _SPACY_PIPELINE


def remove_lemma(sentence):
    """Return ``sentence`` with every token replaced by its spaCy lemma.

    Args:
        sentence: Text to lemmatise.

    Returns:
        The lemmas joined by single spaces.
    """
    doc = _get_spacy_pipeline()(sentence)
    return ' '.join(token.lemma_ for token in doc)

In [68]:
def global_pre_process(text):
    """Run the whole cleaning pipeline on ``text`` in one call.

    Steps, in order: punctuation removal, stop-word removal, lemmatisation;
    finally every character outside ``[a-zA-Z0-9]`` becomes a space and the
    result is stripped.
    """
    for step in (remove_punctuation, remove_stop_words, remove_lemma):
        text = step(text)
    return re.sub(r'[^a-zA-Z0-9]', ' ', text).strip()

In [69]:
def pre_processing(path, destiny_path='clean_tsv_files', columns_to_process=(0, 1, 2, 6, 8, 9),
                   columns_to_include=(3, 4, 5)):
    """Pre-process the raw book TSV files found in ``path``.

    The selected columns of every file are written, in ascending column
    order, to ``data/<destiny_path>/book<N>.tsv`` where ``<N>`` is the first
    number in the file name. Columns in ``columns_to_process`` go through
    ``global_pre_process``; columns in ``columns_to_include`` are copied
    unchanged. When column 8 is processed it is reduced to its first
    four-digit number (the year), or '' if there is none.

    Column indices of the raw files:

    0: bookTitle
    1: bookSeries
    2: bookAuthors
    3: ratingValue
    4: ratingCount
    5: reviewCount
    6: Plot
    7: NumberofPages
    8: PublishingDate
    9: characters
    10: settings
    11: url

    Args:
        path: Folder with the raw TSV files.
        destiny_path: Name of the output folder below ``data/``.
        columns_to_process: Column indices to clean.
        columns_to_include: Column indices to copy as they are.
    """
    date_column = 8
    selected_columns = sorted(list(columns_to_process) + list(columns_to_include))

    for filename in os.listdir(path):
        print(filename)
        book_data = read_tsv(os.path.join(path, filename))

        pre_processed_string = [
            global_pre_process(book_data[col]) if col in columns_to_process else book_data[col]
            for col in selected_columns
        ]

        # Keep only the year of publication. The position is looked up rather
        # than hard-coded so that other column selections also work.
        if date_column in columns_to_process:
            position = selected_columns.index(date_column)
            years = [number for number in re.findall(r'\d+', pre_processed_string[position])
                     if len(number) == 4]
            pre_processed_string[position] = years[0] if years else ''

        # Take the id from the file name only, so that digits in the folder
        # path cannot be mistaken for it.
        book_id = re.findall(r'\d+', filename)[0]
        write_tsv('data/' + destiny_path + '/book' + book_id + '.tsv', pre_processed_string)


In [70]:
# Clean every raw TSV in data/single_tsv/; results go to data/clean_tsv_files/.
pre_processing('data/single_tsv/', destiny_path='clean_tsv_files')

book3451.tsv
book3749.tsv


In [None]:
# Gather the cleaned per-book files into one table, clean_tsv_files.tsv.
merge_tsv_files('data/clean_tsv_files/', destiny='data/clean_tsv_files/clean_tsv_files.tsv')

### 2.1. Conjunctive query

#### 2.1.1. Create your index!

In [19]:
def get_vocabulary(path, columns):
    """Build the vocabulary and the inverted index of the TSV files in ``path``.

    Args:
        path: Folder with the cleaned one-row TSV files. The first number in
            each file name is used as the document id.
        columns: List of column indices whose text is indexed. The selected
            fields are joined and split on single spaces.

    Returns:
        A ``(vocabulary, inverted_index)`` pair: ``vocabulary`` maps each
        word to an integer id (assigned from 1 in order of first appearance)
        and ``inverted_index`` maps each id to the list of document ids that
        contain the word, in processing order and without repetitions.

    Raises:
        TypeError: If ``columns`` is not a list.
    """
    if not isinstance(columns, list):
        raise TypeError('Column must be a list')

    vocabulary = {}
    inverted_index = {}
    next_id = 1
    seen_doc_ids = set()
    for filename in os.listdir(path):
        d_id = int(re.findall(r'\d+', filename)[0])
        fields = read_tsv(os.path.join(path, filename))
        words = (' '.join([fields[i] for i in columns])).split(' ')

        # Two files can only share a document id by accident; only then is a
        # scan of the posting list needed to avoid a duplicate entry.
        repeated_doc = d_id in seen_doc_ids
        seen_doc_ids.add(d_id)

        # dict.fromkeys drops repeated words and keeps first-appearance
        # order, so ids are assigned exactly as when visiting every token.
        for word in dict.fromkeys(words):
            key = vocabulary.get(word)
            if key is None:
                vocabulary[word] = next_id
                inverted_index[next_id] = [d_id]
                next_id += 1
            else:
                postings = inverted_index[key]
                if not repeated_doc or d_id not in postings:
                    postings.append(d_id)
    return vocabulary, inverted_index

In [20]:
# Index column 6 of the cleaned test files only.
vocabulary_test, inverted_index_test = get_vocabulary('data/clean_tsv_files_test/', columns=[6])

In [21]:
# Save the inverted index and the vocabulary as JSON files.
write_json('data/inverted_index_test.json', inverted_index_test)
write_json('data/vocabulary_dict_test.json', vocabulary_test)

#### 2.1.2. Execute the query

In [22]:
def get_pointer_values(pointer, index_list):
    """Return, for each list in ``index_list``, the element its pointer selects.

    ``pointer[i]`` is used as the offset into ``index_list[i]``.
    """
    return [index_list[position][offset] for position, offset in enumerate(pointer)]

In [23]:
def update_pointer(values, pointer):
    """Advance by one every pointer whose entry in ``values`` equals the minimum.

    ``pointer`` is modified in place and also returned.
    """
    smallest = np.min(values)
    for position in np.where(values == smallest)[0]:
        pointer[position] += 1
    return pointer

In [24]:
def _intersect_sorted_postings(postings):
    """Intersect strictly increasing lists by walking one pointer per list."""
    if not postings:
        return []
    pointers = [0] * len(postings)
    common = []
    # Stop as soon as any list is exhausted: no further match is possible.
    while all(p < len(lst) for p, lst in zip(pointers, postings)):
        current = [lst[p] for p, lst in zip(pointers, postings)]
        largest = max(current)
        if min(current) == largest:
            # Every list points at the same document: it is a match.
            common.append(largest)
            pointers = [p + 1 for p in pointers]
        else:
            # Values below the current maximum cannot be in the intersection.
            pointers = [p + 1 if value < largest else p
                        for p, value in zip(pointers, current)]
    return common


def query_function(query, index, vocabulary_integer):
    """Return the ids of the documents that contain every word of ``query``.

    Args:
        query: Free-text query; it is cleaned with ``global_pre_process``.
        index: Inverted index mapping a term id to its list of document ids.
        vocabulary_integer: Mapping from word to term id.

    Returns:
        The sorted list of matching document ids. The list is empty when the
        cleaned query has no words or contains a word that is missing from
        the vocabulary, since no document can then contain all the words.
    """
    query_list = global_pre_process(query).split()
    if not query_list:
        return []

    try:
        integer_list = [vocabulary_integer[word] for word in query_list]
    except KeyError:
        return []

    # sorted(set(...)) yields the strictly increasing lists the merge needs.
    total_query_documents = [sorted(set(index[i])) for i in integer_list]
    return _intersect_sorted_postings(total_query_documents)

In [25]:
def path_to_image_html(path):
    """Return an HTML ``<img>`` tag showing ``path`` at most 124px high."""
    # The style attribute previously lacked its opening quote, which made
    # the generated tag malformed.
    return '<img src="' + path + '" style="max-height:124px;"/>'

In [26]:
def show_results(book_ids, tsv_path = 'data/tsv_files'):
    """Return a table with the title, plot and URL of each book in ``book_ids``.

    Args:
        book_ids: Iterable of book ids; each one is read from
            ``<tsv_path>/book<id>.tsv``.
        tsv_path: Folder holding the raw per-book TSV files.

    Returns:
        A DataFrame with the columns ``BookTitle``, ``Plot`` and ``Url`` and
        one row per id, in the order given.
    """
    rows = []
    for book_id in book_ids:
        data = read_tsv(tsv_path + '/book' + str(book_id) + '.tsv')
        # Field 0 is the title, field 6 the plot, the last field the URL.
        rows.append([data[0], data[6], data[-1]])
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame for every row.
    return pd.DataFrame(rows, columns=['BookTitle', 'Plot', 'Url'])

In [27]:
def search_engine_1(query, inverted_index, vocabulary, tsv_path):
    """Run the conjunctive query and return the matching books as an HTML table."""
    matching_ids = query_function(query, inverted_index, vocabulary)
    table = show_results(matching_ids, tsv_path)
    column_formatters = {'column_name_with_image_links': path_to_image_html}
    return HTML(table.to_html(escape=False, formatters=column_formatters))

In [28]:
# Conjunctive search for 'one could' on the test index; the bare `search`
# expression on the last line is what renders the table in the notebook.
search = search_engine_1('one could', inverted_index_test, vocabulary_test, 'data/test_tsv')
search

Unnamed: 0,BookTitle,Plot,Url
0,The Hunger Games,"Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning? In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV. Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.",https://www.goodreads.com/book/show/2767052-the-hunger-games
1,The Iron King,"Meghan Chase has a secret destiny; one she could never have imagined. Something has always felt slightly off in Meghan's life, ever since her father disappeared before her eyes when she was six. She has never quite fit in at school or at home. When a dark stranger begins watching her from afar, and her prankster best friend becomes strangely protective of her, Meghan senses that everything she's known is about to change. But she could never have guessed the truth - that she is the daughter of a mythical faery king and is a pawn in a deadly war. Now Meghan will learn just how far she'll go to save someone she cares about, to stop a mysterious evil, no faery creature dare face; and to find love with a young prince who might rather see her dead than let her touch his icy heart.",https://www.goodreads.com/book/show/6644117-the-iron-king


### 2.2. Conjunctive query & Ranking score

#### 2.2.1. Inverted index

In [29]:
def vectorize_tfidf(path, vocabulary, inverted_index, json_name='tfidf.json', columns=None):
    """Compute the TF-IDF vector of every document and store them as JSON.

    For each term of a document the score is
    ``(count / words_in_document) * log(documents / documents_with_term)``,
    rounded to four decimals. The result, a mapping
    ``document id -> {term id: score}``, is written to ``data/<json_name>``.

    Args:
        path: Folder with the cleaned one-row TSV files.
        vocabulary: Mapping from word to term id.
        inverted_index: Mapping from term id to the list of document ids.
        json_name: Name of the output file below ``data/``.
        columns: List of column indices the score is computed over;
            defaults to ``[6]``.

    Raises:
        TypeError: If ``columns`` is not a list.
    """
    if columns is None:
        columns = [6]
    if not isinstance(columns, list):
        raise TypeError('Column must be a list')

    tsv_files = os.listdir(path)
    no_of_documents = len(tsv_files)

    tfidfDicts = {}

    for filename in tsv_files:
        d_id = int(re.findall(r'\d+', filename)[0])
        fields = read_tsv(os.path.join(path, filename))
        text = (' '.join([fields[i] for i in columns])).split(' ')
        no_of_words_in_plot = len(text)

        # Count only the terms that occur instead of allocating a
        # vocabulary-sized vector of zeros for every document.
        term_counts = collections.Counter(vocabulary[word] for word in text)

        tfidfDict = {}
        for key in sorted(term_counts):
            value = term_counts[key]
            no_of_documents_appeared = len(inverted_index[key])
            tfidf = (value/no_of_words_in_plot) * np.log(no_of_documents/no_of_documents_appeared)
            tfidfDict[key] = float('{:.4f}'.format(tfidf))

        tfidfDicts[d_id] = tfidfDict

    documents = collections.OrderedDict(sorted(tfidfDicts.items()))
    write_json('data/' + json_name, documents)

In [36]:
%%time
# Write the TF-IDF vectors of the test documents to data/tfidf_test.json.
vectorize_tfidf('data/clean_tsv_files_test/', vocabulary_test, inverted_index_test, json_name='tfidf_test.json')

# read_json turns the JSON object keys back into integers.
tfidfDicts_test = read_json('data/tfidf_test.json')

Wall time: 54.9 ms


In [31]:
def get_cosine(doc, query):
    """Return the cosine similarity of two sparse vectors given as dicts.

    Both arguments map a term id to a weight. The result is 0.0 when either
    vector has zero length.
    """
    shared_terms = set(doc.keys()) & set(query.keys())
    dot_product = sum([doc[term] * query[term] for term in shared_terms])

    doc_norm = np.sqrt(sum([weight ** 2 for weight in doc.values()]))
    query_norm = np.sqrt(sum([weight ** 2 for weight in query.values()]))
    magnitude = doc_norm * query_norm

    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

In [32]:
def show_results_cosine_similarity(book_id, cosine_similarity, tsv_path = 'data/test_tsv'):
    """Pair ``cosine_similarity`` with the title, plot and URL of ``book_id``.

    Returns ``(cosine_similarity, [title, plot, url])``, the fields being
    read from ``<tsv_path>/book<book_id>.tsv``.
    """
    record = read_tsv(tsv_path + '/book' + str(book_id) + '.tsv')
    title, plot, url = record[0], record[6], record[-1]
    return (cosine_similarity, [title, plot, url])

In [33]:
def show_results_cosine_similarity_and_ratings(book_id, cosine_similarity, tsv_path = 'data/tsv_files'):
    """Pair ``cosine_similarity`` with the ratings and description of ``book_id``.

    Returns ``(cosine_similarity, ratingValue, ratingCount, reviewCount,
    [title, plot, url])`` with the three rating fields converted to float.
    """
    record = read_tsv(tsv_path + '/book' + str(book_id) + '.tsv')
    description = [record[0], record[6], record[-1]]
    rating_value, rating_count, review_count = (float(record[column]) for column in (3, 4, 5))
    return (cosine_similarity, rating_value, rating_count, review_count, description)

In [34]:
def search_engine_2(query, inverted_index, vocabulary, tfidf_scores_dict, tsv_path = 'data/tsv_files', k = 10):
    """Return the top ``k`` books matching ``query``, ranked by cosine similarity.

    Only documents containing every query word are considered. Each one is
    scored with the cosine similarity between its TF-IDF vector and the
    query vector (weight 1 per query term).

    Args:
        query: Free-text query.
        inverted_index: Mapping from term id to document ids.
        vocabulary: Mapping from word to term id.
        tfidf_scores_dict: Mapping ``document id -> {term id: tf-idf}``.
        tsv_path: Folder with the raw per-book TSV files.
        k: Maximum number of results.

    Returns:
        An ``IPython.display.HTML`` table with the columns ``BookTitle``,
        ``Plot``, ``Url`` and ``Similarity``, best match first.
    """
    # A set makes the membership test below O(1) per document.
    matching_ids = set(query_function(query, inverted_index, vocabulary))
    candidates = {doc_id: vector for doc_id, vector in tfidf_scores_dict.items()
                  if doc_id in matching_ids}

    top = []
    if candidates:
        # split() without argument ignores repeated spaces, consistently with
        # the way query_function tokenises the query.
        vector_query = {vocabulary[word]: 1 for word in global_pre_process(query).split()}
        scored = (
            show_results_cosine_similarity(doc_id, get_cosine(vector, vector_query), tsv_path)
            for doc_id, vector in candidates.items()
        )
        # nlargest keeps at most k records in memory, sorted best first.
        top = heapq.nlargest(k, scored)

    rows = [[data[0], data[1], data[2], similarity] for similarity, data in top]
    # Built in one go: DataFrame.append was removed in pandas 2.0.
    output = pd.DataFrame(rows, columns=['BookTitle', 'Plot', 'Url', 'Similarity'])
    return HTML(output.to_html(escape=False,
                               formatters=dict(column_name_with_image_links=path_to_image_html)))

In [37]:
# Ranked search for 'could one' using the index built over column 6 only.
search_engine_2(query = 'could one', inverted_index = inverted_index_test, 
                vocabulary=vocabulary_test, tfidf_scores_dict=tfidfDicts_test,
                tsv_path = 'data/test_tsv', k = 10)

Unnamed: 0,BookTitle,Plot,Url,Similarity
0,The Iron King,"Meghan Chase has a secret destiny; one she could never have imagined. Something has always felt slightly off in Meghan's life, ever since her father disappeared before her eyes when she was six. She has never quite fit in at school or at home. When a dark stranger begins watching her from afar, and her prankster best friend becomes strangely protective of her, Meghan senses that everything she's known is about to change. But she could never have guessed the truth - that she is the daughter of a mythical faery king and is a pawn in a deadly war. Now Meghan will learn just how far she'll go to save someone she cares about, to stop a mysterious evil, no faery creature dare face; and to find love with a young prince who might rather see her dead than let her touch his icy heart.",https://www.goodreads.com/book/show/6644117-the-iron-king,0.127684
1,The Hunger Games,"Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning? In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV. Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.",https://www.goodreads.com/book/show/2767052-the-hunger-games,0.102831


## 3. Define a new score!

After giving it a lot of thought, and based on the information we have, we have decided to explore two different approaches:
1. Weighted average of the cosine similarity, ratingValue, ratingCount and reviewCount. This is not ideal, since the weights cannot be justified without data on the users' search history.
2. Normalize the ratingValue, ratingCount and reviewCount and multiply their sum against the cosine similarity value.

In [38]:
%%time
# Rebuild vocabulary, inverted index and TF-IDF vectors over columns 0, 1, 2,
# 6, 7 and 8 of the cleaned files instead of column 6 alone (presumably title,
# series, authors, plot, year and characters, given pre_processing's default
# column selection -- confirm).
vocabulary_test_question3, inverted_index_test_question3 = get_vocabulary('data/clean_tsv_files_test/',
                columns=[0, 1, 2, 6, 7, 8])

write_json('data/inverted_index_test_question3.json', inverted_index_test_question3)
write_json('data/vocabulary_dict_test_question3.json', vocabulary_test_question3)

vectorize_tfidf('data/clean_tsv_files_test/', vocabulary_test_question3, 
                inverted_index_test_question3, json_name='tfidf_test_question3.json',
                columns=[0, 1, 2, 6, 7, 8])

# read_json turns the JSON object keys back into integers.
tfidfDicts_test_question3 = read_json('data/tfidf_test_question3.json')

Wall time: 90.8 ms


In [39]:
def _safe_ratio(value, maximum):
    """Return ``value / maximum``, or 0.0 when ``maximum`` is zero."""
    return value / maximum if maximum else 0.0


def search_engine_3(query, inverted_index, vocabulary, tfidf_scores_dict, tsv_path = 'data/tsv_files', k = 10,
                    new_score='cosine_normalizer', weights=None):
    """Return the top ``k`` books matching ``query``, ranked by a custom score.

    Only documents containing every query word are considered. Two scores
    are available:

    - ``cosine_normalizer``: ratingValue, ratingCount and reviewCount are
      each divided by their maximum over the matching documents, and the sum
      of the three is multiplied by the cosine similarity.
    - ``weighted_average``: weighted sum of the four raw values, with
      weights based on expert judgement (``weights`` is required):
        weights[0]: weight for the cosine similarity
        weights[1]: weight for ratingValue
        weights[2]: weight for ratingCount
        weights[3]: weight for reviewCount

    Args:
        query: Free-text query.
        inverted_index: Mapping from term id to document ids.
        vocabulary: Mapping from word to term id.
        tfidf_scores_dict: Mapping ``document id -> {term id: tf-idf}``.
        tsv_path: Folder with the raw per-book TSV files.
        k: Maximum number of results.
        new_score: ``'cosine_normalizer'`` or ``'weighted_average'``.
        weights: The four weights; only read by ``weighted_average``.

    Returns:
        An ``IPython.display.HTML`` table with the columns ``BookTitle``,
        ``Plot``, ``Url`` and ``Score``, best match first.

    Raises:
        ValueError: If ``new_score`` is unknown, or if ``weighted_average``
            is requested without four weights.
    """
    # Validate up front so that a bad call fails even when nothing matches.
    if new_score not in ('cosine_normalizer', 'weighted_average'):
        raise ValueError('New score method is not implemented')
    if new_score == 'weighted_average' and (weights is None or len(weights) < 4):
        raise ValueError('weighted_average requires a list with four weights')

    # A set makes the membership test below O(1) per document.
    matching_ids = set(query_function(query, inverted_index, vocabulary))
    candidates = {doc_id: vector for doc_id, vector in tfidf_scores_dict.items()
                  if doc_id in matching_ids}

    # One record per matching document:
    # (cosine, ratingValue, ratingCount, reviewCount, [title, plot, url]).
    # Each TSV file is read once and reused for the maxima and the score.
    records = []
    if candidates:
        # split() without argument ignores repeated spaces, consistently with
        # the way query_function tokenises the query.
        vector_query = {vocabulary[word]: 1 for word in global_pre_process(query).split()}
        records = [
            show_results_cosine_similarity_and_ratings(doc_id, get_cosine(vector, vector_query), tsv_path)
            for doc_id, vector in candidates.items()
        ]

    if new_score == 'cosine_normalizer':
        # default=0 covers the case of no matching document.
        max_rating_value = max((record[1] for record in records), default=0)
        max_rating_count = max((record[2] for record in records), default=0)
        max_review_count = max((record[3] for record in records), default=0)

        def score_of(record):
            return record[0] * (_safe_ratio(record[1], max_rating_value)
                                + _safe_ratio(record[2], max_rating_count)
                                + _safe_ratio(record[3], max_review_count))
    else:
        def score_of(record):
            return (record[0]*weights[0] + record[1]*weights[1]
                    + record[2]*weights[2] + record[3]*weights[3])

    # nlargest returns the k best (score, data) pairs, best first.
    top = heapq.nlargest(k, ((score_of(record), record[4]) for record in records))

    rows = [[data[0], data[1], data[2], score] for score, data in top]
    # Built in one go: DataFrame.append was removed in pandas 2.0.
    output = pd.DataFrame(rows, columns=['BookTitle', 'Plot', 'Url', 'Score'])
    return HTML(output.to_html(escape=False,
                               formatters=dict(column_name_with_image_links=path_to_image_html)))

In [42]:
# Custom-score search on the multi-column index. `weights` is only read by
# the 'weighted_average' branch of search_engine_3, so it has no effect here.
search_engine_3(query = 'could one', inverted_index = inverted_index_test_question3, 
                vocabulary=vocabulary_test_question3, tfidf_scores_dict=tfidfDicts_test_question3,
                tsv_path = 'data/test_tsv', k = 10,
                new_score='cosine_normalizer', weights=[0.5, 0.2, 0.1, 0.2])

Unnamed: 0,BookTitle,Plot,Url,Score
0,The Hunger Games,"Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning? In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV. Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.",https://www.goodreads.com/book/show/2767052-the-hunger-games,0.183938
1,The Iron King,"Meghan Chase has a secret destiny; one she could never have imagined. Something has always felt slightly off in Meghan's life, ever since her father disappeared before her eyes when she was six. She has never quite fit in at school or at home. When a dark stranger begins watching her from afar, and her prankster best friend becomes strangely protective of her, Meghan senses that everything she's known is about to change. But she could never have guessed the truth - that she is the daughter of a mythical faery king and is a pawn in a deadly war. Now Meghan will learn just how far she'll go to save someone she cares about, to stop a mysterious evil, no faery creature dare face; and to find love with a young prince who might rather see her dead than let her touch his icy heart.",https://www.goodreads.com/book/show/6644117-the-iron-king,0.100873


In [41]:
# Same query ranked by plain cosine similarity on the multi-column index,
# for comparison with the custom score.
search_engine_2(query = 'could one', inverted_index = inverted_index_test_question3, 
                vocabulary=vocabulary_test_question3, tfidf_scores_dict=tfidfDicts_test_question3,
                tsv_path = 'data/test_tsv', k = 10)

Unnamed: 0,BookTitle,Plot,Url,Similarity
0,The Iron King,"Meghan Chase has a secret destiny; one she could never have imagined. Something has always felt slightly off in Meghan's life, ever since her father disappeared before her eyes when she was six. She has never quite fit in at school or at home. When a dark stranger begins watching her from afar, and her prankster best friend becomes strangely protective of her, Meghan senses that everything she's known is about to change. But she could never have guessed the truth - that she is the daughter of a mythical faery king and is a pawn in a deadly war. Now Meghan will learn just how far she'll go to save someone she cares about, to stop a mysterious evil, no faery creature dare face; and to find love with a young prince who might rather see her dead than let her touch his icy heart.",https://www.goodreads.com/book/show/6644117-the-iron-king,0.100895
1,The Hunger Games,"Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning? In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV. Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.",https://www.goodreads.com/book/show/2767052-the-hunger-games,0.061313


## 4. Make a nice visualization!

In [34]:
def _extract_year(date_text):
    """Return the first 4-digit number found in ``date_text``.

    Falls back to ``date_text`` unchanged when it contains no 4-digit number.
    """
    for number in re.findall(r'\d+', date_text):
        if len(number) == 4:
            return number
    return date_text


def get_book_series(path, num_series=20, series_to_include=('Harry Potter',)):
    """Group the per-book TSV files found in ``path`` by book series.

    Each file is read with ``read_tsv`` and indexed positionally: field 1 is
    the series label (e.g. ``'(The Hunger Games #1)'``), field 7 the page
    count, field 8 the publishing date and the last field the url, following
    the column order used elsewhere in this notebook.

    A book is kept only when its series label contains letters and exactly
    one token starting with a digit (so a single volume, not a range or box
    set).

    Parameters
    ----------
    path : str
        Directory holding the per-book TSV files.
    num_series : int
        Maximum number of distinct series to collect. Books of a series that
        has already been collected are always added.
    series_to_include : iterable of str
        Series names collected even after ``num_series`` has been reached.

    Returns
    -------
    dict
        ``{series_name: [[series_label, year, pages, url, filename], ...]}``.
        ``year`` is the first 4-digit number of the publishing date (as a
        string), or the raw date field when no such number exists.
    """
    bookSeries = {}
    for count, name in enumerate(os.listdir(path), start=1):
        # Progress indicator for large folders.
        if count % 1000 == 0:
            print(count)
        filename = os.path.join(path, name)
        book_data = read_tsv(filename)
        # Skip empty files and rows too short to hold the fields read below.
        if not book_data or len(book_data) < 9:
            continue

        series_field = book_data[1]
        clean_series = re.sub(r'[^a-zA-Z0-9]', ' ', series_field).split()
        series_name = re.sub(r'[^a-zA-Z]', ' ', series_field).strip()
        numbered_tokens = [tok for tok in clean_series if re.match(r'\d+', tok)]
        # Keep only books that belong to a series and carry one volume number.
        if not series_name or len(numbered_tokens) != 1:
            continue

        if series_name not in bookSeries:
            # Cap the number of distinct series unless explicitly requested.
            if len(bookSeries) >= num_series and series_name not in series_to_include:
                continue
            bookSeries[series_name] = []

        bookSeries[series_name].append([
            ' '.join(clean_series),
            _extract_year(book_data[8]),
            book_data[7],
            book_data[-1],
            filename,
        ])

    return bookSeries

In [35]:
# Build the series -> books mapping from every file under data/tsv_files/
# (get_book_series prints a running count every 1000 files).
book_series = get_book_series('data/tsv_files/')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [None]:
# TODO: It would be cool if we can put this into a dropdown of some kind
def plot_series(series_name, bookSeries_dict=None):
    """Plot the cumulative page count of a series against years since its first book.

    Parameters
    ----------
    series_name : str
        Key of the series to plot in ``bookSeries_dict``.
    bookSeries_dict : dict, optional
        Mapping of series name to a list of book entries, where position 1 of
        each entry is the publishing year and position 2 the page count.
        Defaults to the module-level ``book_series``, looked up at call time
        so that a rebuilt ``book_series`` is picked up.

    Raises
    ------
    KeyError
        If ``series_name`` is not a key of ``bookSeries_dict``.
    """
    if bookSeries_dict is None:
        bookSeries_dict = book_series
    books = bookSeries_dict[series_name]

    # Years and page counts arrive as text; convert them so the subtraction
    # and cumulative sum below are numeric. Unparseable values become NaN.
    df = pd.DataFrame({
        'Year': pd.to_numeric(pd.Series([book[1] for book in books]), errors='coerce'),
        'Pages': pd.to_numeric(pd.Series([book[2] for book in books]), errors='coerce'),
    })
    print(df)

    # Books without a usable year or page count cannot be placed on the curve.
    df = df.dropna(subset=['Year', 'Pages']).sort_values(by='Year', ascending=True)
    df['Years Since Publishment'] = df.Year - df.Year.min()
    df['Pages of Book Series'] = df.Pages.cumsum()
    df.plot(x='Years Since Publishment', y='Pages of Book Series', title=series_name)

In [None]:
# Example: draw the curve for the 'Harry Potter' series.
plot_series('Harry Potter')

In [None]:
from ipywidgets import interact, Dropdown

# Offer every collected series name as a choice in the widget.
dropdown_bookSeries = Dropdown(options=list(book_series))


@interact(series_name=dropdown_bookSeries)
def dropdown_series(series_name):
    """Plot the series currently selected in the dropdown."""
    plot_series(series_name)

# TO DELETE

In [22]:
# Scratch: field order of one per-book TSV row.
# NOTE(review): these names are not defined in this part of the notebook - confirm before running.
[bookTitle, bookSeries, bookAuthors, ratingValue, ratingCount, reviewCount,
                          Plot, NumberofPages, PublishingDate, characters, settings, url]

In [23]:
# Scratch: display the current value of x.
x

'Beware the Jabberwock, my son! \n The jaws that bite, the claws that catch! \n Beware the Jubjub bird, and shun \n The frumious Bandersnatch!'

In [28]:
# Scratch setup: list the files in the small test folder.
path = 'data/test_tsv/'
tsv_files = os.listdir(path)

list_tsv = []
# NOTE(review): the DataFrame built here is not assigned to anything, so its result is discarded.
pd.DataFrame(columns=['bookid', 'bookTitle', 'bookSeries', 'bookAuthors', 'ratingValue', 
                               'ratingCount', 'reviewCount', 'Plot', 'NumberofPages', 'PublishingDate', 
                               'characters', 'settings', 'url'])

def merge_tsv_files(path, destiny='data/tsv_files/tsv_files.tsv'):
    """Merge the per-book TSV files in ``path`` into one tab-separated table.

    The first run of digits in each file name is used as the book id and is
    prepended to the row read from that file with ``read_tsv``.

    Parameters
    ----------
    path : str
        Directory holding the per-book TSV files.
    destiny : str
        Path of the merged TSV file to write. Its parent directory is created
        if missing.

    Raises
    ------
    ValueError
        Raised by pandas if a row does not have exactly 12 fields.
    """
    columns = ['bookid', 'bookTitle', 'bookSeries', 'bookAuthors', 'ratingValue',
               'ratingCount', 'reviewCount', 'Plot', 'NumberofPages', 'PublishingDate',
               'characters', 'settings', 'url']
    rows = []
    for name in os.listdir(path):
        # Files without a number in their name carry no book id (for example
        # a previously merged output stored in the same folder): skip them.
        id_match = re.search(r'\d+', name)
        if id_match is None:
            continue
        book_data = read_tsv(os.path.join(path, name))
        # read_tsv yields nothing for an empty file.
        if not book_data:
            continue
        rows.append([id_match.group()] + book_data)

    out_dir = os.path.dirname(destiny)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(destiny, sep='\t')

In [29]:
# Scratch: display the rows collected so far.
list_tsv

[['1',
  'The Hunger Games',
  '(The Hunger Games #1)',
  'Suzanne Collins',
  '4.33',
  '6409198',
  '172562',
  "Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning?   In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.   Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against 

In [30]:
# Turn the collected rows into a table; each row must supply one value per column name listed here.
df=pd.DataFrame(list_tsv,columns=['bookid', 'bookTitle', 'bookSeries', 'bookAuthors', 'ratingValue', 
                                     'ratingCount', 'reviewCount', 'Plot', 'NumberofPages', 'PublishingDate', 
                                     'characters', 'settings', 'url'])

In [32]:
# Write the merged table as a tab-separated file.
df.to_csv('data/tsv_files/tsv_files.tsv', sep='\t')

In [33]:
import pandas as pd

# Small example: convert a DataFrame back into a list of row lists.
products = {
    'Product': ['Tablet', 'iPhone', 'Laptop', 'Monitor'],
    'Price': [250, 800, 1200, 300],
}

df = pd.DataFrame(products, columns=['Product', 'Price'])

# One inner list per row, in column order.
products_list = df.values.tolist()
print(products_list)

[['Tablet', 250], ['iPhone', 800], ['Laptop', 1200], ['Monitor', 300]]
