In [1]:
import re
import os
import csv
import nltk
import pickle
import numpy as np
import heapq
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from heapq import heappush
N_doc=26543

In [2]:
tokenizer = RegexpTokenizer(r'[a-z]+') #Change this line by removing 0-9 if we don't want numbers in the plot tokens.
stop_words = set(stopwords.words("english"))
stemmer= PorterStemmer()

In [3]:
with open('inverted_index_1.pkl', 'rb') as handle:
    inverted_index = pickle.load(handle)
with open('vocabulary.pkl', 'rb') as handle:
    vocabulary = pickle.load(handle)
with open('vocabulary2.pkl', 'rb') as handle:
    vocabulary2 = pickle.load(handle)
with open('tfIdf_index.pkl', 'rb') as handle:
    tfIdf_index = pickle.load(handle)
with open('BookTokens.pkl', 'rb') as handle:
    BookTokens = pickle.load(handle)

In [4]:
class bookTitle():
    def name():
        return "bookTitle"
    def parse(soup):
        bookTitle = soup.find_all('h1')[0].contents[0]
        bookTitle = " ".join(bookTitle.split())
        return (bookTitle)
    def score():
        pass

class bookSeries():
    def name():
        return "bookSeries"
    def parse(soup):
        bookSeries=""
        bookSeries=soup.find('h2',id="bookSeries").text.strip()[1:-1]
        return bookSeries

class bookAuthors():
    def name():
        return "bookAuthors"
    def parse(soup):
        bookAuthors=[]
        for element in soup.find_all("span",itemprop="name"):
            bookAuthors.append(element.text.strip())
        return print_list(bookAuthors)

class ratingValue():
    def name():
        return "ratingValue"
    def parse(soup):
        ratingValue = soup.find_all('span',itemprop="ratingValue")[0].contents[0].split('\n')[1].strip() 
        return ratingValue

class ratingCount():
    def name():
        return "ratingCount"  
    def parse(soup):
        return str(soup.find("meta",itemprop="ratingCount").get("content"))

class reviewCount():
    def name():
        return "reviewCount"
    def parse(soup):
        return str(soup.find("meta",itemprop="reviewCount").get("content"))


class Plot():
    def name():
        return "Plot"
    def parse(soup):
        def headingToRemove(Plot): 
            to_check=Plot.find("i")
            if to_check:
                forbidden_strings=["isbn","edition","librarian's note"]
                for string in forbidden_strings:
                    if string in to_check.text.lower():
                        Plot.find("i").decompose()

        Plot=soup.find("div", id="descriptionContainer").find_all("span")

        if len(Plot)==2:
            Plot=Plot[1]
            headingToRemove(Plot)
            Plot=Plot.text
            Plot=" ".join(Plot.split())
            Plot=Plot.replace("\\","")
        elif len(Plot)==1: 
            Plot=Plot[0]
            headingToRemove(Plot)
            Plot=Plot.text
            Plot=" ".join(Plot.split())
            Plot=Plot.replace("\\","")
        else:
            Plot=""
        return Plot


class NumberOfPages():
    def name():
        return "NumberOfPages"
    def parse(soup):
        N_pages=soup.find_all('span', itemprop="numberOfPages")
        if N_pages:
            return N_pages[0].contents[0].replace('\n', '').strip().split()[0]
        return ""

class Publishing_Date():
    def name():
        return "Publishing_Date"
    def parse(soup):
        elements = [e for e in soup.find_all("div", class_="row") if re.match(r'Published',e.text.strip())]
        #We first try to get the "first published date"
        if elements:
            date=re.findall(r'(?<=\(first published )(.*?)(?=\))',elements[0].text)
        else:
            return ""
        if date:
            return date[0]
        #We now see if there is a publishing date (but not a first publishing one).
        date=" ".join(elements[0].text.split()).split()
        #Handling the issue that not always the date is in the same format 
        if date[1]!="by":
            Publishing_Date=date[1]
            if len(date)>2 and date[2]!="by":
                Publishing_Date+=" "+date[2]
                if len(date)>3 and date[3]!="by":
                    Publishing_Date+=" "+date[3]
            return Publishing_Date
        else:
            return ""

        
class Characters():
    def name ():
        return "Characters"
    def parse(soup):
        Characters=soup.find_all("a",{'href': re.compile(r'^/characters/')})
        characters=[]
        for item in Characters:
            characters.append(" ".join(item.text.split()))
        return print_list(characters)

class Setting():
    def name():
        return "Setting"
    def parse(soup):
        Setting_temp=soup.find_all("div",class_="infoBoxRowItem")
        Setting=[]
        temp=[]
        Setting_places = []
        for element in Setting_temp:
            if element.find("a",{'href': re.compile(r'^/places/')}):
                Setting_places=element
        if Setting_places:
            temp=Setting_places.find_all()
        else:
            Setting=[]
        for element in temp:
            if element.name=="a":
                to_insert=element.text.split()
                Setting.append(" ".join(to_insert))
            if element.name=="span":
                to_add=element.text.split()
                Setting[-1]+=" "+(" ".join(to_add))
        #This is only a vert long workaround but seems to work
        for i in range(len(Setting)):
            Setting[i]=Setting[i].replace("…more","").replace("…less","").strip()
        Setting=list(dict.fromkeys([x for x in Setting if x]))
        return print_list(Setting)


class Url():
    def name():
        return "Url"
    def parse(soup):
        return re.findall(r'(?<=link href=")(.*?)(?=")',str(soup))[0]
    def score(book,query):
        print("Warning: a score for Url is not implemented. Returning default value of 1")
        return 1

In [21]:
fields=[bookTitle,bookSeries,bookAuthors,ratingValue,ratingCount,reviewCount,NumberOfPages,Publishing_Date,Characters,Setting,Url]

def SearchEngine3(fields_list):
    print("Write the plot keywords")
    plot_input=input()
    print("Write other parameters, specifing the field separated by a ','. Example: numpages 235, title hunger")
    text_input=input()
    text_input=text_input.split(",")
    field_names=[x.name().lower() for x in fields]  
    dictionary={}
    for input_field in text_input:
        input_field=input_field.split()
        if input_field and input_field[0].lower() in field_names:
            if input_field[0] in dictionary:
                print("Warning: field",input_field[0],"inserted more than once. Only the first value will be used")
                continue

            if len(input_field)>1:
                dictionary[input_field[0]]=" ".join(input_field[1:len(input_field)])
            else:
                print("Warning: the field",input_field[0],"has no specified value")
        else:
            if input_field:
                print("Warning: the field",'"'+input_field[0]+'"', "does not exist!")
            else:
                print("Warning: empty field name entered")
    
    
    #testato fino a sopra qua.
    
    to_call=[]
    for element in dictionary:
        to_call.append(field_names.index(element))

    with open(book):
        for index in to_call:

            fields_list[index].score(dictionary[fields_list[index].name().lower()],row[fields_list[index].name()])

In [22]:
SearchEngine3(fields)

Write the plot keywords
asdads
Write other parameters, specifing the field separated by a ','. Example: numpages 235, title hunger
booktitle the hunger games, numberofpages 123
0
6


In [None]:
def scoreNumPages(book,parameter):