In [62]:
import re
import os
import csv
import nltk
import pickle
import numpy as np
import heapq
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from heapq import heappush
# Document count used as the numerator of the IDF term in TfIdfScore_plot
# (presumably the size of the indexed corpus -- confirm it matches the pickled indexes).
N_doc = 26543

#from script_module import*

In [63]:
# Text-normalisation helpers shared by the plot search.
# Tokens are runs of lowercase ASCII letters only, so digits are NOT captured;
# extend the character class (e.g. with 0-9) if numbers should become plot tokens.
tokenizer = RegexpTokenizer(r'[a-z]+')
# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))
# Porter stemmer applied to every query token that survives stop-word removal.
stemmer = PorterStemmer()

In [64]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load index files that were produced by a trusted run of this project.

# Indexed by vocabulary[stem]; the values are intersected like sets of book ids.
inverted_index = _load_pickle('inverted_index_1.pkl')
# Maps a stem to the integer id used as key in inverted_index and vocabulary2.
vocabulary = _load_pickle('vocabulary.pkl')
# Indexed by term id; used as the denominator of the IDF term
# (presumably the document frequency of the term -- TODO confirm).
vocabulary2 = _load_pickle('vocabulary2.pkl')
# Loaded for completeness; not referenced by the cells visible in this notebook.
tfIdf_index = _load_pickle('tfIdf_index.pkl')
# Indexed by book id; each entry is iterated as (term id, weight) pairs.
BookTokens = _load_pickle('BookTokens.pkl')

In [65]:
class bookTitle():
    """Title field: scraping from a book page and similarity scoring against a query.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"bookTitle"``."""
        return "bookTitle"

    @staticmethod
    def parse(soup):
        """Return the text of the first ``<h1>`` with whitespace runs collapsed.

        ``soup`` must expose ``find_all`` (presumably a BeautifulSoup document --
        confirm against the caller). Raises IndexError when the page has no ``<h1>``.
        """
        title = soup.find_all('h1')[0].contents[0]
        return " ".join(title.split())

    @staticmethod
    def score(book_info,query):
        """Return the Jaccard similarity (0..1) of the character sets of both strings.

        The comparison is case-insensitive, in line with the other text fields
        (bookAuthors, Characters, Setting). When both strings are empty the
        similarity is undefined (division by zero), so 0 is returned.
        """
        w1 = set(book_info.lower())
        w2 = set(query.lower())

        if not w1 and not w2:
            return 0

        return 1-nltk.jaccard_distance(w1, w2)

class bookSeries():
    """Series field: scraping from a book page and similarity scoring against a query.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"bookSeries"``."""
        return "bookSeries"

    @staticmethod
    def parse(soup):
        """Return the series label without its first and last character.

        The text of ``<h2 id="bookSeries">`` is stripped and its first and last
        characters are dropped (presumably the surrounding parentheses -- confirm
        against a real page). Returns ``""`` when the element is missing instead
        of failing with AttributeError.
        """
        series_tag=soup.find('h2',id="bookSeries")
        if series_tag is None:
            return ""
        return series_tag.text.strip()[1:-1]

    @staticmethod
    def score(book_info,query):
        """Return the Jaccard similarity (0..1) of the character sets of both strings.

        The comparison is case-insensitive, in line with the other text fields
        (bookAuthors, Characters, Setting). When both strings are empty the
        similarity is undefined (division by zero), so 0 is returned.
        """
        w1 = set(book_info.lower())
        w2 = set(query.lower())

        if not w1 and not w2:
            return 0

        return 1-nltk.jaccard_distance(w1, w2)

class bookAuthors():
    """Authors field: scraping from a book page and similarity scoring against a query.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"bookAuthors"``."""
        return "bookAuthors"

    @staticmethod
    def parse(soup):
        """Collect the stripped text of every ``<span itemprop="name">``.

        The list is handed to ``print_list``, which is not defined in this
        notebook (presumably it comes from ``script_module`` and joins the items
        with ';', since ``score`` splits on ';' -- TODO confirm).
        """
        author_names=[element.text.strip() for element in soup.find_all("span",itemprop="name")]
        return print_list(author_names)

    @staticmethod
    def score(book_info,query):
        """Return the mean, over the book's authors, of the best match in the query.

        ``book_info`` is split on ';' and ``query`` on ','; both are lower-cased.
        Each pair is compared by Jaccard similarity of their character sets, and
        every book author keeps its best similarity. A pair of two empty strings
        has an undefined similarity (division by zero) and is counted as 0.
        """
        book_names=book_info.lower().split(";")
        query_names=query.lower().split(",")

        def similarity(left,right):
            left_chars=set(left.strip())
            right_chars=set(right.strip())
            if not left_chars and not right_chars:
                return 0
            return 1-nltk.jaccard_distance(left_chars,right_chars)

        # str.split always yields at least one item, so max() and the division are safe.
        best_matches=[max(similarity(book_name,wanted) for wanted in query_names)
                      for book_name in book_names]

        return sum(best_matches)/len(best_matches)

class ratingValue():
    """Average-rating field: scraping from a book page and scoring against a target rating.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"ratingValue"``."""
        return "ratingValue"

    @staticmethod
    def parse(soup):
        """Return the stripped second line of the first ``<span itemprop="ratingValue">``.

        Raises IndexError when the span is missing or its text has no second line.
        """
        rating_span=soup.find_all('span',itemprop="ratingValue")[0]
        return rating_span.contents[0].split('\n')[1].strip()

    @staticmethod
    def score(book_info,query):
        """Return a score in (0, 1) that rises with the book rating relative to ``query``.

        The score is ``(tanh(1.5 * (rating - query + 2)) + 1) / 2``: 0.5 when the
        book is rated two points below the requested value and close to 1 once
        the rating reaches it. Returns 0 when either value cannot be read as a
        number; a query outside 0-5 only triggers a warning.
        """
        def custom_tanh(x,parameter):
            value=3/2*(x-parameter+2)
            return((np.tanh(value))+1)/2

        try:
            query=float(query)
        except ValueError:
            print ("Warning; failed conversion of",query,"to float")
            return 0

        # A missing or malformed stored rating scores 0, like the other numeric fields.
        try:
            rating=float(book_info)
        except ValueError:
            return 0

        if query<0 or query>5:
            print("Warning: value ouside of the range 0-5")

        return custom_tanh(rating,query)

class ratingCount():
    """Rating-count field: scraping from a book page and scoring against a minimum count.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"ratingCount"``."""
        return "ratingCount"

    @staticmethod
    def parse(soup):
        """Return the ``content`` attribute of ``<meta itemprop="ratingCount">`` as a string.

        Returns ``""`` when the meta tag is missing instead of failing with
        AttributeError.
        """
        meta=soup.find("meta",itemprop="ratingCount")
        if meta is None:
            return ""
        return str(meta.get("content"))

    @staticmethod
    def score(book_info,query):
        """Return the fraction of the requested count that the book reaches, capped at 1.

        Returns 0 when either value is not an integer. A requested count of zero
        or less is met by every book, so it scores 1 (this also avoids a division
        by zero and negative scores).
        """
        try:
            book_info=int(book_info)
            query=int(query)
        except ValueError:
            return 0

        if query<=0:
            return 1

        return min(1,book_info/query)
        
class reviewCount():
    """Review-count field: scraping from a book page and scoring against a minimum count.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"reviewCount"``."""
        return "reviewCount"

    @staticmethod
    def parse(soup):
        """Return the ``content`` attribute of ``<meta itemprop="reviewCount">`` as a string.

        Returns ``""`` when the meta tag is missing instead of failing with
        AttributeError.
        """
        meta=soup.find("meta",itemprop="reviewCount")
        if meta is None:
            return ""
        return str(meta.get("content"))

    @staticmethod
    def score(book_info,query):
        """Return the fraction of the requested count that the book reaches, capped at 1.

        Returns 0 when either value is not an integer. A requested count of zero
        or less is met by every book, so it scores 1 (this also avoids a division
        by zero and negative scores).
        """
        try:
            book_info=int(book_info)
            query=int(query)
        except ValueError:
            return 0

        if query<=0:
            return 1

        return min(1,book_info/query)

class Plot():
    """Plot/description field: scraping from a book page.

    The methods never use instance state, so they are static: they can be called
    on the class or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"Plot"``."""
        return "Plot"

    @staticmethod
    def parse(soup):
        """Return the book description as one line of text, or ``""`` if unavailable.

        The description is the last ``<span>`` inside ``<div id="descriptionContainer">``
        when that container holds one or two spans; any other layout, or a missing
        container, yields ``""``. A leading ``<i>`` note mentioning an ISBN, an
        edition or a librarian's note is removed first. Whitespace runs are
        collapsed and backslashes are dropped.
        """
        def headingToRemove(description):
            heading=description.find("i")
            if heading is None:
                return
            heading_text=heading.text.lower()
            forbidden_strings=["isbn","edition","librarian's note"]
            # Remove the note once; it must not be touched again after decompose().
            if any(string in heading_text for string in forbidden_strings):
                heading.decompose()

        container=soup.find("div", id="descriptionContainer")
        if container is None:
            return ""

        spans=container.find_all("span")
        if len(spans) not in (1,2):
            return ""

        description=spans[-1]
        headingToRemove(description)
        return " ".join(description.text.split()).replace("\\","")

    @staticmethod
    def score(book_info,query):
        """Not implemented: always returns None.

        Plot is not part of the ``fields`` list; the search engine scores the plot
        through ``TfIdfScore_plot`` instead.
        """
        return None


class NumberOfPages():
    """Page-count field: scraping from a book page and scoring against a target length.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"NumberOfPages"``."""
        return "NumberOfPages"

    @staticmethod
    def parse(soup):
        """Return the first word of the first ``<span itemprop="numberOfPages">``.

        Returns ``""`` when the span is missing or contains only whitespace
        (the latter used to raise IndexError).
        """
        page_spans=soup.find_all('span', itemprop="numberOfPages")
        if not page_spans:
            return ""
        words=page_spans[0].contents[0].replace('\n', '').strip().split()
        return words[0] if words else ""

    @staticmethod
    def score(book_info,query):
        """Return ``exp(-((pages - query) / 60) ** 2)``, a bell curve centred on ``query``.

        The score is 1 for an exact match and e**-1 at a distance of 60 pages.
        Returns 0 when either value is not an integer.
        """
        try:
            n_pages=int(book_info)
        except ValueError:
            return 0

        try:
            query=int(query)
        except ValueError:
            print("This should not be printed")
            return 0

        exponent=-(1/60*(n_pages-query))**2
        return np.exp(exponent)
    
class Publishing_Date():
    """Publication-date field: scraping from a book page and scoring against a target year.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"Publishing_Date"``."""
        return "Publishing_Date"

    @staticmethod
    def parse(soup):
        """Return the publication date text of the page, or ``""`` if none is found.

        Only the first ``<div class="row">`` whose text starts with "Published" is
        used. A "(first published ...)" date takes precedence; otherwise the date
        is made of up to three words following "Published", stopping at "by"
        (the date format varies between pages). A row holding nothing but
        "Published" yields ``""`` instead of raising IndexError.
        """
        elements = [e for e in soup.find_all("div", class_="row") if re.match(r'Published',e.text.strip())]
        if not elements:
            return ""

        # Prefer the "first published" date when the page has one.
        first_published=re.findall(r'(?<=\(first published )(.*?)(?=\))',elements[0].text)
        if first_published:
            return first_published[0]

        # Otherwise read the words after "Published" until "by" (at most three).
        words=elements[0].text.split()
        date_words=[]
        for word in words[1:4]:
            if word=="by":
                break
            date_words.append(word)
        return " ".join(date_words)

    @staticmethod
    def score(book_info,query):
        """Return a bell-curve score in [0, 1] centred on the requested year.

        The year is the last space-separated word of ``book_info``. The score is
        ``exp(-(2 * (year - query) / (2030 - query) ** 0.75) ** 2)``, so the curve
        gets wider the further the requested year lies before 2030. For requests
        of 2030 or later the distance to 2030 is clamped to 1, which avoids a
        division by zero and complex-valued results. Returns 0 when either value
        is not an integer.
        """
        get_date=book_info.split(" ")[-1]
        try:
            get_date=int(get_date)
        except ValueError:
            return 0

        try:
            query=int(query)
        except ValueError:
            print("This should not be printed")
            return 0

        # Identical to the plain (2030-query) for every request before 2030.
        years_to_reference=max(2030-query,1)
        exponent=-((2/years_to_reference**0.75)*(get_date-query))**2
        return np.exp(exponent)

        
class Characters():
    """Characters field: scraping from a book page and similarity scoring against a query.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name ():
        """Return the field identifier, ``"Characters"``."""
        return "Characters"

    @staticmethod
    def parse(soup):
        """Collect the text of every link whose href starts with ``/characters/``.

        Whitespace runs inside each name are collapsed. The list is handed to
        ``print_list``, which is not defined in this notebook (presumably it comes
        from ``script_module`` and joins the items with ';', since ``score`` splits
        on ';' -- TODO confirm).
        """
        character_links=soup.find_all("a",{'href': re.compile(r'^/characters/')})
        characters=[" ".join(item.text.split()) for item in character_links]
        return print_list(characters)

    @staticmethod
    def score(book_info,query):
        """Return the mean, over the book's characters, of the best match in the query.

        ``book_info`` is split on ';' and ``query`` on ','; both are lower-cased.
        Each pair is compared by Jaccard similarity of their character sets, and
        every book character keeps its best similarity. A pair of two empty
        strings has an undefined similarity (division by zero) and is counted as 0.
        """
        book_names=book_info.lower().split(";")
        query_names=query.lower().split(",")

        def similarity(left,right):
            left_chars=set(left.strip())
            right_chars=set(right.strip())
            if not left_chars and not right_chars:
                return 0
            return 1-nltk.jaccard_distance(left_chars,right_chars)

        # str.split always yields at least one item, so max() and the division are safe.
        best_matches=[max(similarity(book_name,wanted) for wanted in query_names)
                      for book_name in book_names]

        return sum(best_matches)/len(best_matches)

class Setting():
    """Setting (places) field: scraping from a book page and similarity scoring.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"Setting"``."""
        return "Setting"

    @staticmethod
    def parse(soup):
        """Collect the places listed in the info box that links to ``/places/``.

        The last ``<div class="infoBoxRowItem">`` containing such a link is
        scanned: every ``<a>`` starts a new entry and every ``<span>`` is appended
        to the entry before it. A ``<span>`` met before any ``<a>`` is skipped
        (it used to raise IndexError). The "…more"/"…less" toggles are stripped,
        empty entries and duplicates are dropped (first occurrence kept), and the
        list is handed to ``print_list``, which is not defined in this notebook
        (presumably from ``script_module`` -- TODO confirm).
        """
        place_link={'href': re.compile(r'^/places/')}
        places_box=None
        for box in soup.find_all("div",class_="infoBoxRowItem"):
            if box.find("a",place_link):
                places_box=box

        places=[]
        if places_box is not None:
            for element in places_box.find_all():
                if element.name=="a":
                    places.append(" ".join(element.text.split()))
                elif element.name=="span" and places:
                    places[-1]+=" "+(" ".join(element.text.split()))

        # Workaround: the expand/collapse toggles are links too, so their labels are stripped here.
        places=[place.replace("…more","").replace("…less","").strip() for place in places]
        places=list(dict.fromkeys([place for place in places if place]))
        return print_list(places)

    @staticmethod
    def score(book_info,query):
        """Return the mean, over the book's places, of the best match in the query.

        ``book_info`` is split on ';' and ``query`` on ','; both are lower-cased.
        Each pair is compared by Jaccard similarity of their character sets, and
        every book place keeps its best similarity. A pair of two empty strings
        has an undefined similarity (division by zero) and is counted as 0.
        """
        book_places=book_info.lower().split(";")
        query_places=query.lower().split(",")

        def similarity(left,right):
            left_chars=set(left.strip())
            right_chars=set(right.strip())
            if not left_chars and not right_chars:
                return 0
            return 1-nltk.jaccard_distance(left_chars,right_chars)

        # str.split always yields at least one item, so max() and the division are safe.
        best_matches=[max(similarity(book_place,wanted) for wanted in query_places)
                      for book_place in book_places]

        return sum(best_matches)/len(best_matches)


class Url():
    """Url field: scraping from a book page; scoring is not supported.

    The methods never use instance state, so they are static: they can be called
    on the class (as the search engine does) or on an instance.
    """

    @staticmethod
    def name():
        """Return the field identifier, ``"Url"``."""
        return "Url"

    @staticmethod
    def parse(soup):
        """Return the target of the first ``link href="..."`` in the page source.

        The page is searched as text via ``str(soup)``. Returns ``""`` when no
        such link exists instead of raising IndexError.
        """
        links=re.findall(r'(?<=link href=")(.*?)(?=")',str(soup))
        return links[0] if links else ""

    @staticmethod
    def score(book_info,query):
        """Warn that Url cannot be scored and return 0.

        The warning used to announce a default of 1 while 0 was returned; the
        message now states the value that is actually returned.
        """
        print("Warning: a score for Url is not implemented. Returning default value of 0")
        return 0

In [66]:
def TfIdfScore_plot(query):
    """Rank the books whose plot contains every query stem by cosine similarity.

    The query is lower-cased, tokenized, stripped of stop words and stemmed.
    Stems missing from ``vocabulary`` are reported and ignored. Only books listed
    in ``inverted_index`` for all remaining stems are scored. Each book's stored
    weights (second item of every ``BookTokens[book]`` pair) are compared with a
    0/1 vector marking which of the book's terms occur in the query.

    Returns a list of ``(book, cosine_similarity)`` tuples sorted in ascending
    tuple order (i.e. by book); the list is empty when no stem is usable.
    """
    tokens = tokenizer.tokenize(query.lower())
    candidate_stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Keep only the stems that exist in the vocabulary.
    known_stems = []
    for stem in candidate_stems:
        try:
            vocabulary[stem]
        except KeyError:
            print("Stem",stem,"not found. It will be ignored.")
        else:
            known_stems.append(stem)

    # Drop duplicate stems while keeping the order of first appearance.
    known_stems = list(dict.fromkeys(known_stems))

    # Books that contain every stem of the query.
    candidates = set()
    if known_stems:
        candidates = inverted_index[vocabulary[known_stems[0]]]
        for stem in known_stems:
            candidates = candidates.intersection(inverted_index[vocabulary[stem]])
    matching_books = sorted(candidates)

    # term id -> IDF of the query term; only the keys are consulted below.
    query_tfIdf = dict(sorted(
        (vocabulary[stem], np.log(N_doc/vocabulary2[vocabulary[stem]]))
        for stem in known_stems
    ))

    BooksWithScore = []
    for book in matching_books:
        book_terms = BookTokens[book]
        doc_vector = np.array([term[1] for term in book_terms])
        query_vector = np.array([1 if term[0] in query_tfIdf else 0 for term in book_terms])
        cos_similarity = 1-distance.cosine(doc_vector,query_vector)
        heappush(BooksWithScore, (book,cos_similarity))

    BooksWithScore.sort()
    return BooksWithScore

In [67]:
# Field classes the search engine can filter on; the lower-cased name() of each
# one is the keyword the user types. Plot is not listed here.
fields = [
    bookTitle,
    bookSeries,
    bookAuthors,
    ratingValue,
    ratingCount,
    reviewCount,
    NumberOfPages,
    Publishing_Date,
    Characters,
    Setting,
    Url,
]

def SearchEngine3(fields_list):
    """Interactive search: rank books by plot similarity plus per-field scores.

    Two lines are read from the user: the plot keywords, and a comma-separated
    list of ``<field> <value>`` filters. Field names are matched case-insensitively
    against the lower-cased ``name()`` of the classes in ``fields_list``; unknown,
    empty, repeated or value-less entries only produce a warning.

    Every book returned by ``TfIdfScore_plot`` gets, on top of its plot score,
    the sum of ``score(stored value, requested value)`` for each requested field,
    with stored values read from ``articles/article_<book>.tsv``. The resulting
    ``(book, score)`` list is printed in descending score order; nothing is
    returned.

    fields_list: classes exposing static ``name()`` and ``score(book_info, query)``.
    """
    print("Write the plot keywords")
    plot_input=input()
    # The example uses real field keywords; the old one ("numpages", "title") named fields that do not exist.
    print("Write other parameters, specifing the field separated by a ','. Example: numberofpages 235, booktitle hunger")
    text_input=input()

    # Use the parameter, not the module-level `fields`, so any field list works.
    field_names=[field.name().lower() for field in fields_list]

    query_dictionary={}
    for raw_entry in text_input.split(","):
        words=raw_entry.split()
        if not words:
            print("Warning: empty field name entered")
            continue

        # Keys are stored lower-cased so they match field_names whatever casing was typed.
        field_key=words[0].lower()
        if field_key not in field_names:
            print("Warning: the field",'"'+words[0]+'"', "does not exist!")
            continue

        if field_key in query_dictionary:
            print("Warning: field",words[0],"inserted more than once. Only the first value will be used")
            continue

        if len(words)>1:
            query_dictionary[field_key]=" ".join(words[1:])
        else:
            print("Warning: the field",words[0],"has no specified value")

    # Original author's note: tested up to this point.

    print(query_dictionary)

    requested_fields=[(fields_list[field_names.index(key)],value)
                      for key,value in query_dictionary.items()]

    Book_with_full_score=[]
    for book,plot_score in TfIdfScore_plot(plot_input):
        fields_score=0
        with open('articles/article_' + str(book) +'.tsv', 'r', encoding="utf-8") as file:
            for row in csv.DictReader(file, delimiter = '\t'):
                for field,value in requested_fields:
                    fields_score+=field.score(row[field.name()],value)
        Book_with_full_score.append((book,fields_score+plot_score))

    print(sorted(Book_with_full_score,key=lambda x:-x[1]))
                    

                    
           

In [68]:
# Start an interactive search over every supported field (prompts for input twice).
SearchEngine3(fields_list=fields)

Write the plot keywords


KeyboardInterrupt: Interrupted by user

In [None]:
def scoreNumPages(book,parameter):
    """Unfinished placeholder for a page-count scorer.

    The original cell held only this signature, which is a SyntaxError. The stub
    now fails explicitly when called. NumberOfPages.score already implements a
    page-count score; presumably it supersedes this function -- TODO confirm and
    delete this cell if so.
    """
    raise NotImplementedError("scoreNumPages is an unfinished stub")

In [None]:
# Scratch value: a two-item tuple.
a = 1, 2

In [None]:

# Scratch check: character-set Jaccard similarity between a short title and a longer one.
w1 = set("harry potter")
w2 = set("harry potter: order of the phoenix. Commented by X")
w3 = set("harry potter 2")  # built for comparison but not used below

# Last expression of the cell, so the notebook displays the similarity of w1 and w2.
1 - nltk.jaccard_distance(w1, w2)

In [None]:
# Scratch check: print the Publishing_Date column of articles 1 to 23999.
for i in range(1, 24000):
    article_path = 'articles/article_' + str(i) + '.tsv'
    with open(article_path, 'r', encoding="utf-8") as file:
        for row in csv.DictReader(file, delimiter='\t'):
            print(row["Publishing_Date"])

In [None]:
# Demonstration: int("") raises ValueError, which is why the score methods wrap
# their int() conversions in try/except. The error is caught and shown so the
# notebook still runs from top to bottom.
try:
    int("")
except ValueError as conversion_error:
    empty_string_error = str(conversion_error)
    print("ValueError:", empty_string_error)