In [2]:
import re
import os
import csv
import nltk
import pickle
import numpy as np
import heapq
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from heapq import heappush
# Corpus-size constant: used as the numerator of log(N_doc / vocabulary2[term_id])
# when the query-term weights are computed in section 2.2.
# Presumably the total number of indexed books — TODO confirm it matches the pickled data.
N_doc=26543

In [3]:
# Text-normalisation helpers shared by the query cells below.
tokenizer = RegexpTokenizer(r'[a-z]+') # Matches runs of lowercase letters only, so digits are dropped; use r'[a-z0-9]+' to keep numbers as tokens.
stop_words = set(stopwords.words("english"))  # NLTK English stop-word list; a set so the query filter does O(1) lookups
stemmer= PorterStemmer()  # applied to every non-stop-word query token before vocabulary lookup

To see the code used to create these pkl files, look at the "DataStructures" notebook.

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source (here: the "DataStructures" notebook).
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-built search structures, loaded in the same order as before.
# inverted_index: looked up by term id; each value supports set intersection.
inverted_index = _load_pickle('inverted_index_1.pkl')
# vocabulary: looked up by stem, yields the term id used by the other structures.
vocabulary = _load_pickle('vocabulary.pkl')
# vocabulary2: looked up by term id; used as the denominator of the log ratio
# in section 2.2 (presumably the document frequency — TODO confirm).
vocabulary2 = _load_pickle('vocabulary2.pkl')
# tfIdf_index: not referenced by the query cells visible here.
tfIdf_index = _load_pickle('tfIdf_index.pkl')
# BookTokens: looked up by book id; each entry is indexed as [0] term id, [1] weight.
BookTokens = _load_pickle('BookTokens.pkl')

## 2.1 Conjunctive query

In [4]:
# Conjunctive (AND) query: read a free-text query from stdin and print every
# book that appears in the posting set of *all* the query's known stems.

# Normalise the query: lowercase, tokenize, drop stop words, stem.
query = input()
query = tokenizer.tokenize(query.lower())
query_stem_test = [stemmer.stem(word) for word in query if word not in stop_words]

# Keep only stems that exist in the vocabulary, warning about the others.
# Each stem is kept once (first occurrence wins), as in the ranked query cell;
# repeating a stem cannot change the result of an intersection.
query_stems = []
for word in query_stem_test:
    if word not in vocabulary:
        print("Stem",word,"not found. It will be ignored.")
    elif word not in query_stems:
        query_stems.append(word)

# Intersect the posting sets of all stems. An empty query (or one whose stems
# are all unknown) matches nothing.
matching_ids = set()
if query_stems:
    postings = [inverted_index[vocabulary[stem]] for stem in query_stems]
    # Copy the first posting set so the index itself is never aliased or mutated.
    matching_ids = set(postings[0]).intersection(*postings[1:])

matching_books = sorted(matching_ids)

# Print title, plot and URL of each match, in ascending book-id order.
for book_id in matching_books:
    with open('articles/article_' + str(book_id) +'.tsv', 'r', encoding="utf-8") as file:
        for row in csv.DictReader(file, delimiter = '\t'):
            print("BookTitle:",row["bookTitle"])
            print("Plot:")
            print(row["Plot"])
            print("Url:",row["Url"])
            print()

k
BookTitle: Treasure Island
Plot:
For sheer storytelling delight and pure adventure, Treasure Island has never been surpassed. From the moment young Jim Hawkins first encounters the sinister Blind Pew at the Admiral Benbow Inn until the climactic battle for treasure on a tropic isle, the novel creates scenes and characters that have fired the imaginations of generations of readers. Written by a superb prose stylist, a master of both action and atmosphere, the story centers upon the conflict between good and evil - but in this case a particularly engaging form of evil. It is the villainy of that most ambiguous rogue Long John Silver that sets the tempo of this tale of treachery, greed, and daring. Designed to forever kindle a dream of high romance and distant horizons, Treasure Island is, in the words of G. K. Chesterton, 'the realization of an ideal, that which is promised in its provocative and beckoning map; a vision not only of white skeletons but also green palm trees and sapphire

BookTitle: Attraction
Plot:
He is everything she doesn’t want, so why does she want him so badly? From the New York Times Bestselling Author Penny Reid One week. Private beach. Invisible girl. Jerk-faced bully. What’s the worst that could happen? Kaitlyn Parker has no problem being the invisible girl, which is why she finds herself hiding in various cabinets and closets all over her college campus. Despite her best efforts, she can’t escape the notice of Martin Sandeke—bad boy, jerkface bully, and the universe’s hottest, wealthiest, and most unobtainable bachelor—who also happens to be Kaitlyn’s chemistry lab partner. Kaitlyn might be the only girl who isn’t interested in exploiting his stunning rower’s build, chiseled features, and family's billionaire fortune. Kaitlyn wants Martin for his brain, specifically to tabulate findings of trace elements in surface water. When Kaitlyn saves Martin from a nefarious plot, Martin uses the opportunity to push Kaitlyn out of her comfort zone: spr

BookTitle: Barracuda: The Unauthorized Biography of Sarah Palin
Plot:
Award-winning author Aladdin Elaasar's latest; "BARRACUDA: The Unauthorized Biography of Sarah Palin" is a timely book that reveals how McCain's choice of 'the Barracuda', a.k.a. Sarah Palin, as his running mate, opened the flood gates of the media to controversy and speculations. Will Palin play a role in the 2012 presidential elections? Will she be the next and the first female American president, or at least vice-president?
Url: https://www.goodreads.com/book/show/9537345-barracuda

BookTitle: My Avenging Angel
Plot:
To save her life, he must break a covenant-and lose his heart. An Angels and Demons story. It's Victoria Bloom's twenty-fifth birthday. But is she out celebrating? Oh, no. She's in a stuffy old attic with the Three Stooges-a.k.a. her so-called spirit guides. There's a demon who wants her dead, the same one that killed her mother two decades ago. No worries, say the Stooges. All she has to do is summon

## 2.2 Conjunctive query and ranking score

In [5]:
# Conjunctive (AND) query with ranking: read a free-text query from stdin,
# find the books that contain all known query stems, score each one by cosine
# similarity and print the best TOP_K.

TOP_K = 10  # number of best-scoring books to print

# Normalise the query: lowercase, tokenize, drop stop words, stem.
query = input()
query = tokenizer.tokenize(query.lower())
query_stem_test = [stemmer.stem(word) for word in query if word not in stop_words]

# Keep only stems that exist in the vocabulary, warning about the others.
# Each stem is kept once, in order of first occurrence.
query_stems = []
for word in query_stem_test:
    if word not in vocabulary:
        print("Stem",word,"not found. It will be ignored.")
    elif word not in query_stems:
        query_stems.append(word)

# Intersect the posting sets of all stems. An empty query (or one whose stems
# are all unknown) matches nothing.
matching_ids = set()
if query_stems:
    postings = [inverted_index[vocabulary[stem]] for stem in query_stems]
    # Copy the first posting set so the index itself is never aliased or mutated.
    matching_ids = set(postings[0]).intersection(*postings[1:])

matching_books = sorted(matching_ids)

# Query-side weights: term id -> log(N_doc / vocabulary2[term id]), stored in
# ascending term-id order.
# NOTE(review): only the *keys* of this dict are used for scoring below (the
# query vector is binary); the weights themselves are computed but never
# applied. Confirm whether that is intended before changing the ranking.
query_tfIdf = dict(sorted(
    (vocabulary[word], np.log(N_doc/vocabulary2[vocabulary[word]]))
    for word in query_stems
))

# Score every matching book. Both vectors are laid out over the book's own
# entries: the document side carries entry[1] (the stored weight — presumably
# tf-idf, verify against the DataStructures notebook) and the query side is 1
# where entry[0] is a query term id and 0 elsewhere.
BooksWithScore = []
for book in matching_books:
    entries = BookTokens[book]
    doc_vector = np.array([entry[1] for entry in entries])
    query_vector = np.array([1 if entry[0] in query_tfIdf else 0 for entry in entries])
    cos_similarity = 1 - distance.cosine(doc_vector, query_vector)

    # A zero-norm vector yields NaN, and NaN breaks the tuple ordering that
    # nlargest relies on; rank such books last instead.
    if np.isnan(cos_similarity):
        cos_similarity = 0.0

    BooksWithScore.append((cos_similarity, book))

# nlargest accepts any iterable, so there is no need to maintain a heap
# while collecting the scores.
top_k_books = heapq.nlargest(TOP_K, BooksWithScore)

# Print title, plot, URL and score of each result, best score first.
for score, book_id in top_k_books:
    with open('articles/article_' + str(book_id) +'.tsv', 'r', encoding="utf-8") as file:
        for row in csv.DictReader(file, delimiter = '\t'):
            print("BookTitle:",row["bookTitle"])
            print("Plot:")
            print(row["Plot"])
            print("Url:",row["Url"])
            print("Score:",score)
            print()

hunger games
BookTitle: The Hunger Games Tribute Guide
Plot:
The New York Times bestselling Hunger Games is now a major motion picture—and here is the ultimate guide to the all the tributes in the 74th annual Hunger Games! Here is the ultimate guide to the twenty-fourth annual Hunger Games. Follow the tributes' journey from the reaping to the Games, with an exclusive look at all the highlights along the way—the trip to the Capitol, the Tribute Parade, the stations of the Training Center, and the interviews with Caesar Flickerman. Plus you'll find profiles of President Snow and Seneca Crane, portraits of each tribute, and detailed information on each district's industry. This unique guide to the tributes contains never-before-seen photos and quotes from the film, and is a must-have for any Hunger Games fan.
Url: https://www.goodreads.com/book/show/13027304-the-hunger-games-tribute-guide
Score: 0.47299175550025985

BookTitle: The Hunger Games Trilogy Boxset
Plot:
The extraordinary, groun