## Evaluation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
from bs4 import BeautifulSoup
import re
import collections

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import collections
import string
import random
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
import unicodedata
import html

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the book summaries into a DataFrame whose columns are named 'title' and 'summary'.
# NOTE(review): the path is a hard-coded absolute location — confirm it exists in the runtime before running.
corpus=pd.read_csv('/content/booksummaries/booksummaries.txt-SA_input.txt',names=['title','summary'])

In [None]:
corpus.head(2)

Unnamed: 0,title,summary
0,Animal Farm,"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"Alex, a teenager living in near-future Englan..."


In [None]:
# corpus['text'][0]

## Preprocessing

In [None]:
def remove_special_chars(text):
    """Lower-case ``text`` and undo a fixed list of escape/markup leftovers.

    The literal replacements are applied one after another in the order
    listed, then ``html.unescape`` is run on the result, and finally every
    run of two or more spaces is collapsed into a single space.
    """
    # Order matters: later entries see the output of earlier ones.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    cleaned = text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    cleaned = html.unescape(cleaned)
    return re.sub(r'  +', ' ', cleaned)
# Clean both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(remove_special_chars)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,"old major, the old boar on the manor farm, ca..."


In [None]:
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character dropped.

    The string is NFKD-normalized first, so an accented letter is split into
    its base letter plus a combining mark; encoding to ASCII with
    ``'ignore'`` then discards whatever is not ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# Drop non-ASCII characters from both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(remove_non_ascii)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,"old major, the old boar on the manor farm, ca..."


In [None]:
#remove_repeating_char
def remove_repeating_char(text):
    """Squash elongated character runs without damaging ordinary words.

    A run of three or more identical non-whitespace characters is reduced to
    two ("soooo" -> "soo"), so legitimate double letters ("calls", "food",
    "meeting") are left intact. Runs of an identical whitespace character
    other than a newline are still reduced to a single character.
    """
    # Collapsing every repeat to one character would turn "calls" into "cals".
    text = re.sub(r'(\S)\1{2,}', r'\1\1', text)
    # Newlines are excluded so blank lines are left untouched.
    return re.sub(r'([^\S\n])\1+', r'\1', text)

# Run remove_repeating_char over both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(remove_repeating_char)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,"old major, the old boar on the manor farm, ca..."


In [None]:
#processPost for applying all functions
def processPost(text):
    """Strip social-media style artefacts from ``text``.

    In order: every ``@token`` is replaced by a space, every ``www.*`` or
    ``http(s)://*`` token is replaced by a space, and a leading ``#`` is
    removed from ``#token`` (the token itself is kept).
    """
    # Raw strings keep \s and \. as regex escapes; in a normal string literal
    # they are invalid escape sequences and trigger a warning.
    text = re.sub(r'@[^\s]+', ' ', text)
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

# Run processPost over both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(processPost)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,"old major, the old boar on the manor farm, ca..."


In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` deleted."""
    punctuation = set(string.punctuation)
    return ''.join(char for char in text if char not in punctuation)

# Strip punctuation from both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(remove_punctuation)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,old major the old boar on the manor farm cals...


In [None]:
# replace all numbers with ''
def replace_numbers(text):
    """Delete every run of digits from ``text``; nothing is put in its place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Remove digit runs from both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(replace_numbers)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,old major the old boar on the manor farm cals...


In [None]:
# strip leading and trailing whitespace from each text
def remove_whitespaces(text):
    """Return ``text`` without its leading and trailing whitespace."""
    stripped = text.strip()
    return stripped
    
# Apply the whitespace stripper defined in this cell. The cell previously
# re-applied replace_numbers, so remove_whitespaces never ran on the corpus.
corpus['summary'] = corpus['summary'].apply(remove_whitespaces)
corpus['title'] = corpus['title'].apply(remove_whitespaces)
corpus.head(1)

Unnamed: 0,title,summary
0,animal farm,old major the old boar on the manor farm cals...


**Vector space model**

In [None]:
def vectorization(corpus):
  """Fit a TfidfVectorizer on ``corpus.summary`` and vectorize those same summaries.

  Returns a pair ``(vectorizer, doc_vector)``: the fitted vectorizer and the
  result of transforming the summaries with it.
  """
  summaries = list(corpus.summary)
  tfidf = TfidfVectorizer()
  tfidf.fit(summaries)
  return tfidf, tfidf.transform(summaries)
vectorizer,doc_vector=vectorization(corpus)

In [None]:
query="Old Major "


In [None]:
def ranking(query,doc_vector,vectorizer,number_of_result_doc):
  """Rank the vectorized documents against ``query`` by cosine similarity.

  query: raw query string.
  doc_vector: document vectors produced by ``vectorizer``.
  vectorizer: fitted vectorizer used to transform the cleaned query.
  number_of_result_doc: how many top results to return.

  Returns a list of ``(row_position, similarity)`` tuples sorted by
  similarity in descending order and truncated to ``number_of_result_doc``.
  """
  from sklearn.metrics.pairwise import cosine_similarity

  # Clean the query with the same functions, in the same order, that the
  # notebook applies to the summaries before vectorizing them. Otherwise a
  # query term can differ from the cleaned form stored in the vocabulary.
  cleaning_steps = (
      remove_special_chars,
      remove_non_ascii,
      remove_repeating_char,
      processPost,
      remove_punctuation,
      replace_numbers,
      remove_whitespaces,
  )
  for clean in cleaning_steps:
    query = clean(query)
  query_vector = vectorizer.transform([query])

  scores = cosine_similarity(doc_vector, query_vector).flatten()
  # sorted() is stable, so documents with equal scores keep their row order.
  ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
  return ranked[:number_of_result_doc]
similarity=ranking(query,doc_vector,vectorizer,10)
similarity

[(2551, 0.20811477316849483),
 (16430, 0.1670191892424984),
 (5077, 0.15739290973896516),
 (2659, 0.15091432798056872),
 (11567, 0.1468925688493193),
 (15534, 0.14684964036083253),
 (15250, 0.13064960032166695),
 (9099, 0.12458188853002733),
 (3500, 0.11618840169618186),
 (1787, 0.1160056738906852)]

In [None]:
# In order to evaluate a search engine over this data we need two things:
# 1. Queries 
# 2. Relevance Judgements

# QUERIES dictionary with {query_id: query}
# queries = dict(enumerate(list(train['title'])))

queries = dict(enumerate([
    'Old Major',
    #'The Plague',
    # 'little boat'
    
]))

# RELEVANCE JUDGEMENTS list with [(query_id, document_id, judgement), ...] judgement 0 | 1 with 1 = relevant
qrels = [
         (0, 2551, 1),
         (0,16430,1),
         (0,5077,1),
         (0,2659,0),
         (0,11567,0),
         (0,15534,0),
         (0,15250,0),
         (0,9099,0),
         (0,3500,0),
         (0,1787,0),
         (0,108,1),
         (0,70,1),
         (0,1049,0),

]

In [None]:
def precision_at_k(query_id, k=7):
  """Precision over the judged documents among the top-``k`` TF-IDF results.

  query_id: key into ``queries``; also the first field of each ``qrels`` row.
  k: number of top-ranked documents to inspect.

  Returns ``(TP, FP, precision)`` where TP / FP count retrieved documents
  judged relevant (1) / non-relevant (0) in ``qrels``. Retrieved documents
  with no judgement are ignored. When no retrieved document has a judgement
  the precision is reported as 0.0 instead of NaN.
  """
  # Look the query text up by id rather than reading a module-level `query`,
  # which only holds the right value while an outer loop happens to bind it.
  doc_ranking = ranking(queries[query_id],doc_vector,vectorizer,k)
  retrieved = [doc[0] for doc in doc_ranking[:k]] # keep only the document id, not the score

  TP = sum((query_id, doc, 1) in qrels for doc in retrieved)
  FP = sum((query_id, doc, 0) in qrels for doc in retrieved)

  judged = TP + FP
  precision = TP / judged if judged else 0.0

  return TP, FP, precision

In [None]:
def f1_score_at_k(query_id, k=7):
  """F1 score at cut-off ``k`` for one query, built on ``precision_at_k``.

  Recall is TP divided by the number of ``qrels`` rows for ``query_id`` whose
  judgement is 1. Returns 0.0 when the score is undefined (no relevant
  judgements, or precision and recall do not sum to a positive number).
  """
  TP, FP, precision = precision_at_k(query_id, k)

  # Judgements are 0/1, so their sum is the number of relevant documents.
  relevant_docs = sum(judgement for q_id, _, judgement in qrels if q_id == query_id)
  recall = TP / relevant_docs if relevant_docs else 0.0

  # `not (x > 0)` is also true for NaN, so an undefined precision yields 0.0.
  if not (precision + recall) > 0:
    return 0.0
  return (2 * precision * recall) / (precision + recall)

In [None]:
# Evaluate every entry of `queries` at the same cut-off k and print its precision and F1 score.
k = 7
for query_id, query in queries.items():
  tp, fp, precision = precision_at_k(query_id, k=k)
  f1_score = f1_score_at_k(query_id, k=k)
  print('retrieved query "{}" with Precision@{} = {} and F1-score = {}'.format(query, k, precision, f1_score))


retrieved query "Old Major" with Precision@7 = 0.42857142857142855 and F1-score = 0.5


**End of vector space model**

In [None]:
# split each sentence based on some features
def text2words(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize both text columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(text2words)
corpus.head(1)

Unnamed: 0,title,summary
0,"[animal, farm]","[old, major, the, old, boar, on, the, manor, f..."


In [None]:
# Start from NLTK's English stop-word list (a fresh list, replacing any earlier `stop_words`).
stop_words = stopwords.words('english')

# Extend it with every single lowercase ASCII letter and every single digit character,
# followed by a hand-picked list of extra words. The list may contain duplicates
# (e.g. "see" is listed twice below).
stop_words.extend([i for i in string.ascii_lowercase])
stop_words.extend([i for i in string.digits])
# stop_words.extend([i for i in string.punctuation])
stop_words.extend(["about", "across", "after", "all", "also", "an", "and", "another", "added","any", "are", "as", "at", "basically", "be", "because", 'become', "been", "before", "being", "between","both",
 "but", "by","came","can","come","could","did","do","does","each","else","every","either","especially", "for","from","get","given","gets",'give','gives',"got","goes","had","has","have","he","her","here",
 "him","himself","his","how","if","in","into","is","it","its","just","lands","like","make","making", "made", "many","may","me","might","more","most","much","must","my","never","provide","provides", "perhaps",
 "no","now","of","on","only","or","other", "our","out","over","re","said","same","see","should","since","so","some","still","such","seeing", "see", "take","than","that","the","their","them","then","there",
"these","they","this","those","through","to","too","under","up","use","using","used", "underway", "very","want","was","way","we","well","were","what","when","where","which","while","whilst","who","will","with",
"would","you","your", 'etc', 'via', 'eg']) 

In [None]:
# remove stop words like the he she it for 
def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in the module-level ``stop_words``.

    Token order is preserved. The stop-word list is turned into a set once
    per call so each token is checked with a hash lookup rather than a scan
    of the whole list.
    """
    stop_set = set(stop_words)
    return [word for word in words if word not in stop_set]

# Filter stop words out of both token columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(remove_stopwords)
corpus.head(1)

Unnamed: 0,title,summary
0,"[animal, farm]","[old, major, old, boar, manor, farm, cals, ani..."


In [None]:
# corpus['text'] = corpus['text'].apply(lambda x: [item for item in x if len(item) > 2])
# corpus.head(1)

In [None]:
# return root of each word if word is Noun
def lemmatize_words(words):
    """Lemmatize each token in ``words`` with a ``WordNetLemmatizer``.

    Returns a new list holding one lemmatized token per input token, in the
    same order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize both token columns, summary first and then title.
for column in ('summary', 'title'):
    corpus[column] = corpus[column].apply(lemmatize_words)
corpus.head(1)

Unnamed: 0,title,summary
0,"[animal, farm]","[old, major, old, boar, manor, farm, cals, ani..."


In [None]:
# Turn each title's token list back into one space-separated string.
corpus['title'] = corpus['title'].apply(' '.join)

In [None]:
# # return root of each word if word is Verb
# def lemmatize_verbs(words):
#     """Lemmatize verbs in text"""

#     lemmatizer = WordNetLemmatizer()
#     return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])

# corpus['text'] = corpus['text'].apply(lemmatize_verbs)
# corpus.head(1)

In [None]:
corpus['summary'][0]

['old',
 'major',
 'old',
 'boar',
 'manor',
 'farm',
 'cals',
 'animal',
 'farm',
 'meting',
 'compare',
 'human',
 'parasite',
 'teach',
 'animal',
 'revolutionary',
 'song',
 'beast',
 'england',
 'major',
 'dy',
 'two',
 'young',
 'pig',
 'snowbal',
 'napoleon',
 'asume',
 'comand',
 'turn',
 'dream',
 'philosophy',
 'animal',
 'revolt',
 'drive',
 'drunken',
 'iresponsible',
 'mr',
 'jones',
 'farm',
 'renaming',
 'animal',
 'farm',
 'adopt',
 'seven',
 'comandments',
 'animalism',
 'important',
 'al',
 'animal',
 'equal',
 'snowbal',
 'atempts',
 'teach',
 'animal',
 'reading',
 'writing',
 'fod',
 'plentiful',
 'farm',
 'run',
 'smothly',
 'pig',
 'elevate',
 'position',
 'leadership',
 'set',
 'aside',
 'special',
 'fod',
 'item',
 'ostensibly',
 'personal',
 'health',
 'napoleon',
 'take',
 'pup',
 'farm',
 'dog',
 'train',
 'privately',
 'napoleon',
 'snowbal',
 'strugle',
 'leadership',
 'snowbal',
 'anounces',
 'plan',
 'build',
 'windmil',
 'napoleon',
 'dog',
 'chase',
 '

In [None]:
# Work on the first 2000 rows of the corpus only.
corpus_ = corpus[:2000]

In [None]:
# corpus_['summary'] = corpus_['summary'].apply(lambda x:x.split())

In [None]:
# from sklearn.model_selection import train_test_split
# train, test =train_test_split(corpus_,test_size=0.025,random_state=42)
# Fixed split by position: the first 1950 rows of corpus_ are train, the remaining rows are test.
train, test =corpus_[:1950],corpus_[1950:]

In [None]:
corpus.head()

Unnamed: 0,title,summary
0,animal farm,"[old, major, old, boar, manor, farm, cals, ani..."
1,clockwork orange,"[alex, tenager, living, nearfuture, england, l..."
2,plague,"[text, plague, divided, five, part, town, oran..."
3,enquiry concerning human understanding,"[argument, enquiry, proceds, series, increment..."
4,fire upon dep,"[novel, posit, space, around, milky, divided, ..."


## Vectorize and get the vocabulary: implementing CountVectorizer by hand

In [None]:
# Flatten the per-document token lists of the training set into one long list.
words = []
for summary_tokens in train['summary']:
    words.extend(summary_tokens)

In [None]:
# Unique vocabulary, sorted so that its order (and the column order of any
# frame built from it) is the same on every run; set iteration order for
# strings is not stable across interpreter sessions.
words_uni = sorted(set(words))

In [None]:
len(words)

691987

In [None]:
len(words_uni)

49809

In [None]:
def calculateBOW(wordset,l_doc):
  """Build the bag-of-words count dictionary of one document.

  wordset: iterable of vocabulary words; each becomes a key with count 0.
  l_doc: iterable of the document's tokens.

  Returns a dict mapping every word of ``wordset`` to the number of times it
  occurs in ``l_doc``. Tokens of ``l_doc`` that are not in ``wordset`` are
  added as extra keys with their counts, in order of first appearance.
  """
  tf_diz = dict.fromkeys(wordset,0)
  # Counter tallies the document in one pass; calling l_doc.count() for every
  # token rescans the whole document each time (quadratic in its length).
  tf_diz.update(collections.Counter(l_doc))
  return tf_diz

In [None]:
# One bag-of-words dictionary per training summary, in row order.
bow = [calculateBOW(words_uni, lst) for lst in train['summary']]

In [None]:
df_bow = pd.DataFrame(bow)
df_bow.head()

Unnamed: 0,valhala,roleplaying,toman,jenet,rapturously,macrolife,daytrip,wade,heretic,canadian,...,galaphile,grey,belringer,shanaras,clue,ogier,treasured,brake,unique,comprised
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
vocabulary = np.array(df_bow.columns)

In [None]:
# # vectorize and get vocabulary
# vectorizer = CountVectorizer(stop_words='english')
# documents_vectorized = vectorizer.fit_transform(train['summary'])
# vocabulary = vectorizer.get_feature_names_out()

# print ('We have a {} document corpus with a {} term vocabulary'.format(*documents_vectorized.shape))

# # This is what it looks like
# df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
# doc_ids = df.index.values
# df[:5]

In [None]:
queries = dict(enumerate(list(train['title'])))

In [None]:
# In order to evaluate a search engine over this data we need two things:
# 1. Queries 
# 2. Relevance Judgements

# QUERIES dictionary with {query_id: query}
# queries = dict(enumerate(list(train['title'])))

queries = dict(enumerate([
    'ship wreck',
    'sail sailing ship',
    # 'little boat'
    
]))

# RELEVANCE JUDGEMENTS list with [(query_id, document_id, judgement), ...] judgement 0 | 1 with 1 = relevant
qrels = [
         (0, 9, 1),
         (0,11,1),
         (0,12,1),
         (0,13,0),
         (0,14,0),
         (0,0,0),
         (0,24,0),
         (0,17,0),
         (0,3,0),
         (0,4,0),
         (0,108,1),
         (0,70,1),
         (0,1049,0),

         (1, 0, 0),
         (1, 1, 0),
         (1, 2, 1),
         (1, 3, 1),
         (1, 5, 0),
         (1, 7, 1),
         (1, 21, 0),
         (1, 23, 1),
         (1, 1380, 0),
         (1, 1441, 1),
         (1, 1452, 1),
]

In [None]:
# According to the Relevance Judgements, Is the document entitled 'The Ship is Ready' relevant to the query 'sail sailing ship' ?

# What about the document 'The Wind and the Sea'. Is it relevant to the query 'ship wreck' according to our Relevance Judgements ? 

In [None]:
def retrieve_ranking(query, bm25_df):
  """Rank the rows of ``bm25_df`` for ``query`` by summed per-term scores.

  query: whitespace-separated query terms.
  bm25_df: DataFrame with one row per document and one column per term.

  Returns a list of ``(index_value, score)`` tuples sorted by score in
  descending order. Query terms that are not columns of ``bm25_df`` are
  skipped; if none of the terms is known, every document scores 0.
  """
  # split() without an argument drops the empty strings that split(' ')
  # produces for repeated, leading or trailing spaces; filtering on the
  # columns avoids a KeyError for out-of-vocabulary terms.
  q_terms = [term for term in query.split() if term in bm25_df.columns]
  score_q_d = bm25_df[q_terms].sum(axis=1)
  return sorted(zip(bm25_df.index.values,score_q_d.values), key = lambda tup:tup[1], reverse=True)

In [None]:
def precision_at_k(query_id, k=5):
  """Precision over the judged documents among the top-``k`` TF-IDF results.

  query_id: key into ``queries``; also the first field of each ``qrels`` row.
  k: number of top-ranked documents to inspect.

  Returns ``(TP, FP, precision)`` where TP / FP count retrieved documents
  judged relevant (1) / non-relevant (0) in ``qrels``. Retrieved documents
  with no judgement are ignored. When no retrieved document has a judgement
  the precision is reported as 0.0 instead of NaN.
  """
  # Look the query text up by id rather than reading a module-level `query`,
  # which may still hold a value left over from an earlier cell.
  doc_ranking = ranking(queries[query_id],doc_vector,vectorizer,k)
  retrieved = [doc[0] for doc in doc_ranking[:k]] # keep only the document id, not the score

  # Precision = TruePositives / (TruePositives + FalsePositives)
  TP = sum((query_id, doc, 1) in qrels for doc in retrieved)
  FP = sum((query_id, doc, 0) in qrels for doc in retrieved)

  judged = TP + FP
  precision = TP / judged if judged else 0.0

  return TP, FP, precision

In [None]:
tp, fp, precision = precision_at_k(1, k=5)

  if sys.path[0] == '':


In [None]:
def f1_score_at_k(query_id, k=5):
  """F1 score at cut-off ``k`` for one query, built on ``precision_at_k``.

  Recall is TP divided by the number of ``qrels`` rows for ``query_id`` whose
  judgement is 1. Returns 0.0 when the score is undefined (no relevant
  judgements, or precision and recall do not sum to a positive number).
  """
  TP, FP, precision = precision_at_k(query_id, k)

  # Judgements are 0/1, so their sum is the number of relevant documents.
  relevant_docs = sum(judgement for q_id, _, judgement in qrels if q_id == query_id)
  recall = TP / relevant_docs if relevant_docs else 0.0

  # f1 = (2 * Precision * Recall) / (Precision + Recall)
  # `not (x > 0)` is also true for NaN, so an undefined precision yields 0.0.
  if not (precision + recall) > 0:
    return 0.0
  return (2 * precision * recall) / (precision + recall)

In [None]:
# Loop over every entry of `queries`, compute precision and F1 at the same cut-off k, and print them.
k = 5
for query_id, query in queries.items():
  tp, fp, precision = precision_at_k(query_id, k=k)
  f1_score = f1_score_at_k(query_id, k=k)
  print('retrieved query "{}" with Precision@{} = {} and F1-score = {}'.format(query, k, precision, f1_score))


  if sys.path[0] == '':
  if sys.path[0] == '':


retrieved query "ship wreck" with Precision@5 = nan and F1-score = nan
retrieved query "sail sailing ship" with Precision@5 = nan and F1-score = nan


  if sys.path[0] == '':
  if sys.path[0] == '':


## Alternative method: 

In [None]:
def precision_at_k(query_id, k=5):
  """Precision over the judged documents among the top-``k`` ``retrieve_ranking`` results.

  query_id: key into ``queries``; also the first field of each ``qrels`` row.
  k: number of top-ranked documents to inspect.

  Returns ``(TP, FP, precision)``. TP / FP are the numbers of retrieved
  documents whose judgement in ``qrels`` is 1 / 0 for this query. When no
  retrieved document has a judgement the precision is 0.0 rather than a
  ZeroDivisionError.
  """
  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
  retrieved = {doc[0] for doc in doc_ranking[:k]} # keep only the document id, not the score

  # Split this query's judgements into relevant (1) and non-relevant (0) document ids.
  qrels_query = [qrel for qrel in qrels if qrel[0] == query_id]
  relevant_doc_ids = {qrel[1] for qrel in qrels_query if qrel[-1] == 1}
  non_relevant_doc_ids = {qrel[1] for qrel in qrels_query if qrel[-1] == 0}

  TP = len(retrieved & relevant_doc_ids)      # retrieved and judged relevant
  FP = len(retrieved & non_relevant_doc_ids)  # retrieved but judged non-relevant

  judged = TP + FP
  precision = TP / judged if judged else 0.0

  return TP, FP, precision

In [None]:
def f1_score_at_k(query_id, k=5):
  """F1 score over the judged documents among the top-``k`` ``retrieve_ranking`` results.

  query_id: key into ``queries``; also the first field of each ``qrels`` row.
  k: number of top-ranked documents to inspect.

  Precision is TP / (TP + FP) and recall is TP / (TP + FN), where FN counts
  documents judged relevant for the query that were not retrieved. Any ratio
  with a zero denominator is treated as 0.0, so the function returns 0.0
  instead of raising ZeroDivisionError.
  """
  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
  retrieved = {doc[0] for doc in doc_ranking[:k]} # keep only the document id, not the score

  # Split this query's judgements into relevant (1) and non-relevant (0) document ids.
  qrels_query = [qrel for qrel in qrels if qrel[0] == query_id]
  relevant_doc_ids = {qrel[1] for qrel in qrels_query if qrel[-1] == 1}
  non_relevant_doc_ids = {qrel[1] for qrel in qrels_query if qrel[-1] == 0}

  TP = len(retrieved & relevant_doc_ids)      # retrieved and judged relevant
  FP = len(retrieved & non_relevant_doc_ids)  # retrieved but judged non-relevant
  FN = len(relevant_doc_ids - retrieved)      # judged relevant but not retrieved

  precision = TP / (TP + FP) if (TP + FP) else 0.0
  recall = TP / (TP + FN) if (TP + FN) else 0.0

  if precision + recall == 0:
    return 0.0
  return 2 * precision * recall / (precision + recall)

In [None]:
# # To retrieve and calculate accuracy metrics for each query lets loop over them
# k = 5
# for query_id, query in queries.items():
#   tp, fp, precision = precision_at_k(query_id, k=k)
#   f1_score = f1_score_at_k(query_id, k=k)
#   print('retrieved query "{}" with Precision@{} = {} and F1-score = {}'.format(query, k, precision, f1_score))
