In [44]:
import pandas as pd 
import numpy as np
import re

In [45]:
df = pd.read_csv("/content/wiki_movie_plots_deduped_new.csv", usecols=["Title", "Plot"])
df.head()

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...


In [46]:
df.drop(df.index[15000:], inplace=True)
df.head()
df.to_csv('wikipedia_dataset.csv', index=False)

In [47]:
df["Plot"] = df["Plot"].str.lower()
df.head()

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"a bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"the moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"the film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,the earliest known adaptation of the classic f...


Creating a Mapping from index to Title:

In [48]:
title_mapping = dict()
for ind in range(0,len(df["Title"])):
  title_mapping[ind]=df["Title"][ind]

In [49]:
i=0
for key in title_mapping.keys():
  i=i+1
  if i>=10:
    break
  print(key," ",title_mapping.get(key))

0   Kansas Saloon Smashers
1   Love by the Light of the Moon
2   The Martyred Presidents
3   Terrible Teddy, the Grizzly King
4   Jack and the Beanstalk
5   Alice in Wonderland
6   The Great Train Robbery
7   The Suburbanite
8   The Little Train Robbery


In [52]:
def clean_summary(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [53]:
df["Plot"] = df["Plot"].apply(lambda x: clean_summary(str(x)))
df.head()

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,a bartender is working at a saloon serving dri...
1,Love by the Light of the Moon,the moon painted with a smiling face hangs ove...
2,The Martyred Presidents,the film just over a minute long is composed o...
3,"Terrible Teddy, the Grizzly King",lasting just seconds and consisting of two sho...
4,Jack and the Beanstalk,the earliest known adaptation of the classic f...


In [54]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [55]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.SnowballStemmer('english')
lemmatizer = nltk.stem.WordNetLemmatizer()

TOKENIZING THE CORPUS

In [56]:
corpus = df["Plot"].values
corpus

array(['a bartender is working at a saloon serving drinks to customers after he fills a stereotypically irish mans bucket with beer carrie nation and her followers burst inside they assault the irish man pulling his hat over his eyes and then dumping the beer over his head the group then begin wrecking the bar smashing the fixtures mirrors and breaking the cash register the bartender then sprays seltzer water in nations face before a group of policemen appear and order everybody to leave',
       'the moon painted with a smiling face hangs over a park at night a young couple walking past a fence learn on a railing and look up the moon smiles they embrace and the moons smile gets bigger they then sit down on a bench by a tree the moons view is blocked causing him to frown in the last scene the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better',
       'the film just over a minute long is composed of two shots in t

In [57]:
def tokenizing_corpus(review):
    return list(map(word_tokenize, review))

In [58]:
corpus_tokenized = tokenizing_corpus(corpus)

In [59]:
for i in range(8):
    print(f"Doc {i} : {title_mapping.get(i)} -- {corpus_tokenized[i]}")

Doc 0 : Kansas Saloon Smashers -- ['a', 'bartender', 'is', 'working', 'at', 'a', 'saloon', 'serving', 'drinks', 'to', 'customers', 'after', 'he', 'fills', 'a', 'stereotypically', 'irish', 'mans', 'bucket', 'with', 'beer', 'carrie', 'nation', 'and', 'her', 'followers', 'burst', 'inside', 'they', 'assault', 'the', 'irish', 'man', 'pulling', 'his', 'hat', 'over', 'his', 'eyes', 'and', 'then', 'dumping', 'the', 'beer', 'over', 'his', 'head', 'the', 'group', 'then', 'begin', 'wrecking', 'the', 'bar', 'smashing', 'the', 'fixtures', 'mirrors', 'and', 'breaking', 'the', 'cash', 'register', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nations', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave']
Doc 1 : Love by the Light of the Moon -- ['the', 'moon', 'painted', 'with', 'a', 'smiling', 'face', 'hangs', 'over', 'a', 'park', 'at', 'night', 'a', 'young', 'couple', 'walking', 'past', 'a', 'fence', 'learn', 'on', 'a', 'railing'

REMOVING THE STOPWORDS

In [60]:
verify=[',',':',';','.','#','(',')','{','[',']',"}",'/','?']

In [61]:
def removing_stopwords(tokenized_review):
    _function = lambda review: [word for word in review if word not in stopwords and word not in verify]
    return list(map(_function, tokenized_review))

In [62]:
nostopword_corpus = removing_stopwords(corpus_tokenized)

In [67]:
for i in range(10):
    print(f"Doc {i}: {title_mapping.get(i)} -- {nostopword_corpus[i]}")

Doc 0: Kansas Saloon Smashers -- ['bartender', 'working', 'saloon', 'serving', 'drinks', 'customers', 'fills', 'stereotypically', 'irish', 'mans', 'bucket', 'beer', 'carrie', 'nation', 'followers', 'burst', 'inside', 'assault', 'irish', 'man', 'pulling', 'hat', 'eyes', 'dumping', 'beer', 'head', 'group', 'begin', 'wrecking', 'bar', 'smashing', 'fixtures', 'mirrors', 'breaking', 'cash', 'register', 'bartender', 'sprays', 'seltzer', 'water', 'nations', 'face', 'group', 'policemen', 'appear', 'order', 'everybody', 'leave']
Doc 1: Love by the Light of the Moon -- ['moon', 'painted', 'smiling', 'face', 'hangs', 'park', 'night', 'young', 'couple', 'walking', 'past', 'fence', 'learn', 'railing', 'look', 'moon', 'smiles', 'embrace', 'moons', 'smile', 'gets', 'bigger', 'sit', 'bench', 'tree', 'moons', 'view', 'blocked', 'causing', 'frown', 'last', 'scene', 'man', 'fans', 'woman', 'hat', 'moon', 'left', 'sky', 'perched', 'shoulder', 'see', 'everything', 'better']
Doc 2: The Martyred Presidents -

CASE FOLDING

In [68]:
def case_folding(tokenized_reviews):
    _function = lambda review: [word.lower() for word in review]
    return list(map(_function, tokenized_reviews))

In [69]:
nostopw_casefolded_corpus = case_folding(nostopword_corpus)

In [70]:
for i in range(10):
    print(f"Doc {i}: {title_mapping.get(i)} -- {nostopw_casefolded_corpus[i]}")

Doc 0: Kansas Saloon Smashers -- ['bartender', 'working', 'saloon', 'serving', 'drinks', 'customers', 'fills', 'stereotypically', 'irish', 'mans', 'bucket', 'beer', 'carrie', 'nation', 'followers', 'burst', 'inside', 'assault', 'irish', 'man', 'pulling', 'hat', 'eyes', 'dumping', 'beer', 'head', 'group', 'begin', 'wrecking', 'bar', 'smashing', 'fixtures', 'mirrors', 'breaking', 'cash', 'register', 'bartender', 'sprays', 'seltzer', 'water', 'nations', 'face', 'group', 'policemen', 'appear', 'order', 'everybody', 'leave']
Doc 1: Love by the Light of the Moon -- ['moon', 'painted', 'smiling', 'face', 'hangs', 'park', 'night', 'young', 'couple', 'walking', 'past', 'fence', 'learn', 'railing', 'look', 'moon', 'smiles', 'embrace', 'moons', 'smile', 'gets', 'bigger', 'sit', 'bench', 'tree', 'moons', 'view', 'blocked', 'causing', 'frown', 'last', 'scene', 'man', 'fans', 'woman', 'hat', 'moon', 'left', 'sky', 'perched', 'shoulder', 'see', 'everything', 'better']
Doc 2: The Martyred Presidents -

LEMMATIZING WORDS

In [71]:
def lemmatize_words(tokenized_reviews):
    function = lambda review: [lemmatizer.lemmatize(word) for word in review]
    return list(map(function, tokenized_reviews))

In [72]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [73]:
nostopw_casefolded_lemmatized_corpus = lemmatize_words(nostopw_casefolded_corpus)

In [74]:
for i in range(10):
    print(f"Doc {i}: {title_mapping.get(i)} -- {nostopw_casefolded_lemmatized_corpus[i]}")

Doc 0: Kansas Saloon Smashers -- ['bartender', 'working', 'saloon', 'serving', 'drink', 'customer', 'fill', 'stereotypically', 'irish', 'man', 'bucket', 'beer', 'carrie', 'nation', 'follower', 'burst', 'inside', 'assault', 'irish', 'man', 'pulling', 'hat', 'eye', 'dumping', 'beer', 'head', 'group', 'begin', 'wrecking', 'bar', 'smashing', 'fixture', 'mirror', 'breaking', 'cash', 'register', 'bartender', 'spray', 'seltzer', 'water', 'nation', 'face', 'group', 'policeman', 'appear', 'order', 'everybody', 'leave']
Doc 1: Love by the Light of the Moon -- ['moon', 'painted', 'smiling', 'face', 'hang', 'park', 'night', 'young', 'couple', 'walking', 'past', 'fence', 'learn', 'railing', 'look', 'moon', 'smile', 'embrace', 'moon', 'smile', 'get', 'bigger', 'sit', 'bench', 'tree', 'moon', 'view', 'blocked', 'causing', 'frown', 'last', 'scene', 'man', 'fan', 'woman', 'hat', 'moon', 'left', 'sky', 'perched', 'shoulder', 'see', 'everything', 'better']
Doc 2: The Martyred Presidents -- ['film', 'minu

Stemming:

In [75]:
def stem_words(tokenized_tweets):
    _function = lambda tweet: [stemmer.stem(word) for word in tweet]
    return list(map(_function, tokenized_tweets))

In [76]:
tokenized_nostopw_case_stem_corpus = stem_words(nostopw_casefolded_lemmatized_corpus)

In [77]:
for i in range(10):
    print(f"Doc {i}: {title_mapping.get(i)} -- {tokenized_nostopw_case_stem_corpus[i]}")

Doc 0: Kansas Saloon Smashers -- ['bartend', 'work', 'saloon', 'serv', 'drink', 'custom', 'fill', 'stereotyp', 'irish', 'man', 'bucket', 'beer', 'carri', 'nation', 'follow', 'burst', 'insid', 'assault', 'irish', 'man', 'pull', 'hat', 'eye', 'dump', 'beer', 'head', 'group', 'begin', 'wreck', 'bar', 'smash', 'fixtur', 'mirror', 'break', 'cash', 'regist', 'bartend', 'spray', 'seltzer', 'water', 'nation', 'face', 'group', 'policeman', 'appear', 'order', 'everybodi', 'leav']
Doc 1: Love by the Light of the Moon -- ['moon', 'paint', 'smile', 'face', 'hang', 'park', 'night', 'young', 'coupl', 'walk', 'past', 'fenc', 'learn', 'rail', 'look', 'moon', 'smile', 'embrac', 'moon', 'smile', 'get', 'bigger', 'sit', 'bench', 'tree', 'moon', 'view', 'block', 'caus', 'frown', 'last', 'scene', 'man', 'fan', 'woman', 'hat', 'moon', 'left', 'sky', 'perch', 'shoulder', 'see', 'everyth', 'better']
Doc 2: The Martyred Presidents -- ['film', 'minut', 'long', 'compos', 'two', 'shot', 'first', 'girl', 'sit', 'ba

Inverted Index:

In [78]:
inverted_index=dict()
for index in range(len(tokenized_nostopw_case_stem_corpus)):
  for word in tokenized_nostopw_case_stem_corpus[index]:
    if(word not in inverted_index):
      inverted_index[word] = list()
    inverted_index[word].append(index)

for key in inverted_index:
  inverted_index[key] = list(set(inverted_index[key]))
  

In [79]:
for key in list(inverted_index.keys())[:10]:
    print(f"Word: {key}\nDocument Indices: {inverted_index[key]}\n")

Word: bartend
Document Indices: [0, 12290, 11781, 522, 12812, 4624, 7184, 11280, 1047, 13337, 12319, 4642, 4134, 9769, 13865, 10813, 8766, 7231, 8775, 7759, 13391, 14927, 9298, 8790, 11350, 13398, 12381, 11870, 7271, 13928, 5737, 10859, 5229, 2159, 6767, 11887, 13425, 8307, 14966, 2685, 2178, 13444, 6797, 13454, 9875, 8858, 5792, 13482, 10924, 14000, 8371, 9916, 192, 10447, 11477, 13016, 9951, 8928, 14050, 6884, 231, 13032, 1772, 6380, 14070, 14072, 12025, 13050, 13568, 13058, 3331, 2820, 9492, 13087, 13103, 3891, 5433, 13633, 10569, 1357, 9556, 1371, 11614, 3428, 9060, 14692, 13673, 11122, 13686, 12151, 13182, 13699, 11652, 2953, 13197, 11662, 13204, 7069, 10656, 5026, 2468, 9142, 6583, 10678, 14782, 3009, 14273, 10180, 11717, 4555, 6603, 8140, 10192, 13267, 2522, 4064, 11760, 9201, 6642, 7667, 7166]

Word: work
Document Indices: [0, 8193, 8199, 8200, 9, 8201, 8206, 8211, 20, 8213, 26, 8218, 31, 8225, 36, 8232, 8234, 8237, 46, 8238, 48, 8239, 8251, 60, 8252, 62, 8254, 8255, 8262, 8263

defining an boolean function to retrive boolean data:

In [80]:
def parse_query(infix_tokens):
    
    precedence = {}
    precedence['NOT'] = 3
    precedence['AND'] = 2
    precedence['OR'] = 1
    precedence['('] = 0
    precedence[')'] = 0    

    output = []
    operator_stack = []

    for token in infix_tokens:
        if (token == '('):
            operator_stack.append(token)

        elif (token == ')'):
            operator = operator_stack.pop()
            while operator != '(':
                output.append(operator)
                operator = operator_stack.pop()

        elif (token in precedence):
            if (operator_stack):
                current_operator = operator_stack[-1]
                while (operator_stack and precedence[current_operator] > precedence[token]):
                    output.append(operator_stack.pop())
                    if (operator_stack):
                        current_operator = operator_stack[-1]
            operator_stack.append(token) # add token to stack
        else:
            output.append(token.lower())

    while (operator_stack):
        output.append(operator_stack.pop())

    return output

In [81]:
def boolean_query(query, inverted_index):
    query = query.strip()
    query_tokens = query.split()
    boolean_query = parse_query(query_tokens)
        
    result_stack = list()
    for idx, token in enumerate(boolean_query):
        if token not in ["AND", "NOT", "OR"]:
            result = set(inverted_index[token])
        else:
            if token in ['AND', 'OR']:
                right_operand = result_stack.pop()
                left_operand = result_stack.pop()
                
                if token == 'AND':
                    operation = set.intersection
                else:
                    operation = set.union
                
                result = operation(left_operand, right_operand)
                
            else:
                operand = result_stack.pop()
                complement_document_ids = inverted_index[boolean_query[idx-1]]
                result = list()
                for word in inverted_index:
                    result.extend([_id for _id in inverted_index[word] if _id not in complement_document_ids])
                result = set(result)
                
        result_stack.append(result)
    
    return result_stack.pop()

retrive the boolean data for "the" and "mob" words:

In [84]:
document_ids = boolean_query("irish AND man", inverted_index)
j=0
for ids in document_ids:
  j=j+1
  if j>=10:
    break
  print(f'Doc {ids} -- {title_mapping[ids]}')


Doc 0 -- Kansas Saloon Smashers
Doc 12929 -- The Boxer
Doc 12040 -- The Music of Chance
Doc 1545 -- Laughter in Hell
Doc 395 -- The Magic Cup
Doc 8844 -- Prime Cut
Doc 11791 -- Far and Away
Doc 14225 -- Charlie's Angels: Full Throttle
Doc 8215 -- The Gnome-Mobile


definiging an function to create positional index:

In [85]:
def construct_positional_posting_list(tokenized_corpus):
    positional_index = dict()
    for tweet_id, tweet in enumerate(tokenized_corpus):
        for token_id, token in enumerate(tweet):
            if token not in positional_index:
                positional_index[token] = dict()
            if tweet_id not in positional_index[token]:
                positional_index[token][tweet_id] = list()
            positional_index[token][tweet_id].append(token_id)
    
    for token in positional_index:
        for tweet_id in positional_index[token]:
            positional_index[token][tweet_id] = sorted(positional_index[token][tweet_id])
        items = list(positional_index[token].items())
        items.sort(key=lambda x: x[0])
        for k, v in items:
            positional_index[token][k] = v
            
    return positional_index

calling positional_index function to return positional indexes:

In [86]:
positional_index = construct_positional_posting_list(tokenized_nostopw_case_stem_corpus)

printing positional indexes:

In [87]:
for key in list(positional_index.keys())[:10]:
    print(f"Word: {key}\nText & Token Indices: {positional_index[key]}\n")

Word: bartend
Text & Token Indices: {0: [0, 36], 192: [25], 231: [64, 94, 97, 112], 522: [54], 1047: [130, 141], 1357: [22], 1371: [207], 1772: [19], 2159: [68], 2178: [171], 2468: [223], 2522: [110], 2685: [55, 71, 75], 2820: [4], 2953: [240], 3009: [95], 3331: [259], 3428: [107], 3891: [59, 88], 4064: [381], 4134: [210], 4555: [175], 4624: [61], 4642: [351], 5026: [30], 5229: [82, 155], 5433: [202], 5737: [249], 5792: [50], 6380: [56, 63], 6583: [61], 6603: [198], 6642: [16], 6767: [25], 6797: [68], 6884: [9], 7069: [81], 7166: [229], 7184: [40], 7231: [74], 7271: [317], 7667: [215], 7759: [80], 8140: [16, 440, 452], 8307: [29], 8371: [512, 527], 8766: [50, 85, 151, 165], 8775: [8], 8790: [196], 8858: [91], 8928: [71], 9060: [337], 9142: [110], 9201: [62], 9298: [334, 384, 391, 402], 9492: [112], 9556: [195], 9769: [206], 9875: [66], 9916: [189], 9951: [184, 196], 10180: [33], 10192: [78], 10447: [178], 10569: [537], 10656: [47, 98], 10678: [72, 82], 10813: [260], 10859: [13], 10924:

checking phrase query with positional index:

In [88]:
def positional_intersect(a1, a2, K):
    answer = list()
    i = 0
    j = 0
    while i < len(a1) and j < len(a2):
        document_id_a1 = list(a1.keys())[i]
        document_id_a2 = list(a2.keys())[j]
        
        if document_id_a1 == document_id_a2:
            l = list()
            aa1 = a1[document_id_a1]
            aa2 = a2[document_id_a2]
            
            k = 0
            while k < len(aa1):
                m = 0
                while m < len(aa2):
                    distance = aa2[m] - aa1[k]
                    if distance == K:
                        l.append(m)
                    m += 1
                
                for ps in l:
                    distance = (aa2[ps] - aa1[k])
                    if distance != K:
                        l.remove(ps)
                        
                for ps in l:
                    answer.append((document_id_a1, aa1[k], aa2[ps]))
                
                k += 1
            
            i += 1
            j += 1
        
        elif document_id_a1 < document_id_a2:
            i += 1
        else:
            j += 1
    
    return answer

In [89]:
to = {
    1: [7,18,33,72,86,231],
    2: [1,17,74,222,255],
    4: [8,16,190,429,433],
    5: [363,367],
    7: [13,23,191]
}
        
be = {
    1: [17,25],
    4: [17,191,291,430,434],
    5: [14,19,101]
}

calling positional index function:

In [90]:
positional_intersect(to, be, 1)

[(4, 16, 17), (4, 190, 191), (4, 429, 430), (4, 433, 434)]

Handling wild card queries using inverted index:

In [93]:
def search(query):
    if '*' in query:
        query = query.replace('*', '')
        result = []
        for word in inverted_index:
            if query in word:
                result.extend(inverted_index[word])
        return result
    elif '"' in query:
        query = query.replace('"', '')
        result = []
        for word in inverted_index:
            if query == word:
                result.extend(inverted_index[word])
        return result
    else:
        return inverted_index[query]

In [94]:
ans = search('bartend*')
for ids in ans[:20]:
  print(f'Doc {ids}: {title_mapping.get(i)}')

Doc 0: The Night Before Christmas
Doc 12290: The Night Before Christmas
Doc 11781: The Night Before Christmas
Doc 522: The Night Before Christmas
Doc 12812: The Night Before Christmas
Doc 4624: The Night Before Christmas
Doc 7184: The Night Before Christmas
Doc 11280: The Night Before Christmas
Doc 1047: The Night Before Christmas
Doc 13337: The Night Before Christmas
Doc 12319: The Night Before Christmas
Doc 4642: The Night Before Christmas
Doc 4134: The Night Before Christmas
Doc 9769: The Night Before Christmas
Doc 13865: The Night Before Christmas
Doc 10813: The Night Before Christmas
Doc 8766: The Night Before Christmas
Doc 7231: The Night Before Christmas
Doc 8775: The Night Before Christmas
Doc 7759: The Night Before Christmas


Handling Phrase Queries:

In [95]:

def preprocess(doc):
    # tokenize the document
    tokens = nltk.word_tokenize(doc.lower())

    # stem and lemmatize each token
    processed_tokens = []
    for token in tokens:
        stem = stemmer.stem(token)
        lemma = lemmatizer.lemmatize(stem)
        processed_tokens.append(lemma)

    # return the processed tokens
    return processed_tokens

def phrase_query(positional_index, query):
    # preprocess the query
    processed_query = preprocess(query)

    # initialize a list of candidate document IDs
    candidate_docs = list(positional_index[processed_query[0]].keys())

    # iterate over each token in the processed query
    for i in range(1, len(processed_query)):
        # get the candidate document IDs for the current token
        token_docs = list(positional_index[processed_query[i]].keys())

        # initialize a new list of candidate document IDs
        new_candidate_docs = []

        # iterate over each candidate document ID
        for doc_id in candidate_docs:
            # check if the candidate document contains the current token
            if doc_id in token_docs:
                # iterate over each position of the current token in the candidate document
                for pos in positional_index[processed_query[i]][doc_id]:
                    # check if the current position follows a previous position of the previous token
                    if pos == positional_index[processed_query[i-1]][doc_id][-1] + 1:
                        # add the candidate document to the new list of candidate documents
                        new_candidate_docs.append(doc_id)
                        break

        # set the candidate documents to the new list of candidate documents
        candidate_docs = new_candidate_docs

    # return the final list of candidate document IDs
    return candidate_docs


In [96]:
query = "irish man"

results = phrase_query(positional_index,query)

for ids in results[:20]:
  print(f'Doc {ids}: {title_mapping.get(i)}')


Doc 0: The Night Before Christmas


4. Retrieve relevant text using similarity index

Using Jaccard Similarity:

In [98]:
import math
from collections import Counter

def get_jaccard_index(doc, query_set):
    doc_set = set(doc)
    intersection = len(doc_set.intersection(query_set))
    union = len(doc_set.union(query_set))
    return intersection/union

# input query
query = "irish man"

# preprocessing query

query_processed = set(preprocess(query))

# retrieve relevant documents
relevant_docs = []
for ind in range(len(tokenized_nostopw_case_stem_corpus)):
    jaccard_index = get_jaccard_index(tokenized_nostopw_case_stem_corpus[ind], query_processed)
    if jaccard_index > 0:
        relevant_docs.append((ind, jaccard_index))
relevant_docs = sorted(relevant_docs, key=lambda x: x[1], reverse=True)

# print relevant documents
for ind, jaccard_index in relevant_docs[:10]:
    print(f'Doc {ind} : {title_mapping.get(ind)} -- {tokenized_nostopw_case_stem_corpus[ind]} -- {jaccard_index}')


Doc 3713 : You're a Lucky Fellow, Mr. Smith -- ['woman', 'marri', 'man', 'fulfil', 'condit'] -- 0.16666666666666666
Doc 968 : Manslaughter -- ['wealthi', 'woman', 'run', 'kill', 'man', 'automobil', 'accid'] -- 0.125
Doc 1107 : East Lynne -- ['trophi', 'wife', 'stodgi', 'man', 'wealth', 'yearn', 'interest', 'life'] -- 0.1111111111111111
Doc 1799 : This Side of Heaven -- ['person', 'profession', 'problem', 'eventu', 'drive', 'man', 'attempt', 'suicid'] -- 0.1111111111111111
Doc 2719 : Zaza -- ['glamor', 'femal', 'singer', 'colbert', 'affair', 'marri', 'man', 'marshal'] -- 0.1111111111111111
Doc 3883 : Oh, What a Night -- ['man', 'keep', 'secret', 'young', 'niec', 'intern', 'jewel', 'thief'] -- 0.1111111111111111
Doc 4495 : Key Witness -- ['man', 'run', 'away', 'avoid', 'suspicion', 'murder', 'end', 'troubl'] -- 0.1111111111111111
Doc 7965 : The Dirty Game -- ['man', 'tell', 'three', 'differ', 'spi', 'mission', 'took', 'part'] -- 0.1111111111111111
Doc 11046 : Puss in Boots -- ['film', 'p

Using Cosine Similarity:

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nostopw_casefolded_lemmatized_corpus

query = "smile face"

vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform([' '.join(document) for document in nostopw_casefolded_lemmatized_corpus])
query_vector = vectorizer.transform([query])

similarities = cosine_similarity(query_vector, document_vectors)[0]
indices = similarities.argsort()[::-1]

for i in indices[:10]:
    print(f'Document {i} : {title_mapping.get(i)} -- {nostopw_casefolded_lemmatized_corpus[i]} -- Cosine Similarity --> {similarities[i]}')

Document 1 : Love by the Light of the Moon -- ['moon', 'painted', 'smiling', 'face', 'hang', 'park', 'night', 'young', 'couple', 'walking', 'past', 'fence', 'learn', 'railing', 'look', 'moon', 'smile', 'embrace', 'moon', 'smile', 'get', 'bigger', 'sit', 'bench', 'tree', 'moon', 'view', 'blocked', 'causing', 'frown', 'last', 'scene', 'man', 'fan', 'woman', 'hat', 'moon', 'left', 'sky', 'perched', 'shoulder', 'see', 'everything', 'better'] -- Cosine Similarity --> 0.2601756974362717
Document 14761 : Smile -- ['katie', 'teenage', 'girl', 'malibu', 'california', 'find', 'throe', 'growing', 'facing', 'parent', 'boyfriend', 'sexuality', 'privileged', 'life', 'half', 'world', 'away', 'rural', 'china', 'lin', 'share', 'birthday', 'katie', 'face', 'much', 'different', 'reality', 'facial', 'deformity', 'life', 'life', 'fear', 'shame', 'father', 'daniel', 'devoted', 'life', 'hope', 'dream', 'circumstance', 'change', 'one', 'day', 'opportunity', 'come', 'discovery', 'worldwide', 'doctor', 'gift', 

5. Retrieve relevant text using liklelihood language model:

Using Probabilistic(inverse-document-frequency) Model:

In [100]:
import math

# Example input query
query = "irish man"

# preprocessing query

query = list(preprocess(query))

# Calculate document frequencies
doc_freq = {}
for doc in tokenized_nostopw_case_stem_corpus:
    for word in set(doc):
        if word in doc_freq:
            doc_freq[word] += 1
        else:
            doc_freq[word] = 1

# Calculate inverse document frequencies
num_docs = len(tokenized_nostopw_case_stem_corpus)
inv_doc_freq = {}
for word in doc_freq:
    inv_doc_freq[word] = math.log(num_docs / doc_freq[word])

# Calculate likelihood scores for each document
doc_scores = []
for doc in tokenized_nostopw_case_stem_corpus:
    score = 0
    for word in query:
        if word in doc:
            score += inv_doc_freq[word]
    doc_scores.append(score)

# Sort documents by score and print top results
results = sorted(zip(tokenized_nostopw_case_stem_corpus, doc_scores), key=lambda x: x[1], reverse=True)
for i in range(min(len(results), 10)):
    print(f"Document {i}: {results[i][0]}, Score --> {results[i][1]}")

Document 0: ['bartend', 'work', 'saloon', 'serv', 'drink', 'custom', 'fill', 'stereotyp', 'irish', 'man', 'bucket', 'beer', 'carri', 'nation', 'follow', 'burst', 'insid', 'assault', 'irish', 'man', 'pull', 'hat', 'eye', 'dump', 'beer', 'head', 'group', 'begin', 'wreck', 'bar', 'smash', 'fixtur', 'mirror', 'break', 'cash', 'regist', 'bartend', 'spray', 'seltzer', 'water', 'nation', 'face', 'group', 'policeman', 'appear', 'order', 'everybodi', 'leav'], Score --> 5.743771482872565
Document 1: ['set', 'san', 'francisco', 'earli', 'film', 'revolv', 'around', 'amarilli', 'mari', 'pickford', 'daughter', 'widow', 'scrubwoman', 'amarilli', 'proud', 'hard', 'work', 'irish', 'famili', 'take', 'care', 'five', 'roughhous', 'brother', 'engag', 'bartend', 'terri', 'mcgowan', 'william', 'scott', 'get', 'job', 'cigarett', 'girl', 'cafe', 'fire', 'unfair', 'caus', 'lose', 'job', 'theater', 'scrubwoman', 'work', 'cigarett', 'girl', 'meet', 'gordon', 'phillip', 'norman', 'kerri', 'handsom', 'wealthi', 'fr

Language Likelihood Model-2 (A Machine Language Model)

In [101]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist, LidstoneProbDist
from nltk.lm import Vocabulary, MLE, Laplace, Lidstone

In [102]:
def preprocess_docs(corpus):
    processed_docs = []
    for doc in corpus:
        # tokenization
        tokens = word_tokenize(doc)
        # remove numbers and single-character tokens
        processed_doc = [token.lower() for token in tokens if not token.isnumeric() and len(token) > 1]
        processed_docs.append(processed_doc)
    return processed_docs

In [103]:
def train_likelihood_language_model(processed_docs, smoothing='mle', gamma=0.1):
    # create vocabulary
    words = [word for doc in processed_docs for word in doc]
    vocab = Vocabulary(words, unk_cutoff=1)
    
    # create frequency distribution
    freq_dist = FreqDist(words)
    
    # create probability distribution
    if smoothing == 'mle':
        prob_dist = MLE(freq_dist)
    elif smoothing == 'laplace':
        prob_dist = Laplace(freq_dist)
    elif smoothing == 'lidstone':
        prob_dist = LidstoneProbDist(freq_dist, gamma, bins=len(vocab))
    else:
        raise ValueError('Invalid smoothing method.')
    
    # create language model
    lm = nltk.lm.models.Lidstone(order=2, gamma=gamma, vocabulary=vocab)
    lm.fit([list(nltk.ngrams(doc, 2)) for doc in processed_docs])

    
    return lm

In [104]:
def retrieve_relevant_docs(query, processed_docs, lm, num_docs=10):
    # preprocess query
    query = query.lower().split()
    
    # retrieve relevant docs
    doc_scores = {}
    for doc_id, doc in enumerate(processed_docs):
        score = 0
        for word in query:
            if word in lm.vocab:
                score += lm.score(word, doc)
        doc_scores[doc_id] = score
    
    # sort docs by relevance score
    relevant_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:num_docs]
    
    # get content of relevant docs
    content = []
    for doc_id, score in relevant_docs:
        doc = tokenized_nostopw_case_stem_corpus[doc_id]
        content.append((doc_id, score, doc))
    
    return content


In [105]:
lm = train_likelihood_language_model(tokenized_nostopw_case_stem_corpus, smoothing='mle', gamma=0.1)

In [None]:
query = 'film show'
relevant_docs = retrieve_relevant_docs(query, tokenized_nostopw_case_stem_corpus, lm, num_docs=10)

In [107]:
for doc_id, score, doc in relevant_docs:
    print(f'Relevance score: {score:.64f}\n{title_mapping.get(doc_id)} -- {doc}\n')

Relevance score: 0.0000363352288211035007441801480965182236104737967252731323242188
Kansas Saloon Smashers -- ['bartend', 'work', 'saloon', 'serv', 'drink', 'custom', 'fill', 'stereotyp', 'irish', 'man', 'bucket', 'beer', 'carri', 'nation', 'follow', 'burst', 'insid', 'assault', 'irish', 'man', 'pull', 'hat', 'eye', 'dump', 'beer', 'head', 'group', 'begin', 'wreck', 'bar', 'smash', 'fixtur', 'mirror', 'break', 'cash', 'regist', 'bartend', 'spray', 'seltzer', 'water', 'nation', 'face', 'group', 'policeman', 'appear', 'order', 'everybodi', 'leav']

Relevance score: 0.0000363352288211035007441801480965182236104737967252731323242188
Love by the Light of the Moon -- ['moon', 'paint', 'smile', 'face', 'hang', 'park', 'night', 'young', 'coupl', 'walk', 'past', 'fenc', 'learn', 'rail', 'look', 'moon', 'smile', 'embrac', 'moon', 'smile', 'get', 'bigger', 'sit', 'bench', 'tree', 'moon', 'view', 'block', 'caus', 'frown', 'last', 'scene', 'man', 'fan', 'woman', 'hat', 'moon', 'left', 'sky', 'perch

Advanced Search

Relevance Feedback Using Rocchio Algorithm

Using Implicit Feedback & 5 iterations:

In [108]:
import math
from collections import Counter

def cosine_similarity(doc, query):
    # Normalize the term frequency vector for the document and the query
    magnitude_doc = math.sqrt(sum([value ** 2 for value in doc.values()]))
    magnitude_query = math.sqrt(sum([value ** 2 for value in query.values()]))
    normalized_doc = {word: value / magnitude_doc for word, value in doc.items()}
    normalized_query = {word: value / magnitude_query for word, value in query.items()}

    # Compute the cosine similarity between the normalized document and the normalized query
    common_words = set(normalized_doc).intersection(set(normalized_query))
    dot_product = sum([normalized_doc[word] * normalized_query[word] for word in common_words])
    return dot_product

def relevance_feedback(documents, query, number_of_iterations = 5, alpha = 0.5 , beta = 0.5):
    # Create a dictionary to keep track of the term frequency of each word in each document
    word_frequency = [Counter(doc) for doc in documents]

    # Create a dictionary to keep track of the document frequency of each word
    document_frequency = Counter(word for doc in documents for word in set(doc))

    # Create a dictionary to keep track of the inverse document frequency of each word
    inverse_document_frequency = {word: math.log(len(documents) / document_frequency[word]) for word in document_frequency}

    # Initialize the query vector with the inverse document frequency of each word
    query_vector = {word: inverse_document_frequency[word] for word in query.split()}

    for i in range(number_of_iterations):
        # Compute the similarity between the query and each document
        similarities = [cosine_similarity(word_frequency[i], query_vector) for i in range(len(documents))]

        # Compute the pseudo-relevance feedback documents
        top_documents = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]
        feedback_documents = [documents[i] for i in top_documents]

        # Compute the term frequency of each word in the feedback documents
        feedback_word_frequency = [Counter(doc) for doc in feedback_documents]

        # Compute the term frequency of each word in the feedback documents
        feedback_document_frequency = Counter(word for doc in feedback_documents for word in set(doc))

        # Compute the feedback weight of each word in the query
        feedback_weights = {word: 0 for word in query.split()}
        for doc in feedback_word_frequency:
            for word in doc:
                if word not in feedback_weights:
                  feedback_weights[word] = 0
                feedback_weights[word] += doc[word] / feedback_document_frequency[word]

        # Update the query vector with the feedback weights
        for word in query_vector:
            query_vector[word] = alpha * query_vector[word] + beta * feedback_weights[word] * inverse_document_frequency[word]

    # Compute the similarities between the query and the updated documents
    similarities = [cosine_similarity(word_frequency[i], query_vector) for i in range(len(documents))]

    # Return the indices of the documents sorted by decreasing order of similarity
    return sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)

In [109]:
query = "irish man"
ans = relevance_feedback(nostopw_casefolded_lemmatized_corpus,query)

In [110]:

for index in ans[:20]:
  print(f'document {index}: {title_mapping.get(index)} -- {nostopw_casefolded_lemmatized_corpus[index]}')


document 6024: Wings of the Hawk -- ['mexico', 'expatriate', 'american', 'known', 'irish', 'gallagher', 'join', 'mexican', 'revolutionary', 'mine', 'partner', 'marco', 'struck', 'gold', 'seized', 'colonel', 'paco', 'ruiz', 'corrupt', 'official', 'rule', 'province', 'marco', 'killed', 'ruiz', 'band', 'rebel', 'save', 'irish', 'certain', 'death', 'particularly', 'brave', 'one', 'woman', 'raquel', 'noriega', 'wounded', 'gunfire', 'rebel', 'arent', 'sure', 'irish', 'take', 'back', 'leader', 'arturo', 'torres', 'talk', 'raquel', 'faint', 'injury', 'irish', 'offer', 'remove', 'bullet', 'raquel', 'engaged', 'marry', 'arturo', 'sister', 'elena', 'kidnapped', 'go', 'search', 'sister', 'raquel', 'irish', 'taken', 'prisoner', 'ruiz', 'locked', 'cell', 'elena', 'captive', 'say', 'intends', 'marry', 'ruiz', 'mistakenly', 'trust', 'villainous', 'ruiz', 'coldly', 'executes', 'mother', 'one', 'arturos', 'loyal', 'rebel', 'tomas', 'irish', 'raquel', 'broken', 'jail', 'rebel', 'arturo', 'killed', 'irish

Approach 2 Using Explicit Feedback:

Enter Your Query: irish man
Document8 -- The Little Train Robbery -- ['open', 'scene', 'show', 'interior', 'robber', 'den', 'wall', 'decor', 'portrait', 'notori', 'crimin', 'pictur', 'illustr', 'exploit', 'famous', 'bandit', 'gang', 'loung', 'other', 'read', 'novel', 'illustr', 'paper', 'although', 'youth', 'appear', 'dress', 'like', 'typic', 'western', 'desperado', 'bandit', 'queen', 'lead', 'blindfold', 'new', 'recruit', 'enter', 'room', 'led', 'center', 'room', 'rais', 'right', 'hand', 'solemn', 'sworn', 'bandag', 'remov', 'eye', 'find', 'look', 'muzzl', 'dozen', 'gang', 'congratul', 'new', 'member', 'heartili', 'shake', 'hand', 'bandit', 'queen', 'evid', 'leader', 'gang', 'call', 'volunt', 'hold', 'train', 'respond', 'pick', 'seven', 'job', 'immedi', 'leav', 'cabin', 'next', 'scene', 'show', 'gang', 'break', 'barn', 'steal', 'poni', 'ride', 'away', 'upon', 'reach', 'place', 'agre', 'upon', 'picket', 'poni', 'leav', 'charg', 'trust', 'member', 'proceed', 'wild', 'mountain', 'spot', 

Semantic Matching

Using Vector model and Cosine Similarity

In [112]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return list(set(synonyms))

def get_relevant_documents(query, documents):
    # Tokenize query
    query_tokens = word_tokenize(query)
    
    # Lemmatize query tokens
    lemmatizer = WordNetLemmatizer()
    query_lemmas = [lemmatizer.lemmatize(token) for token in query_tokens]
    
    # Replace words in query with their synonyms
    for i in range(len(query_lemmas)):
        synonyms = get_synonyms(query_lemmas[i])
        if len(synonyms) > 0:
            query_lemmas[i] = '|'.join(synonyms)
    
    # Create a new list to store the modified documents
    modified_documents = []
    for document in documents:
        modified_document = []
        for j in range(len(document)):
            word = document[j]
            synonyms = get_synonyms(word)
            if len(synonyms) > 0:
                modified_document.append(synonyms[0])
            else:
                modified_document.append(word)
        modified_documents.append(modified_document)
    
    # Convert documents to vectors
    vectors = []
    for document in modified_documents:
        vector = []
        for lemma in query_lemmas:
            vector.append(document.count(lemma))
        vectors.append(vector)
    
    # Convert query to vector
    query_vector = []
    for lemma in query_lemmas:
        query_vector.append(query_lemmas.count(lemma))
    
    # Calculate cosine similarity between query and each document
    similarities = cosine_similarity(vectors, [query_vector])
    
    # Sort documents by similarity score
    sorted_documents = [documents[i] for i in np.argsort(similarities[:,0])[::-1]]
    
    return sorted_documents


query = "irish man"
relevant_documents = get_relevant_documents(query, nostopw_casefolded_lemmatized_corpus)
for ind in range(len(relevant_documents))[:10]:
  print(f'Document{ind}-- {relevant_documents[ind]}')

Document0-- ['fourteen', 'year', 'old', 'maria', 'antonia', 'beautiful', 'charming', 'naive', 'archduchess', 'austria', 'youngest', 'empress', 'maria', 'theresa', 'daughter', 'one', 'left', 'unmarried', 'among', 'sister', 'sent', 'mother', 'marry', 'dauphin', 'france', 'future', 'louis', 'xvi', 'france', 'seal', 'alliance', 'two', 'rival', 'country', 'marie', 'antoinette', 'travel', 'france', 'relinquishing', 'connection', 'home', 'country', 'including', 'pet', 'pug', 'mop', 'meet', 'king', 'louis', 'xv', 'france', 'future', 'husband', 'louis', 'auguste', 'two', 'arrive', 'palace', 'versailles', 'built', 'king', 'great', 'grandfather', 'married', 'encouraged', 'produce', 'heir', 'throne', 'soon', 'possible', 'next', 'day', 'reported', 'king', 'nothing', 'happened', 'wedding', 'night', 'time', 'pass', 'marie', 'antoinette', 'find', 'life', 'court', 'versailles', 'stifling', 'husband', 'courtier', 'disdain', 'foreigner', 'blame', 'producing', 'heir', 'although', 'fault', 'really', 'lie',

In [114]:
import nltk
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the documents
documents = [['bartend', 'work', 'saloon', 'serv'],
             ['moon', 'paint', 'smile', 'face']]

# Define the query
query = "smile face"

# Tokenize the query and find synonyms for each word
tokens = nltk.word_tokenize(query)
synonyms = []
for token in tokens:
    syns = []
    for syn in wordnet.synsets(token):
        for lemma in syn.lemmas():
            syns.append(lemma.name())
    synonyms.append(list(set(syns)))

# Flatten the list of synonyms
synonyms = [syn for syn_list in synonyms for syn in syn_list]

# Combine the original query and the synonyms
query = query.split() + synonyms
query = " ".join(query)

# Calculate the TF-IDF matrix for the documents and the query
vectorizer = TfidfVectorizer()
tfidf_documents = vectorizer.fit_transform([" ".join(doc) for doc in nostopw_casefolded_lemmatized_corpus])
tfidf_query = vectorizer.transform([query])

# Calculate the cosine similarity between the query and each document
cos_similarities = cosine_similarity(tfidf_documents, tfidf_query)

# Sort the documents by their cosine similarity to the query
sorted_documents = [nostopw_casefolded_lemmatized_corpus[i] for i in cos_similarities.argsort(axis=0)[::-1].squeeze()]

# Print the sorted documents
for ind in range(len(sorted_documents))[:10]:
  print(f'Document{ind}-- {sorted_documents[ind]}')

Document0-- ['moon', 'painted', 'smiling', 'face', 'hang', 'park', 'night', 'young', 'couple', 'walking', 'past', 'fence', 'learn', 'railing', 'look', 'moon', 'smile', 'embrace', 'moon', 'smile', 'get', 'bigger', 'sit', 'bench', 'tree', 'moon', 'view', 'blocked', 'causing', 'frown', 'last', 'scene', 'man', 'fan', 'woman', 'hat', 'moon', 'left', 'sky', 'perched', 'shoulder', 'see', 'everything', 'better']
Document1-- ['undercover', 'secret', 'service', 'agent', 'stumble', 'upon', 'smuggling', 'ring', 'illegally', 'transporting', 'mexican', 'united', 'state', 'air', 'pull', 'gun', 'pilot', 'one', 'trip', 'pilot', 'sends', 'aircraft', 'sudden', 'climb', 'causing', 'agent', 'tumble', 'back', 'cabin', 'pilot', 'pull', 'lever', 'open', 'cabin', 'floor', 'sending', 'agent', 'six', 'illegal', 'alien', 'plummeting', 'death', 'agent', 'bos', 'tom', 'saxby', 'john', 'litel', 'need', 'pilot', 'infiltrate', 'smuggling', 'ring', 'turn', 'commercial', 'airline', 'former', 'military', 'pilot', 'brass'

Finding Intent of Queries:

Extra Content don't refer while summarizing:

In [64]:
import math
from collections import Counter

def cosine_similarity(doc, query):
    # Normalize the term frequency vector for the document and the query
    magnitude_doc = math.sqrt(sum([value ** 2 for value in doc.values()]))
    magnitude_query = math.sqrt(sum([value ** 2 for value in query.values()]))
    normalized_doc = {word: value / magnitude_doc for word, value in doc.items()}
    normalized_query = {word: value / magnitude_query for word, value in query.items()}

    # Compute the cosine similarity between the normalized document and the normalized query
    common_words = set(normalized_doc).intersection(set(normalized_query))
    dot_product = sum([normalized_doc[word] * normalized_query[word] for word in common_words])
    return dot_product

def relevance_feedback(documents, query, number_of_iterations = 5, alpha = 0.5 , beta = 0.5):
    # Create a dictionary to keep track of the term frequency of each word in each document
    word_frequency = [Counter(doc) for doc in documents]

    # Create a dictionary to keep track of the document frequency of each word
    document_frequency = Counter(word for doc in documents for word in set(doc))

    # Create a dictionary to keep track of the inverse document frequency of each word
    inverse_document_frequency = {word: math.log(len(documents) / document_frequency[word]) for word in document_frequency}

    # Initialize the query vector with the inverse document frequency of each word
    query_vector = {word: inverse_document_frequency[word] for word in query.split()}

    for i in range(number_of_iterations):
        # Compute the similarity between the query and each document
        similarities = [cosine_similarity(word_frequency[i], query_vector) for i in range(len(documents))]

        # Compute the pseudo-relevance feedback documents
        top_documents = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]
        feedback_documents = [documents[i] for i in top_documents]

        # Compute the term frequency of each word in the feedback documents
        feedback_word_frequency = [Counter(doc) for doc in feedback_documents]

        # Compute the term frequency of each word in the feedback documents
        feedback_document_frequency = Counter(word for doc in feedback_documents for word in set(doc))

        # Compute the feedback weight of each word in the query
        feedback_weights = {word: 0 for word in query.split()}
        for doc in feedback_word_frequency:
            for word in doc:
                if word not in feedback_weights:
                  feedback_weights[word] = 0
                feedback_weights[word] += doc[word] / feedback_document_frequency[word]

        # Update the query vector with the feedback weights
        for word in query_vector:
            query_vector[word] = alpha * query_vector[word] + beta * feedback_weights[word] * inverse_document_frequency[word]

    # Compute the similarities between the query and the updated documents
    similarities = [cosine_similarity(word_frequency[i], query_vector) for i in range(len(documents))]

    # Return the indices of the documents sorted by decreasing order of similarity
    return sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)

In [65]:
query = "irish man"
ans = relevance_feedback(nostopw_casefolded_lemmatized_corpus,query)

for index in ans[:20]:
  print(f'document {index}: {title_mapping.get(index)} -- {nostopw_casefolded_lemmatized_corpus[index]}')

document 6024: Wings of the Hawk -- ['mexico', 'expatriate', 'american', 'known', 'irish', 'gallagher', 'join', 'mexican', 'revolutionary', 'mine', 'partner', 'marco', 'struck', 'gold', 'seized', 'colonel', 'paco', 'ruiz', 'corrupt', 'official', 'rule', 'province', 'marco', 'killed', 'ruiz', 'band', 'rebel', 'save', 'irish', 'certain', 'death', 'particularly', 'brave', 'one', 'woman', 'raquel', 'noriega', 'wounded', 'gunfire', 'rebel', 'arent', 'sure', 'irish', 'take', 'back', 'leader', 'arturo', 'torres', 'talk', 'raquel', 'faint', 'injury', 'irish', 'offer', 'remove', 'bullet', 'raquel', 'engaged', 'marry', 'arturo', 'sister', 'elena', 'kidnapped', 'go', 'search', 'sister', 'raquel', 'irish', 'taken', 'prisoner', 'ruiz', 'locked', 'cell', 'elena', 'captive', 'say', 'intends', 'marry', 'ruiz', 'mistakenly', 'trust', 'villainous', 'ruiz', 'coldly', 'executes', 'mother', 'one', 'arturos', 'loyal', 'rebel', 'tomas', 'irish', 'raquel', 'broken', 'jail', 'rebel', 'arturo', 'killed', 'irish

Enter Your Query: irish man
Document8 -- The Little Train Robbery -- ['open', 'scene', 'show', 'interior', 'robber', 'den', 'wall', 'decor', 'portrait', 'notori', 'crimin', 'pictur', 'illustr', 'exploit', 'famous', 'bandit', 'gang', 'loung', 'other', 'read', 'novel', 'illustr', 'paper', 'although', 'youth', 'appear', 'dress', 'like', 'typic', 'western', 'desperado', 'bandit', 'queen', 'lead', 'blindfold', 'new', 'recruit', 'enter', 'room', 'led', 'center', 'room', 'rais', 'right', 'hand', 'solemn', 'sworn', 'bandag', 'remov', 'eye', 'find', 'look', 'muzzl', 'dozen', 'gang', 'congratul', 'new', 'member', 'heartili', 'shake', 'hand', 'bandit', 'queen', 'evid', 'leader', 'gang', 'call', 'volunt', 'hold', 'train', 'respond', 'pick', 'seven', 'job', 'immedi', 'leav', 'cabin', 'next', 'scene', 'show', 'gang', 'break', 'barn', 'steal', 'poni', 'ride', 'away', 'upon', 'reach', 'place', 'agre', 'upon', 'picket', 'poni', 'leav', 'charg', 'trust', 'member', 'proceed', 'wild', 'mountain', 'spot', 