In [17]:
!pip install rank_bm25


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import math
import numpy as np
from multiprocessing import Pool, cpu_count

In [19]:
class BM25Okapi:
  def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
    self.k1 = k1
    self.b = b
    self.epsilon = epsilon
    self.corpus_size = 0
    self.avgdl = 0
    self.doc_freqs = []
    self.idf = {}
    self.doc_len = []
    
    nd = self._initialize(corpus)
    self._calc_k(nd)

  def _initialize(self, corpus):
    nd = {}  # word -> number of documents with word
    num_doc = 0

    for document in corpus:
       self.doc_len.append(len(document))
       num_doc += len(document)
       
       frequencies = {}
       for word in document:
         if word not in frequencies:
           frequencies[word] = 0
         frequencies[word] += 1
       self.doc_freqs.append(frequencies)

       for word, freq in frequencies.items():
          try:
            nd[word]+=1
          except KeyError:
            nd[word] = 1
       self.corpus_size += 1
    
    self.avgdl = num_doc / self.corpus_size
    return nd

  def _calc_k(self, nd):
    idf_sum = 0
    negative_idfs = []

    for word, freq in nd.items():
      idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
      self.idf[word] = idf
      idf_sum += idf
      if idf < 0:
        negative_idfs.append(word)
    self.average_idf = idf_sum / len(self.idf)
    eps = self.epsilon * self.average_idf
    for word in negative_idfs:
      self.idf[word] = eps

  def get_top_n(self, query, documents, n=5):
    assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
    scores = self.get_scores(query)
    top_n = np.argsort(scores)[::-1][:n]
    return [documents[i] for i in top_n]

  def get_scores(self, query):
    score = np.zeros(self.corpus_size)
    doc_len = np.array(self.doc_len)

    for q in query:
       q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
       score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /(q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
    
    return score

In [20]:
import pandas as pd
import csv
import spacy
# from rank_bm25 import BM25Okapi
from tqdm import tqdm
import time
import import_ipynb



nlp = spacy.load("en_core_web_sm")


In [21]:
df=pd.read_csv('titles.csv')

In [22]:
df['description']= df['description'].astype(str)

df['search_text']=df['release_year'].astype(str)+' '+df['type'].astype(str)+' '+ df['title'].astype(str)+' '+df['description']

In [23]:

text_list = df.search_text.str.lower().values
tok_text=[] # for our tokenised corpus
#Tokenising using SpaCy:
for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser","ner"])):
   tok = [t.text for t in doc if t.is_alpha]
   tok_text.append(tok)

5850it [00:18, 310.13it/s]


In [24]:
bm25 = BM25Okapi(tok_text)

In [25]:
flag=True
while (flag==True):
  query = input("Enter text : ")
  tokenized_query = query.lower().split(" ")

  t0 = time.time()
  results = bm25.get_top_n(tokenized_query, df.search_text.values, n=5)
  t1 = time.time()
  print(f'Searched 5851 records in {round(t1-t0,3) } seconds \n')

  for i in results:
    print(i+"\n")
  
  print("Want to search more , Press 1 else press any number to end your search")
  n=int(input())
  if(n!=1):
    flag=False
    

print("Search Complete !!")

Enter text : horror comedy
Searched 5851 records in 0.011 seconds 

2019 MOVIE Uncle Naji in UAE Naji decides with his friends to go on holiday to a mountainous region, they face many funny and strange comedy situations ,but unexpected moment happened turned their funny journey to horror, fear and mystery.

2020 MOVIE Ghost Stories An anthology of four short horror tales.

1995 SHOW Goosebumps Anything can turn spooky in this horror anthology series based on the best-selling books by master of kid horror, R.L. Stine. In every episode, see what happens when regular kids find themselves in scary situations, and how they work to confront and overcome their fears.

2020 SHOW A Perfect Day for Arsenide A Perfect Day for Arsenide adapts ten stories from the same-titled novel by Hong Kong writer Pizza, the author of Lost On A Red Mini Bus To Taipo. Spanning suspense, horror, comedy, fantasy and more, the inventive series rolls out whimsical and bizarre stories about the absurdity of life in t