In [1]:
# Importing required libraries
import re
import math

In [2]:
# Function to read documents
def read_documents(folder_path, num_docs=399):
    """Read the numbered corpus files from ``folder_path``.

    Files are expected to be named ``document_1.txt`` ... ``document_<num_docs>.txt``
    (numbering starts at 1).

    Args:
        folder_path: Directory that contains the ``document_<i>.txt`` files.
        num_docs: How many documents to read. Defaults to 399, the count that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A list with the full text of each document, in numeric order; the
        document number of ``docs[k]`` is ``k + 1``.

    Raises:
        OSError: If any of the expected files cannot be opened.
    """
    docs = []
    for doc_number in range(1, num_docs + 1):
        with open(f'{folder_path}/document_{doc_number}.txt', 'r') as f:
            docs.append(f.read())
    return docs

# Function to read Query
def read_query(folder_path):
    """Return the entire contents of ``Query.txt`` found in ``folder_path``."""
    query_file_path = f'{folder_path}/Query.txt'
    with open(query_file_path, 'r') as query_file:
        return query_file.read()

In [3]:
# Class to preprocess
class Preprocess:
    """Collection of stateless text-cleaning helpers, all exposed as static methods."""

    @staticmethod
    def convert_to_lowercase(text):
        """Return ``text`` with every character lowercased."""
        lowered = text.lower()
        return lowered

    @staticmethod
    def remove_special_characters(text):
        """Replace each run of characters that are not letters, digits or whitespace with one space."""
        special_run = r'[^a-zA-Z0-9\s]+'
        return re.sub(special_run, ' ', text)

    @staticmethod
    def tokenize(text):
        """Split ``text`` on whitespace and return the resulting list of tokens."""
        tokens = text.split()
        return tokens

In [4]:
# Function to get word count
def get_word_count(tokens):
    """Count how often each token occurs, skipping tokens of length exactly one.

    Returns a dictionary mapping token -> number of occurrences, with keys in
    order of first appearance.
    """
    frequencies = {}
    for tok in tokens:
        if len(tok) != 1:  # single-character tokens are ignored
            frequencies[tok] = frequencies.get(tok, 0) + 1
    return frequencies

In [5]:
# Class to get similarity score between query and given docs
class Similarity:
    """Rank documents against a query using the KL divergence of word distributions.

    Both the query and every document are word -> frequency dictionaries. For a
    (query, document) pair, add-one smoothed word probabilities are computed
    over the union of their vocabularies, normalised to sum to 1, and compared
    with ``sum(p_query * log(p_query / p_doc))``. A LOWER score therefore means
    the document is MORE similar to the query (0 for identical distributions).
    """

    def __init__(self, query, docs):
        """Store the inputs.

        Args:
            query: Dictionary with words as keys and their frequencies as values.
            docs: List of dictionaries with words as keys and their frequencies
                as values, one per document.
        """
        self.query = query
        self.docs = docs
        self.no_of_docs = len(docs)

    def prob(self, word, doc):
        """Return the smoothed probability of ``word`` in ``doc``.

        Computed as ``(count + 1) / (total word count of doc + 2)``, so words
        missing from ``doc`` still get a non-zero probability.
        """
        prob_value = (doc.get(word, 0) + 1) / (sum(doc.values()) + 2)
        return prob_value

    @staticmethod
    def _normalized_distribution(doc, vocab):
        """Return smoothed probabilities of ``doc`` over ``vocab``, scaled to sum to 1."""
        # The denominator is the same for every word, so compute it once instead
        # of re-summing the whole document for each vocabulary entry.
        denominator = sum(doc.values()) + 2
        smoothed = {word: (doc.get(word, 0) + 1) / denominator for word in vocab}
        total = sum(smoothed.values())
        return {word: value / total for word, value in smoothed.items()}

    def calculate_similarity_score(self, doc1, doc2):
        """Return the KL divergence of ``doc1`` (the query) from ``doc2``.

        Args:
            doc1: Word -> frequency dictionary of the query.
            doc2: Word -> frequency dictionary of the document.

        Returns:
            ``sum(p1[w] * log(p1[w] / p2[w]))`` over the combined vocabulary;
            0 when both dictionaries are empty. Smaller values mean the two
            distributions are closer.
        """
        # Vocabulary of doc1 (query) and doc2 combined
        vocab = set(doc1.keys()).union(doc2.keys())
        p1 = self._normalized_distribution(doc1, vocab)
        p2 = self._normalized_distribution(doc2, vocab)

        sim = 0
        for word in vocab:
            sim += p1[word] * math.log(p1[word] / p2[word])
        return sim

    def get_scores(self):
        """Score every document against the query.

        Returns:
            A list of ``(document_number, score)`` tuples, where document
            numbers start at 1, sorted by ascending score, i.e. the most
            similar document (lowest divergence) comes first.
        """
        scores = [
            (doc_number, self.calculate_similarity_score(self.query, doc))
            for doc_number, doc in enumerate(self.docs, start=1)
        ]
        scores.sort(key=lambda x: x[1])
        return scores

In [6]:
# Folder that holds the document_<i>.txt files and Query.txt
path = '/content/drive/MyDrive/Assigment Req/doc'
myDocs = read_documents(path)
myQuery = read_query(path)

# Normalise every document: lowercase first, then replace special characters with spaces
myDocs = [
    Preprocess.remove_special_characters(Preprocess.convert_to_lowercase(doc))
    for doc in myDocs
]

# Apply the same normalisation to the query
myQuery = Preprocess.remove_special_characters(Preprocess.convert_to_lowercase(myQuery))

# Split the cleaned text into whitespace-delimited tokens
myDocs = list(map(Preprocess.tokenize, myDocs))
myQuery = Preprocess.tokenize(myQuery)

# Build one word -> count dictionary per document, plus one for the query
myDocsHashMap = list(map(get_word_count, myDocs))
myQueryHashMap = get_word_count(myQuery)

# Getting similarity scores of every document against the query
sim = Similarity(myQueryHashMap, myDocsHashMap)
scores = sim.get_scores()

print(scores)


[(3, 0.07332280543985659), (54, 0.14306115768352953), (43, 0.15367402427443247), (315, 0.1560064841562762), (318, 0.16007783330471398), (321, 0.16523729533075426), (317, 0.16541894849750133), (316, 0.1654189484975014), (313, 0.16905749202405806), (51, 0.17251654687228646), (161, 0.17458277075048936), (323, 0.17860574829168274), (44, 0.1810484644568216), (41, 0.1834086023077123), (320, 0.1857424923614756), (60, 0.18966535495815728), (172, 0.19209521860996703), (56, 0.2034067716994095), (322, 0.20653291531180756), (196, 0.2073886057585195), (170, 0.20744226754818934), (344, 0.209627847591491), (42, 0.21107136368060792), (303, 0.21109484748848484), (45, 0.21172761949647506), (55, 0.21308509917397472), (173, 0.2140406479486245), (83, 0.21628039002858795), (171, 0.21719784522075813), (162, 0.21743893977839382), (11, 0.2179510840131151), (194, 0.21893200039858637), (53, 0.22315393642344958), (302, 0.22446831919218452), (50, 0.22756694893972754), (35, 0.22882976164926455), (184, 0.22932813375

In [7]:
# First entry of the score list is reported as the best match; its first field is the document number
best_doc_number = scores[0][0]
print(f'Most similar document that matches with the given Query doc is: document_{best_doc_number}')

Most similar document that matches with the given Query doc is: document_3
