In [15]:
import math
import os
import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [16]:
# Fetch the NLTK data packages this notebook relies on. Judging by their
# names they back the tokenizer (punkt / punkt_tab), the English stop-word
# list and the WordNet lemmatizer used in preprocess() -- TODO confirm the
# exact mapping against the NLTK documentation.
# Each call logs its progress; the last call's return value is what the cell
# displays as its output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryansethia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryansethia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aryansethia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aryansethia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
def preprocess(text):
    """Convert raw text into a list of normalised index terms.

    Steps, in order: lower-case the text, replace every run of non-word
    characters with a single space, tokenise, drop English stop words,
    lemmatise each remaining token and finally Porter-stem it.

    Note that only punctuation/symbols are stripped by the substitution:
    digits and underscores are word characters and therefore survive.

    Args:
        text: The raw document or query string.

    Returns:
        A list of processed tokens, in their original order (duplicates kept).
    """
    # Case folding + punctuation removal in one expression.
    cleaned = re.sub(r'\W+', ' ', text.lower())

    candidates = word_tokenize(cleaned)

    # Stop-word filter (set lookup keeps the membership test cheap).
    english_stop_words = set(stopwords.words('english'))
    kept = [token for token in candidates if token not in english_stop_words]

    # Lemmatise first, then stem the lemma, one token at a time.
    lemmatizer = WordNetLemmatizer()
    stemmer = nltk.stem.PorterStemmer()
    terms = []
    for token in kept:
        terms.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return terms

In [18]:
# Step 1: Read Documents from the folder
def readDocuments(folderName):
    """Load and preprocess every regular file directly inside ``folderName``.

    Files are visited in sorted name order so the result does not depend on
    the arbitrary order returned by the file system. Sub-directories are
    skipped, and a file that cannot be read (e.g. it is not valid UTF-8) is
    reported and skipped instead of aborting the whole load.

    Args:
        folderName: Path of the directory that holds the corpus files.

    Returns:
        A ``defaultdict(list)`` mapping file name -> list of preprocessed
        tokens. It is empty when the folder cannot be listed.
    """
    documents = defaultdict(list)

    try:
        filenames = sorted(os.listdir(folderName))
    except OSError as e:
        print(f"Error reading documents: {e}")
        return documents

    for filename in filenames:
        path = os.path.join(folderName, filename)
        if not os.path.isfile(path):
            # Directories (and other non-file entries) are not documents.
            continue
        try:
            with open(path, "r", encoding="utf-8") as file:
                # The key is only created once reading and preprocessing succeed.
                documents[filename] = preprocess(file.read())
        except Exception as e:
            # Best effort: report the offending file and keep loading the rest.
            print(f"Error reading document {path}: {e}")
    return documents

In [19]:
# Step 2: Find all unique words
def findUniqueWords(documents):
    """Build the vocabulary of the corpus.

    Args:
        documents: Mapping of document name -> iterable of tokens.

    Returns:
        A set containing every distinct token found in any document
        (an empty set when there are no documents).
    """
    # Union of all token collections in a single call.
    return set().union(*documents.values())

In [20]:
# Step 3: Calculate tf Weight for documents
def calculatetfWeight(tf):
    """Return the log-scaled term-frequency weight.

    The weight is ``1 + log10(tf)`` for a positive ``tf`` and ``0`` otherwise.
    """
    if not tf > 0:
        return 0
    return 1 + math.log10(tf)

# Step 4: Calculate idf Weight for query
def calculateidfWeight(df, N):
    """Return the inverse-document-frequency weight.

    Args:
        df: Number of documents that contain the term.
        N: Total number of documents.

    Returns:
        ``log10(N / df)`` when ``df`` is positive, otherwise ``0``.
    """
    if not df > 0:
        return 0
    return math.log10(N / df)

In [21]:
# Step 5: Create Posting List (stores the tf weight 1 + log10(tf) per document)
def createPostingList(documents, unique_words):
    """Build the inverted index and document frequencies for the corpus.

    Each document's tokens are counted once, so the cost is proportional to
    the total number of tokens rather than (vocabulary x documents x length).

    Side effect: the index is also written to ``posting_list.txt`` in the
    current working directory, one term per line in the form
    ``"term", df: [("doc", "weight"), ...]``. A failure to write the file is
    reported and does not affect the returned values.

    Args:
        documents: Mapping of document name -> list of preprocessed tokens.
        unique_words: Iterable of terms to index; tokens that are not in it
            are ignored.

    Returns:
        A tuple ``(posting_list, document_frequencies)`` where
        ``posting_list`` maps every term in ``unique_words`` to a list of
        ``(doc_name, tf_weight)`` pairs in document order, and
        ``document_frequencies`` is a ``defaultdict(int)`` mapping each term
        that occurs somewhere to the number of documents containing it.
    """
    # Every requested term gets an entry, even if no document contains it.
    posting_list = {word: [] for word in unique_words}

    for doc_name, text_tokens in documents.items():
        for word, term_freq in Counter(text_tokens).items():
            postings = posting_list.get(word)
            if postings is not None:
                postings.append((doc_name, calculatetfWeight(term_freq)))

    # A document contributes at most one posting per term, so df == len(postings).
    document_frequencies = defaultdict(int)
    for word, postings in posting_list.items():
        if postings:
            document_frequencies[word] = len(postings)

    # Save the posting list to a text file in the desired format
    try:
        with open("posting_list.txt", "w", encoding="utf-8") as f:
            for word, postings in posting_list.items():
                df = len(postings)
                entries = ", ".join(
                    f'("{doc}", "{logtf:.6f}")' for doc, logtf in postings
                )
                f.write(f'"{word}", {df}: [{entries}]\n')
    except (OSError, UnicodeError) as e:
        print(f"Error writing to file: {e}")

    return posting_list, document_frequencies

In [22]:
# Step 6: Normalize Vectors using Cosine Normalization
def normalize_vector(vector):
    """Scale a sparse term-weight vector to unit Euclidean length.

    Args:
        vector: Mapping of term -> numeric weight.

    Returns:
        A new dict with every weight divided by the vector's L2 norm. When
        the norm is zero (empty or all-zero vector) the input object itself
        is returned unchanged.
    """
    length = math.sqrt(sum(value ** 2 for value in vector.values()))
    if length != 0:
        return {key: value / length for key, value in vector.items()}
    # Nothing to scale: hand the zero vector back as-is.
    return vector

In [23]:
# Step 7: Compute Cosine Similarity
def compute_cosine_similarity(query_vector, doc_vector):
    """Return the dot product of two sparse term-weight vectors.

    The result is the cosine similarity only if both vectors have already
    been normalised to unit length; no normalisation is done here.

    Args:
        query_vector: Mapping of term -> weight for the query.
        doc_vector: Mapping of term -> weight for the document; terms it
            lacks count as 0.
    """
    return sum(
        query_weight * doc_vector.get(term, 0)
        for term, query_weight in query_vector.items()
    )

In [24]:
# Step 8: Rank documents based on cosine similarity
def rank_documents(documents, query, posting_list, document_frequencies, unique_words, top_k=10):
    """Rank documents against a free-text query using lnc.ltc weighting.

    Query terms are weighted with log tf x idf and cosine-normalised (ltc);
    document terms use the stored log tf weight with idf fixed to 1 and are
    cosine-normalised (lnc). The score is the dot product of the two
    normalised vectors.

    All document vectors are assembled in a single pass over the posting
    lists instead of rescanning every posting list for every document.

    Args:
        documents: Mapping of document name -> token list; only its keys and
            size are used here.
        query: Raw query string; it is run through ``preprocess``.
        posting_list: Mapping of term -> list of ``(doc_name, tf_weight)``.
        document_frequencies: Mapping of term -> document frequency.
        unique_words: Set of vocabulary terms to consider.
        top_k: Maximum number of results to return (default 10). ``None``
            returns the full ranking.

    Returns:
        A list of ``(doc_name, score)`` pairs sorted by descending score,
        ties broken by document name, truncated to ``top_k`` entries.
    """
    N = len(documents)
    query_tokens = preprocess(query)
    query_terms = set(query_tokens)
    vocabulary = unique_words.union(query_terms)

    # Query vector (ltc). Terms that are absent from the query would get a
    # weight of 0 and cannot influence the norm or the dot product, so only
    # query terms known to the corpus (df > 0) are stored.
    query_vector = {}
    for word in vocabulary:
        if word not in query_terms:
            continue
        df = document_frequencies.get(word, 0)
        if df > 0:
            tf = query_tokens.count(word)
            query_vector[word] = calculatetfWeight(tf) * calculateidfWeight(df, N)
    query_vector = normalize_vector(query_vector)

    # Document vectors (lnc, idf = 1): invert the posting lists once.
    # Postings that refer to documents outside `documents` are ignored.
    doc_vectors = {doc_name: {} for doc_name in documents}
    for word in vocabulary:
        for doc_name, log_tf in posting_list.get(word, []):
            doc_vector = doc_vectors.get(doc_name)
            if doc_vector is not None:
                doc_vector[word] = log_tf

    similarities = {
        doc_name: compute_cosine_similarity(query_vector, normalize_vector(doc_vector))
        for doc_name, doc_vector in doc_vectors.items()
    }

    # Highest score first; equal scores fall back to alphabetical order.
    ranked_docs = sorted(similarities.items(), key=lambda item: (-item[1], item[0]))
    return ranked_docs[:top_k]

In [25]:
# Corpus location and the two sample queries.
folderName = "Corpus"
query = "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation"
query2 = "Warwickshire, came from an ancient family and was the heiress to some land"

# Index the corpus once: tokens -> vocabulary -> posting list + document frequencies.
documents = readDocuments(folderName)
unique_words = findUniqueWords(documents)
posting_list, document_frequencies = createPostingList(documents, unique_words)

# Score every document against each query.
ranked_docs = rank_documents(documents, query, posting_list, document_frequencies, unique_words)
ranked_docs2 = rank_documents(documents, query2, posting_list, document_frequencies, unique_words)

# Print both rankings, separated by the same blank lines as before.
reports = [
    ("Ranked Documents by Relevance for query 1:", ranked_docs),
    ("Ranked Documents by Relevance for query 2:", ranked_docs2),
]
for position, (heading, ranking) in enumerate(reports):
    if position:
        print("\n")
    print(heading)
    for doc_name, score in ranking:
        print(f"{doc_name}: {score:.4f}")

Ranked Documents by Relevance for query 1:
zomato.txt: 0.2146
swiggy.txt: 0.1310
instagram.txt: 0.0605
messenger.txt: 0.0592
youtube.txt: 0.0585
Discord.txt: 0.0533
bing.txt: 0.0518
paypal.txt: 0.0471
reddit.txt: 0.0441
flipkart.txt: 0.0407


Ranked Documents by Relevance for query 2:
shakespeare.txt: 0.1201
levis.txt: 0.0241
Adobe.txt: 0.0227
google.txt: 0.0207
nike.txt: 0.0192
zomato.txt: 0.0177
huawei.txt: 0.0137
skype.txt: 0.0117
blackberry.txt: 0.0109
Dell.txt: 0.0108
