In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Importing Libraries**

In [41]:
import numpy as np
import math
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Uploading files and load into

In [42]:
# Read documents from uploaded files
file_paths = ["/content/drive/MyDrive/FILES/Class_wk3"]

# Load documents into a list
docs = []
for file_path in file_paths:
    if os.path.isdir(file_path):
        for filename in os.listdir(file_path):
            filepath = os.path.join(file_path, filename)
            # Check if it's a file and then read
            if os.path.isfile(filepath):
                with open(filepath, 'r', encoding='utf-8') as file:
                    docs.append(file.read())
    # If it's not a directory, try reading it as a file
    elif os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            docs.append(file.read())

# **Define Queries**

In [43]:
# Define the queries manually (for testing purpose)
queries = [
    "Phone",
    "software version",
]

# **Text** **preprocessing**

In [44]:
# Function to lowercase and tokenize text
def tokenize(text):
    return text.lower().split()

 # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Remove digit
    text = re.sub(r"\d+", "", text)

In [45]:
# Tokenize documents and queries
tokenized_docs = [tokenize(doc) for doc in docs]
tokenized_queries = [tokenize(query) for query in queries]

# Create a vocabulary from the tokenized documents
vocab = list(set(term for doc in tokenized_docs for term in doc))


print(tokenized_docs)
print(tokenized_queries)
print(vocab)

[['1.', 'customer', 'service', 'complaint', '-', 'airline', 'delay', '@airlinesupport', 'i’ve', 'been', 'waiting', 'over', 'three', 'hours', 'at', 'the', 'gate', 'with', 'no', 'updates', 'on', 'my', 'flight', 'status.', 'the', 'initial', 'delay', 'was', 'supposed', 'to', 'be', 'just', '45', 'minutes,', 'but', 'now', 'we’re', 'way', 'past', 'that,', 'and', 'there', 'has', 'been', 'no', 'communication', 'from', 'your', 'staff.', 'i', 'understand', 'that', 'delays', 'happen,', 'but', 'the', 'lack', 'of', 'information', 'is', 'unacceptable.', 'when', 'can', 'we', 'expect', 'some', 'updates?', '@customersupport', 'your', 'experience', 'matters', 'to', 'us.', 'we’re', 'sorry', 'for', 'the', 'delay', 'and', 'the', 'inconvenience', 'this', 'has', 'caused.', 'please', 'send', 'us', 'a', 'dm', 'with', 'your', 'booking', 'reference', 'and', 'additional', 'details', 'about', 'your', 'flight,', 'and', 'we’ll', 'look', 'into', 'it', 'right', 'away.', 'we’ll', 'make', 'sure', 'to', 'keep', 'you', 'up

# **Term Frequency**

In [46]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
    return document.count(term) / len(document)

# **Inverse Document Frequency**

In [47]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

# **Compute**

In [48]:
# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

# **Cosine similarity between two vector**

In [49]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# **Calculate TF-IDF vectors**

In [50]:
# Calculate TF-IDF vectors for documents and queries
doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

In [51]:
# Open the file to write the results
with open("Abin.txt", "w") as result_file:
    # Calculate cosine similarities and rank top 3 documents for each query
    for i, query_vector in enumerate(query_tfidf_vectors):
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]

        # Rank documents by similarity score
        ranked_docs = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)[:3]  # Top 3

        # Prepare the result string
        result_str = f"\nTop 3 results for query '{queries[i]}':\n"
        for rank, (doc_index, score) in enumerate(ranked_docs, 1):
            result_str += f"Rank {rank}: Document {doc_index + 1} with score {score:.4f}\n"

        # Print to console and write to file
        print(result_str)
        result_file.write(result_str)


Top 3 results for query 'Phone':
Rank 1: Document 2 with score 0.2993
Rank 2: Document 1 with score 0.0766
Rank 3: Document 3 with score 0.0000


Top 3 results for query 'software version':
Rank 1: Document 2 with score 0.2263
Rank 2: Document 3 with score 0.0697
Rank 3: Document 1 with score 0.0000

