In [1]:
import os
import logging
import math
import numpy as np
import nltk
nltk . download ('stopwords')
nltk . download ('punkt')
nltk . download ('wordnet')
import string
import logging
import re
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer

STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
def load_text_files(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
                doc_id += 1
    return data, doc_id_to_filename

In [3]:
def tokenize(text):
    return text.lower().split()

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = toke(text)
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]
    return cleaned_tokens

In [5]:
def term_frequency(term, document):
    return document.count(term) / len(document)

In [6]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [7]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [8]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 * norm_vec2 != 0 else 0

In [9]:
import os
import logging

def process_queries(queries, tokenized_docs, doc_tfidf_vectors, vocab):
    results = []
    for query in queries:
        tokenized_query = tokenize(query)
        query_tfidf_vector = compute_tfidf(tokenized_query, tokenized_docs, vocab)

        similarities = []
        for doc_id, doc_vector in enumerate(doc_tfidf_vectors):
            similarity = cosine_similarity(query_tfidf_vector, doc_vector)
            similarities.append((doc_id, similarity))


        similarities.sort(key=lambda x: x[1], reverse=True)

        results.append((query, similarities))
        print(results)
    return results

In [10]:
def write_queries_to_file(results, doc_id_to_filename, output_folder):

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    output_file = os.path.join(output_folder, "Krishma.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        for query, similarities in results:
            f.write(f"Query: {query}\n")
            for doc_id, similarity in similarities:
                filename = doc_id_to_filename[doc_id]
                f.write(f"  Document: {filename}, Similarity: {similarity:.4f}\n")
            f.write("\n")

    logging.info(f"Results written to {output_file}")

In [11]:
def convert_doc_ids_to_filenames(doc_ids, doc_id_to_filename):
    return [doc_id_to_filename[doc_id] for doc_id in doc_ids]


In [16]:
def main():
    folder_path = "/content/drive/MyDrive/week3"

    docs, doc_id_to_filename = load_text_files(folder_path)

    queries = ["Smartphones",
"evolution", "over years", "software advancements","future mobile technology",
"Smartwatches",
"fitness health",
 "monitoring productivity",
 "wearable technology",
"communication",
"trends future",
"Virtual reality"]


    tokenized_docs = [tokenize(doc) for doc in docs.values()]
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))
    print(logging.info(f"Vocabulary size: {len(vocab)}"))

    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]


    results = process_queries(queries, tokenized_docs, doc_tfidf_vectors, vocab)


    output_folder = "/content/drive/MyDrive/Query/queries"


    write_queries_to_file(results, doc_id_to_filename, output_folder)

if __name__ == "__main__":
    main()

None
[('Smartphones', [(0, 0.4219783102889094), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)])]
[('Smartphones', [(0, 0.4219783102889094), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), ('evolution', [(0, 0.15783473400191225), (2, 0.14354939054380533), (1, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)])]
[('Smartphones', [(0, 0.4219783102889094), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), ('evolution', [(0, 0.15783473400191225), (2, 0.14354939054380533), (1, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), ('over years', [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0)])]
[('Smartphones', [(0, 0.4219783102889094), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), ('evolution', [(0, 0.15783473400191225), (2, 0.143549390543