<a href="https://colab.research.google.com/github/Avinavshrestha/Avinav_Tech400/blob/main/Avinav.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import math
import numpy as np

# Define function to load documents
def load_documents(directory):
    """Read every regular file in ``directory`` into a dict keyed by document id.

    Entries are processed in sorted filename order so that the ids
    (1, 2, 3, ...) are reproducible between runs; ``os.listdir`` itself
    returns names in arbitrary order. Entries that are not regular files
    (for example subdirectories) are skipped, because ``open`` would raise
    on them.

    Args:
        directory: Path of the folder holding the document files.

    Returns:
        dict mapping a 1-based integer id to the file's text, decoded as UTF-8.
    """
    candidate_paths = (
        os.path.join(directory, name) for name in sorted(os.listdir(directory))
    )
    file_paths = [path for path in candidate_paths if os.path.isfile(path)]

    documents = {}
    # Number the documents after filtering so the ids stay contiguous.
    for doc_id, path in enumerate(file_paths, start=1):
        with open(path, 'r', encoding='utf-8') as file:
            documents[doc_id] = file.read()
    return documents

def tokenize(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens.

    Punctuation is not stripped, so it stays attached to the adjacent word.
    """
    lowered = text.lower()
    return lowered.split()
docs = load_documents('/content/directory')

# Split every loaded document into lowercase, whitespace-separated tokens.
tokenized_docs = list(map(tokenize, docs.values()))

# Example query, tokenized the same way as the documents.
queries = ['German made']
tokenized_queries = list(map(tokenize, queries))

print(tokenized_docs)
print(tokenized_queries)

print("Queries:", tokenized_queries)

# Sorted vocabulary of every distinct token found across all documents.
vocab = sorted({word for doc in tokenized_docs for word in doc})


def term_frequency(term, document):
    """Return the share of tokens in ``document`` that equal ``term``.

    Args:
        term: Token to count.
        document: Sequence of tokens (as produced by ``tokenize``).

    Returns:
        ``count(term) / len(document)`` as a float, or ``0.0`` when the
        document has no tokens.
    """
    total_tokens = len(document)
    # An empty file or a blank query tokenizes to []; avoid dividing by zero.
    if total_tokens == 0:
        return 0.0
    return document.count(term) / total_tokens

def inverse_document_frequency(term, docs):
    """Return the smoothed inverse document frequency of ``term``.

    Computed as ``log(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` the number of documents containing ``term``. Because of the
    ``+ 1`` in the denominator, a term that occurs in every document gets a
    negative value.

    Args:
        term: Token to score.
        docs: Collection of tokenized documents.

    Returns:
        The IDF as a float, or ``0.0`` when ``docs`` is empty.
    """
    total_docs = len(docs)
    # log(0) raises ValueError, so an empty corpus is handled explicitly.
    if total_docs == 0:
        return 0.0
    num_docs_containing_term = sum(1 for doc in docs if term in doc)
    return math.log(total_docs / (1 + num_docs_containing_term))

def compute_tfidf_vector(document, vocab, docs):
    """Build the TF-IDF vector of ``document`` over ``vocab``.

    Args:
        document: Tokens of the text being vectorized.
        vocab: Ordered terms; one vector component is produced per term.
        docs: Tokenized documents used for the document-frequency statistics.

    Returns:
        List of ``tf * idf`` weights in the same order as ``vocab``.
    """
    return [
        term_frequency(term, document) * inverse_document_frequency(term, docs)
        for term in vocab
    ]

# One TF-IDF vector per document, weighted against the whole document set.
tfidf_docs = [
    compute_tfidf_vector(doc_tokens, vocab, tokenized_docs)
    for doc_tokens in tokenized_docs
]

# One TF-IDF vector per query, using the same vocabulary and document statistics.
tfidf_queries = [
    compute_tfidf_vector(query_tokens, vocab, tokenized_docs)
    for query_tokens in tokenized_queries
]

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Yields ``0`` when the product of the two norms is zero, i.e. when at
    least one vector has zero length, instead of dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator if denominator != 0 else 0

# similarities[q][d] is the cosine similarity of query q to document d.
similarities = [
    [cosine_similarity(query_vector, doc_vector) for doc_vector in tfidf_docs]
    for query_vector in tfidf_queries
]

def rank_documents(similarities):
    """Order documents from most to least similar.

    Args:
        similarities: Scores, one per document, indexed by document position.

    Returns:
        List of ``(index, score)`` tuples sorted by descending score; equal
        scores keep their original relative order.
    """
    indexed_scores = list(enumerate(similarities))
    indexed_scores.sort(key=lambda pair: pair[1], reverse=True)
    return indexed_scores

# Show the ranking for each query, numbering queries from 1.
for query_number, doc_sim in enumerate(similarities, start=1):
    ranked_docs = rank_documents(doc_sim)
    print(f"Ranked documents for query {query_number}:", ranked_docs)


def write_results_to_file(ranked_docs_list, filename="result_avinav.txt"):
    """Write the ranked results of every query to a text file.

    Args:
        ranked_docs_list: One ranking per query; each ranking is an iterable
            of ``(doc_index, score)`` pairs with a 0-based ``doc_index``.
        filename: Destination path; an existing file is overwritten.

    Each query gets a header line, one line per document (numbered from 1,
    score formatted to four decimals) and a trailing blank line.
    """
    with open(filename, 'w') as out:
        for query_number, ranking in enumerate(ranked_docs_list, start=1):
            print(f"Query {query_number} Results:", file=out)
            for doc_index, similarity in ranking:
                print(f"Document {doc_index + 1} with score {similarity:.4f}", file=out)
            print(file=out)

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the names previously
# differed in letter case).
output_filename = "result_avinav.txt"

ranked_docs_list = [rank_documents(doc_sim) for doc_sim in similarities]
write_results_to_file(ranked_docs_list, filename=output_filename)

print(f"Results saved to {output_filename}")

[['subaru', 'is', 'a', 'japanese', 'automobile', 'manufacturer', 'known', 'for', 'its', 'distinctive', 'use', 'of', 'boxer', 'engines', 'and', 'all-wheel-drive', '(awd)', 'technology', 'in', 'most', 'of', 'its', 'vehicles.', 'founded', 'in', '1953,', 'subaru', 'is', 'a', 'division', 'of', 'subaru', 'corporation,', 'formerly', 'known', 'as', 'fuji', 'heavy', 'industries.', 'the', 'brand', 'is', 'particularly', 'popular', 'for', 'its', 'reliable,', 'durable,', 'and', 'practical', 'cars', 'that', 'appeal', 'to', 'adventure', 'enthusiasts', 'and', 'outdoor-focused', 'consumers.', "subaru's", 'lineup', 'includes', 'sedans,', 'suvs,', 'and', 'crossovers,', 'with', 'notable', 'models', 'such', 'as', 'the', 'outback,', 'forester,', 'and', 'impreza.', 'the', "company's", 'commitment', 'to', 'safety', 'is', 'a', 'hallmark', 'of', 'its', 'brand,', 'with', 'many', 'of', 'its', 'vehicles', 'equipped', 'with', 'the', 'eyesight', 'driver', 'assist', 'technology,', 'which', 'includes', 'features', 'li