<a href="https://colab.research.google.com/github/Divyanshi-16/Information-Retrieval-3/blob/main/English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*   Name: Divyanshi Chauhan
*   Roll No.: 21074012

*   Discipline: Computer Science and Engineering (IDD)
*   Before running the notebook, upload `business.zip` to the Files section in Google Colab (the first code cell reads it from `/content/business.zip`).

In [None]:
from zipfile import ZipFile

# Location of the corpus archive that this cell unpacks.
file_name = "/content/business.zip"

# The archive is bound to `archive`, not `zip`: a `with` target stays defined in
# the notebook's global namespace after the cell finishes, so naming it `zip`
# would shadow the built-in zip() for every cell that runs afterwards.
with ZipFile(file_name, 'r') as archive:
    # No path argument: members are extracted into the current working directory.
    archive.extractall()
    print('Done')

In [None]:
import math
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download
download('punkt')
download('stopwords')

# Toy corpus: document id -> raw text. Ids are assigned 1..4 in listing order.
documents = dict(enumerate(
    (
        "This is the first document",
        "This document is the second document",
        "And this is the third one",
        "Is this the first document?",
    ),
    start=1,
))

# Preprocessing resources used by preprocess(): one Porter stemmer instance and
# the NLTK English stop-word list held as a set for membership tests.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Turn raw text into a list of stemmed index terms.

    The text is lower-cased and split with word_tokenize. Tokens that are not
    purely alphanumeric, or that appear in stop_words, are dropped; every
    remaining token is passed through stemmer.stem. Token order is preserved.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if not word.isalnum() or word in stop_words:
            continue
        kept.append(stemmer.stem(word))
    return kept

# Create index: term -> {doc_id -> number of times the term occurs in that document}
index = defaultdict(lambda: defaultdict(int))
for doc_id in documents:
    for term in preprocess(documents[doc_id]):
        postings = index[term]
        postings[doc_id] = postings[doc_id] + 1

# Compute TF-IDF
num_docs = len(documents)

# Unnormalized weight of every (term, document) pair: tf * idf, with
# idf = ln(num_docs / number of documents containing the term).
# A term that occurs in every document therefore gets weight 0.
raw_tfidf = {
    term: {doc_id: tf * math.log(num_docs / len(postings)) for doc_id, tf in postings.items()}
    for term, postings in index.items()
}

# Euclidean length of each document's tf-idf vector. The sum of squares must
# run over the terms of ONE document; summing over a term's posting list (all
# documents containing that term) and letting the last term processed win does
# not produce a document length. Every id in `documents` gets an entry (0.0 if
# the document has no weighted terms) so lookups by doc_id never raise KeyError.
squared_lengths = {doc_id: 0.0 for doc_id in documents}
for weights in raw_tfidf.values():
    for doc_id, weight in weights.items():
        squared_lengths[doc_id] += weight ** 2
doc_lengths = {doc_id: math.sqrt(total) for doc_id, total in squared_lengths.items()}

# Length-normalized tf-idf index: term -> {doc_id -> weight}.
# A zero-length document keeps weight 0.0 instead of dividing by zero.
tfidf_index = {
    term: {
        doc_id: (weight / doc_lengths[doc_id]) if doc_lengths[doc_id] else 0.0
        for doc_id, weight in weights.items()
    }
    for term, weights in raw_tfidf.items()
}

# Vectorize query
def vectorize_query(query):
    """Map a query string to a sparse {term: weight} dictionary.

    The query is run through preprocess(); only terms already present in
    `index` are kept. A term's weight is its relative frequency among all
    preprocessed query terms multiplied by
    log(num_docs / number of documents whose postings contain the term).
    """
    query_terms = preprocess(query)
    total_terms = len(query_terms)
    return {
        term: (query_terms.count(term) / total_terms) * math.log(num_docs / len(index[term]))
        for term in query_terms
        if term in index
    }

# Compute cosine similarity
def cosine_similarity(query_vector, doc_id):
    """Return the cosine similarity between a query vector and one document.

    Args:
        query_vector: dict mapping term -> query weight (search() passes the
            result of vectorize_query()).
        doc_id: document key used inside tfidf_index's per-term dictionaries.

    Returns:
        dot(query, document) / |query|. The weights stored in tfidf_index are
        already divided by the document's length when the index is built, so
        the document length is not divided out a second time here; the query
        length is, which keeps the score comparable across queries.
        Returns 0 when the query vector is empty or all-zero.
    """
    query_length = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    if query_length == 0:
        return 0
    # .get(term, {}) keeps a query term that is absent from the index from
    # raising KeyError; such a term simply contributes nothing.
    dot_product = sum(
        weight * tfidf_index.get(term, {}).get(doc_id, 0)
        for term, weight in query_vector.items()
    )
    return dot_product / query_length

# Perform search
def search(query):
    """Score every document in `documents` against the query and rank them.

    Returns a list of (doc_id, score) pairs ordered from highest to lowest
    score, where score is cosine_similarity() of the vectorized query and the
    document.
    """
    query_vector = vectorize_query(query)
    ranked_docs = [(doc_id, cosine_similarity(query_vector, doc_id)) for doc_id in documents]
    ranked_docs.sort(key=lambda pair: pair[1], reverse=True)
    return ranked_docs

query = "This is the second document"
results = search(query)
# Format one report line per ranked document, then emit them in rank order.
report = [f"Document {doc_id}: {documents[doc_id]} - Score: {score:.4f}" for doc_id, score in results]
print("Query:", query)
for line in report:
    print(line)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Query: This is the second document
Document 2: This document is the second document - Score: 1.0437
Document 1: This is the first document - Score: 0.0069
Document 4: Is this the first document? - Score: 0.0069
Document 3: And this is the third one - Score: 0.0000


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import os
import math
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download

download('punkt')
download('stopwords')

# Folder containing documents
folder_path = "/content/business"

# Preprocessing resources used by preprocess(): one Porter stemmer instance and
# the NLTK English stop-word list collected into a set.
stemmer = PorterStemmer()
stop_words = set()
stop_words.update(stopwords.words('english'))

def preprocess(text):
    """Lower-case and tokenize `text`, then return its stemmed index terms.

    A token is kept only if it is purely alphanumeric and is not in
    stop_words; kept tokens are stemmed with stemmer.stem, in original order.
    """
    def is_index_word(token):
        return token.isalnum() and token not in stop_words

    candidates = filter(is_index_word, word_tokenize(text.lower()))
    return list(map(stemmer.stem, candidates))

# Create index
# index:       term   -> {doc_id -> occurrences of the term in that document}
# doc_lengths: doc_id -> number of index terms in the document
index = defaultdict(lambda: defaultdict(int))
doc_lengths = defaultdict(float)

# os.listdir() returns names in arbitrary order. Sorting fixes the insertion
# order of the postings, and therefore the order in which tied documents come
# out of search(), so the ranking is the same on every run.
indexed_files = 0
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".utf8"):  # Assuming all documents are text files with .utf8 extension
        continue
    doc_id = filename.split(".")[0]  # Extract document ID from filename
    # Read the whole file, then close it before the (slower) preprocessing step.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        doc_text = file.read()
    terms = preprocess(doc_text)
    for term in terms:
        index[term][doc_id] += 1
        doc_lengths[doc_id] += 1  # Increment document length
    indexed_files += 1

# One summary line instead of printing every document's full term list, which
# floods the cell output with the entire corpus.
print(f"Indexed {indexed_files} documents, {len(index)} distinct terms")

# Vectorize query
def vectorize_query(query):
    """Return the result of preprocess(query), i.e. the query's index terms."""
    return preprocess(query)

# Perform search
def search(query):
    """Rank documents for `query` by summed tf-idf of the query terms.

    For every term returned by vectorize_query(query) (repeated query terms
    count once per occurrence) and every document in that term's postings,
    the score grows by

        (tf / doc_lengths[doc_id]) * log(N / df)

    where tf is the term's count in the document, df is the number of
    documents in the term's postings and N is len(doc_lengths). The values in
    `index` are raw counts, so the idf factor and the length normalization
    are applied here.

    Returns:
        List of (doc_id, score) pairs, highest score first. Only documents
        containing at least one query term appear.
    """
    query_terms = vectorize_query(query)
    num_docs = len(doc_lengths)
    scores = defaultdict(float)
    for term in query_terms:
        # .get() instead of index[term]: indexing the defaultdict would insert
        # an empty postings entry for every query term not in the corpus.
        postings = index.get(term)
        if not postings:
            continue
        idf = math.log(num_docs / len(postings))
        for doc_id, tf in postings.items():
            scores[doc_id] += (tf / doc_lengths[doc_id]) * idf
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_docs

query = "bengal calcutta telegraph new delhi report anirudh"
results = search(query)

# Show only the best matches. Printing every matching document produced more
# output than the notebook keeps (it was truncated to the last 5000 lines), so
# the top of the ranking was lost. Raise top_k, or slice with results[:],
# to inspect more of the ranking.
top_k = 10
print("Query:", query)
print(f"Top {min(top_k, len(results))} of {len(results)} matching documents:")
for doc_id, score in results[:top_k]:
    print(f"Document {doc_id} - Score: {score:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Document 1070912_business_story_8305616 - Score: 9.0000
Document 1070730_business_story_8124037 - Score: 9.0000
Document 1070724_business_story_8098361 - Score: 9.0000
Document 1070613_business_story_7916247 - Score: 9.0000
Document 1070424_business_story_7688905 - Score: 9.0000
Document 1070125_business_story_7307552 - Score: 9.0000
Document 1070117_business_story_7272611 - Score: 9.0000
Document 1070113_business_story_7256145 - Score: 9.0000
Document 1070714_business_story_8057896 - Score: 9.0000
Document 1070518_business_story_7794647 - Score: 9.0000
Document 1070909_business_story_8294386 - Score: 9.0000
Document 1070821_business_story_8217621 - Score: 9.0000
Document 1070417_business_story_7657024 - Score: 9.0000
Document 1070711_business_story_8042060 - Score: 9.0000
Document 1070703_business_story_8008224 - Score: 9.0000
Document 1070822_business_story_8222208 - Score: 9.0000
Document 1070613_business_story_7916587