<a href="https://colab.research.google.com/github/Divyanshi-16/Information-Retrieval-3/blob/main/Marathi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*   Name: Divyanshi Chauhan
*   Roll No.: 21074012

*   Discipline: Computer Science and Engineering (IDD)
*   Upload the `source.zip` file to the Files section in Google Colab before running the cells below.

In [None]:
from zipfile import ZipFile

file_name = "/content/source.zip"

# Unpack the uploaded archive into the current working directory.
# The handle is called `archive` rather than `zip`: a module-level name `zip`
# would shadow the built-in zip() for every later cell in the session.
with ZipFile(file_name, 'r') as archive:
    archive.extractall()
    print('Done')

Done


In [None]:
stopwords_file_path = '/content/stopwords-mr.txt'

# Read the whole stopword file and keep each whitespace-separated token once.
with open(stopwords_file_path, mode='r', encoding='utf-8') as file:
    stopwords_marathi = {*file.read().split()}

In [None]:
import math
import os
import re
from collections import Counter, defaultdict

from nltk.tokenize import word_tokenize

def load_marathi_stopwords(stopwords_file_path):
    """Load a stopword list from a UTF-8 text file with one entry per line.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so an entry such as ``"आणि  "`` still matches the token ``"आणि"``
    and the empty string never ends up in the result.

    Args:
        stopwords_file_path: Path of the stopword file.

    Returns:
        A set containing the distinct, non-empty stopwords.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(stopwords_file_path, 'r', encoding='utf-8') as stopwords_file:
        lines = stopwords_file.read().splitlines()
    stripped = (line.strip() for line in lines)
    return {word for word in stripped if word}

# Suffixes the stemmer strips. The order inside this tuple does not matter:
# suffixes are bucketed by their real length below, so a suffix can never be
# filed under the wrong length.
_MARATHI_SUFFIXES = (
    "ाप्रमाणे", "ातसुद्धा",
    "ापर्यंत", "प्रमाणे", "तसुद्धा", "ापूर्वक", "ालेल्या",
    "पर्यंत", "ापासून", "ाकडेही", "पूर्वक", "लेल्या", "ामध्ये",
    "ामधले", "ाच्या", "ान्या", "ाऱ्या", "ाख्या", "ासाठी", "पासून", "ाकडून",
    "ामुळे", "ावरून", "कडेही", "ानीही", "ापाशी", "ामधला", "मध्ये",
    "मधले", "ातील", "च्या", "न्या", "ऱ्या", "ख्या", "साठी", "ातून", "कडून",
    "मुळे", "वरून", "नीही", "ातही", "ातपण", "ाकडे", "पाशी", "ाहून", "ापणे",
    "मधला",
    "ाचा", "ाचे", "तील", "ानी", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं",
    "तून", "तही", "तपण", "कडे", "ातच", "हून", "पणे", "ाही", "ाले", "ावर",
    "चा", "चे", "ने", "नी", "ना", "ते", "ीं", "ात", "ाँ", "ां", "ों", "ें",
    "तच", "ता", "ही", "ले", "वर",
    "ो", "े", "ू", "ु", "ी", "ि", "ा", "च",
)


def _group_suffixes_by_length(suffixes):
    """Return ``((length, suffixes), ...)`` ordered from longest to shortest."""
    grouped = {}
    for suffix in suffixes:
        grouped.setdefault(len(suffix), []).append(suffix)
    return tuple(
        (length, tuple(group))
        for length, group in sorted(grouped.items(), reverse=True)
    )


# Built once at import time instead of on every call.
_SUFFIXES_BY_LENGTH = _group_suffixes_by_length(_MARATHI_SUFFIXES)


def perform_marathi_stemming(word):
    """Strip the longest known suffix from a Marathi word.

    Suffixes are tried from longest to shortest. A suffix is removed only
    when at least two characters of the word remain afterwards, and exactly
    ``len(suffix)`` characters are removed.

    Args:
        word: A single token.

    Returns:
        The word without its longest matching suffix, or the word unchanged
        when no suffix applies.
    """
    for length, group in _SUFFIXES_BY_LENGTH:
        # str.endswith accepts a tuple, so one call tests the whole bucket.
        if len(word) > length + 1 and word.endswith(group):
            return word[:-length]
    return word

def process_marathi_text(text, marathi_stopwords):
    """Convert raw text into a list of stemmed, stopword-free tokens.

    Args:
        text: The raw document or query string.
        marathi_stopwords: Iterable of stopwords to discard.

    Returns:
        The remaining tokens, in their original order, after each has been
        passed through ``perform_marathi_stemming``.
    """
    # Any run of characters that is neither a space nor inside the character
    # range written in the pattern is collapsed into a single space.
    # NOTE(review): Devanagari digits and danda punctuation appear to fall
    # inside the kept range, so they survive this step - confirm that is
    # intended.
    cleaned = re.sub(r'[^ ऀ-ॿ]+', ' ', text)

    excluded = set(marathi_stopwords)

    # Filter out stopwords first, then stem what is left.
    return [
        perform_marathi_stemming(token)
        for token in cleaned.split()
        if token.lower() not in excluded
    ]

def create_index(folder_path, marathi_stopwords):
    """Build term statistics for every ``.txt`` file in a folder.

    Each file is read as UTF-8, tokenised with ``process_marathi_text`` and
    counted once, so the cost per document is linear in its token count.

    Args:
        folder_path: Directory holding the documents. Only entries whose name
            ends with ``.txt`` are indexed.
        marathi_stopwords: Stopwords forwarded to ``process_marathi_text``.

    Returns:
        A ``(term_frequency, document_frequency, posting_list)`` tuple:

        * ``term_frequency[term][filename]`` is the number of occurrences of
          ``term`` in that file;
        * ``document_frequency[term]`` is the number of files containing
          ``term``;
        * ``posting_list[term]`` is a list of ``(filename, count)`` pairs.
    """
    term_frequency = defaultdict(lambda: defaultdict(int))
    document_frequency = defaultdict(int)
    posting_list = defaultdict(list)

    # os.listdir() returns names in arbitrary order; sorting makes the order
    # of the postings reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # One pass over the tokens replaces a list.count() call per term.
        term_counts = Counter(process_marathi_text(text, marathi_stopwords))

        for term, count in term_counts.items():
            term_frequency[term][filename] += count
            document_frequency[term] += 1
            posting_list[term].append((filename, count))

    return term_frequency, document_frequency, posting_list

def vectorize_query(query, marathi_stopwords):
    """Build a term-count vector for a query string.

    Args:
        query: The raw query text.
        marathi_stopwords: Stopwords forwarded to ``process_marathi_text``.

    Returns:
        A ``defaultdict(float)`` mapping each processed query term to the
        number of times it occurs in the query.
    """
    weights = defaultdict(float)
    for token in process_marathi_text(query, marathi_stopwords):
        # Missing keys start at 0.0, so every stored weight is a float.
        weights[token] = weights[token] + 1
    return weights

def compute_tf_idf(term_frequency, document_frequency, num_docs, doc_lengths):
    """Compute log-scaled tf-idf weights for every (term, document) pair.

    The weight is ``(1 + ln(tf)) * ln(num_docs / df)``.

    Args:
        term_frequency: Mapping ``term -> {doc: raw count}``.
        document_frequency: Mapping ``term -> number of documents`` that
            contain the term.
        num_docs: Total number of documents in the collection.
        doc_lengths: Accepted for backward compatibility but not used. Length
            normalisation is left to the scoring step (``cosine_similarity``
            divides each score by the document length), so dividing here as
            well would apply the length twice.

    Returns:
        A ``defaultdict(dict)`` mapping ``term -> {doc: tf-idf weight}``.
    """
    tfidf_index = defaultdict(dict)
    for term, doc_tf in term_frequency.items():
        idf = math.log(num_docs / document_frequency[term])
        for doc, tf in doc_tf.items():
            tfidf_index[term][doc] = (1 + math.log(tf)) * idf
    return tfidf_index

def compute_doc_lengths(posting_list, num_docs):
    """Compute the Euclidean length of every document's tf-idf vector.

    Each term contributes the weight ``(1 + ln(tf)) * ln(num_docs / df)``,
    where ``df`` is the number of postings for the term. The length of a
    document is the square root of the sum of its squared weights.

    Args:
        posting_list: Mapping ``term -> [(doc, raw count), ...]``.
        num_docs: Total number of documents in the collection.

    Returns:
        A plain dict mapping each document to its vector length.
    """
    squared_lengths = defaultdict(float)
    for postings in posting_list.values():
        # A term without postings has no document frequency to divide by.
        if not postings:
            continue
        idf = math.log(num_docs / len(postings))
        for doc, tf in postings:
            weight = (1 + math.log(tf)) * idf
            # Weights must be squared before summing to get a Euclidean norm.
            squared_lengths[doc] += weight * weight
    return {doc: math.sqrt(total) for doc, total in squared_lengths.items()}

def cosine_similarity(query_vector, tfidf_index, doc_lengths):
    """Score documents against a query with a length-normalised dot product.

    For every query term found in the index, the product of the query weight
    and the document weight is added to that document's score. Each score is
    then divided by the document's length. The query's own length is not
    applied.

    Because the division by ``doc_lengths`` happens here, the weights stored
    in ``tfidf_index`` should not already be length-normalised; otherwise the
    document length is applied twice.

    Args:
        query_vector: Mapping ``term -> query weight``.
        tfidf_index: Mapping ``term -> {doc: document weight}``.
        doc_lengths: Mapping ``doc -> vector length``.

    Returns:
        A ``defaultdict(float)`` mapping each matched document to its score.
        A document whose length is zero scores ``0.0``.

    Raises:
        KeyError: If a matched document is missing from ``doc_lengths``.
    """
    scores = defaultdict(float)
    for term, query_weight in query_vector.items():
        for doc, doc_weight in tfidf_index.get(term, {}).items():
            scores[doc] += query_weight * doc_weight

    for doc in scores:
        length = doc_lengths[doc]
        # A zero-length document has only zero weights; avoid dividing by 0.
        scores[doc] = scores[doc] / length if length else 0.0
    return scores

folder_path = '/content/source'

stopwords_file_path = '/content/stopwords-mr.txt'
marathi_stopwords = load_marathi_stopwords(stopwords_file_path)

# Build the index, then derive document lengths and tf-idf weights from it.
term_frequency, document_frequency, posting_list = create_index(folder_path, marathi_stopwords)
# Count only the .txt entries, because those are the files create_index()
# reads; counting every directory entry would inflate the idf values.
num_docs = sum(1 for name in os.listdir(folder_path) if name.endswith('.txt'))
doc_lengths = compute_doc_lengths(posting_list, num_docs)

tfidf_index = compute_tf_idf(term_frequency, document_frequency, num_docs, doc_lengths)

query = "आपलं समन्वय"
query_vector = vectorize_query(query, marathi_stopwords)
scores = cosine_similarity(query_vector, tfidf_index, doc_lengths)

# Print top 5 relevant documents
print("Query:", query)
ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]
print("Relevant documents:")
for doc, score in ranked_docs:
    print(f"Document: {doc}, Score: {score:.4f}")

Query: आपलं समन्वय
Relevant documents:
Document: Kolhapur2414285894.htm.txt, Score: 0.0229
Document: GoaD5AE4D8881.htm.txt, Score: 0.0196
Document: Mumbai8E244E3F6E.htm.txt, Score: 0.0189
Document: Maharashtra24998CBA97.htm.txt, Score: 0.0166
Document: National91D06E120F.htm.txt, Score: 0.0152
