### Data Preprocessing

In [4]:
import os
import shutil

root_dir = "/home/achal/Downloads/marathi/mr.doc.2010/Marathi-Data"
dest_dir = "/home/achal/Downloads/marathi/mr.doc.2010/Marathi-Data/documents"

os.makedirs(dest_dir, exist_ok=True)

for subdir, _, files in os.walk(root_dir, topdown=False):
    for file in files:
        if file.endswith(".cms.txt") or file.endswith(".htm.txt") or file.endswith(".htm.1.txt") or file.endswith(".htm.2.txt"):
            file_path = os.path.join(subdir, file)
            shutil.move(file_path, os.path.join(dest_dir, file))
for subdir, _, _ in os.walk(root_dir, topdown=False):
    if not os.listdir(subdir):
        os.rmdir(subdir)

print("All .utf8 files moved successfully, and empty folders deleted!")

All .utf8 files moved successfully, and empty folders deleted!


In [7]:
import os
import re

input_dir = "/home/achal/Downloads/marathi/documents"
tags_pattern = r'</?(doc|text)>|<docno>.*?</docno>'
datetime_pattern = r'\[\s*[A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[apAP][mM]\s*\]'
punctuation_pattern = r'[।!?.,:;\'"()\[\]{}॥]+'

for filename in os.listdir(input_dir):
    if filename.endswith(".cms.txt") or filename.endswith(".htm.txt") or filename.endswith(".htm.1.txt") or filename.endswith(".htm.2.txt"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            content = re.sub(tags_pattern, '', content)
            content = re.sub(datetime_pattern, '', content)
            content = re.sub(punctuation_pattern, '', content)
            content = content.lower()
            content = ' '.join(content.split())
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

print("All documents have been cleaned in-place!")

All documents have been cleaned in-place!


### Converting to Pandas Dataframe

In [None]:
import pandas as pd
import os

input_dir = "/home/achal/Downloads/marathi/documents"
data = []

for filename in os.listdir(input_dir):
    if filename.endswith(".cms.txt")  or filename.endswith(".htm.txt") or filename.endswith(".htm.1.txt") or filename.endswith(".htm.2.txt"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        data.append({"filename": filename, "content": content})

df = pd.DataFrame(data)
print(df.head())
df.to_csv("marathi_documents.csv", index=False, encoding="utf-8")

In [8]:
import pandas as pd

file_path = '/kaggle/input/marathi-documents/marathi_documents.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,filename,content
0,2132377.cms.txt,महाराष्ट्र नागपूर विधानभवनासमोर वृद्धाची आत्मह...
1,1802816.cms.txt,महामुंबई दिव्यांनी उजळायची दीप अमावास्या म टा ...
2,1006707.cms.txt,अर्थव्यवहार आयओसी भागभांडवल विकण्याचा प्रस्ताव...
3,963568.cms.txt,संवाद लॉटरी का लागत नाही तुम्हाला कधी लॉटरी ला...
4,Mumbai2A1752ACAA.htm.txt,दरवर्षी तंबाखू घेतो ५० लाख व्यक्तींचा बळी मुंब...
...,...,...
99270,1017374.cms.txt,महामुंबई मुंबई टाइम्स वरळीत सायकल स्पर्धा मुंब...
99271,MaharashtraA294AEB2F0.htm.txt,अशोकच्या कर्मचाऱ्यांना वेतनवाढ लागू होणार - मु...
99272,Mumbai82A9804E43.htm.txt,भाऊ दाजी संग्रहालयप्रकरणी महापौरांचे चौकशीचे आ...
99273,Solapur42BC2D8C97.htm.txt,अंदाजपत्रक दुरुस्ती प्रस्तावावर महापालिकेची शु...


### Stopword Removal

List of stopwords in marathi taken from : [Github](https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.json)

In [9]:
stop_words_marathi = ["अधिक","अनेक","अशी","असलयाचे","असलेल्या","असा","असून","असे","आज","आणि","आता","आपल्या","आला","आली","आले","आहे","आहेत","एक","एका","कमी","करणयात","करून","का","काम","काय","काही","किवा","की","केला","केली","केले","कोटी","गेल्या","घेऊन","जात","झाला","झाली","झाले","झालेल्या","टा","डॉ","तर","तरी","तसेच","ता","ती","तीन","ते","तो","त्या","त्याचा","त्याची","त्याच्या","त्याना","त्यानी","त्यामुळे","त्री","दिली","दोन","न","नाही","निर्ण्य","पण","पम","परयतन","पाटील","म","मात्र","माहिती","मी","मुबी","म्हणजे","म्हणाले","म्हणून","या","याचा","याची","याच्या","याना","यानी","येणार","येत","येथील","येथे","लाख","व","व्यकत","सर्व","सागित्ले","सुरू","हजार","हा","ही","हे","होणार","होत","होता","होती","होते"]

def remove_stopwords(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words_marathi]
    return " ".join(filtered_words)

df["content"] = df["content"].apply(remove_stopwords)
df

Unnamed: 0,filename,content
0,2132377.cms.txt,महाराष्ट्र नागपूर विधानभवनासमोर वृद्धाची आत्मह...
1,1802816.cms.txt,महामुंबई दिव्यांनी उजळायची दीप अमावास्या प्रति...
2,1006707.cms.txt,अर्थव्यवहार आयओसी भागभांडवल विकण्याचा प्रस्ताव...
3,963568.cms.txt,संवाद लॉटरी लागत तुम्हाला कधी लॉटरी लागल्याचे ...
4,Mumbai2A1752ACAA.htm.txt,दरवर्षी तंबाखू घेतो ५० व्यक्तींचा बळी मुंबई ३०...
...,...,...
99270,1017374.cms.txt,महामुंबई मुंबई टाइम्स वरळीत सायकल स्पर्धा मुंब...
99271,MaharashtraA294AEB2F0.htm.txt,अशोकच्या कर्मचाऱ्यांना वेतनवाढ लागू - मुरकुटे ...
99272,Mumbai82A9804E43.htm.txt,भाऊ दाजी संग्रहालयप्रकरणी महापौरांचे चौकशीचे आ...
99273,Solapur42BC2D8C97.htm.txt,अंदाजपत्रक दुरुस्ती प्रस्तावावर महापालिकेची शु...


### Stemming

In [10]:
def marathi_porter_stemmer(word):
    """
    Marathi-specific Porter stemmer to reduce words to their root form by removing suffixes.

    Args:
    - word: The input Marathi word (string).

    Returns:
    - stemmed_word: The root form of the input word (string).
    """
    # Define suffixes for Marathi
    suffixes = {
        1: [u"ा", u"े", u"ी", u"ु", u"ू", u"ो", u"े"],
        2: [u"कर", u"ने", u"नी", u"ना", u"ला", u"ता", u"ते", u"ले", u"वे", u"शी"],
        3: [u"ासाठी", u"ाकर", u"ाने", u"ाया", u"तील", u"तील"],
        4: [u"तांचा", u"तासह", u"बरोबर", u"च्या"],
        5: [u"संपर्कात", u"वाढवलेला"],
    }
    # Iterate over suffixes from longest to shortest
    for L in sorted(suffixes.keys(), reverse=True):
        if len(word) > L + 1:  # Ensure word is long enough
            for suffix in suffixes[L]:
                if word.endswith(suffix):
                    return word[:-L]  # Remove suffix and return root
    return word  # Return original word if no suffix matches


def apply_stemming(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    stemmed_words = [marathi_porter_stemmer(word) for word in words]
    return " ".join(stemmed_words)

df["content"] = df["content"].apply(apply_stemming)
df

Unnamed: 0,filename,content
0,2132377.cms.txt,महाराष्ट्र नागपूर विधानभवनासमोर वृद्धाच आत्महत...
1,1802816.cms.txt,महामुंबई दिव्यां उजळायच दीप अमावास्य प्रतिनिध ...
2,1006707.cms.txt,अर्थव्यवहार आयओस भागभांडवल विकण्याच प्रस्ताव व...
3,963568.cms.txt,संवाद लॉटर लागत तुम्हा कध लॉटर लागल्याच स्वप्न...
4,Mumbai2A1752ACAA.htm.txt,दरवर्ष तंबाख घेत ५० व्यक्तींच बळ मुंबई ३० - तं...
...,...,...
99270,1017374.cms.txt,महामुंबई मुंबई टाइम्स वरळीत सायकल स्पर्ध मुंबई...
99271,MaharashtraA294AEB2F0.htm.txt,अशोक कर्मचाऱ्यां वेतनवाढ लाग - मुरकुट श्रीरामप...
99272,Mumbai82A9804E43.htm.txt,भाऊ दाज संग्रहालयप्रकरण महापौरांच चौकशीच आदेश ...
99273,Solapur42BC2D8C97.htm.txt,अंदाजपत्रक दुरुस्त प्रस्तावावर महापालिकेच शुक्...


### Calculating TF (Term-Frequency) and DF (Document-Frequency)

In [11]:
from collections import defaultdict
import pandas as pd

index = defaultdict(lambda: defaultdict(int)) # inverted indexing

for _, row in df.iterrows(): # Building the posting list
    filename = row["filename"]
    words = row["content"].split()
    for word in words:
        index[word][filename] += 1  # Increase term frequency (TF) for the word in this document

index_data = []
for term, postings in index.items():
    doc_frequency = len(postings)  # Number of documents containing this term (DF)
    for doc, tf in postings.items():
        index_data.append((term, doc, tf, doc_frequency))

index_df = pd.DataFrame(index_data, columns=["Term", "Document", "Term Frequency", "Document Frequency"])
index_df

Unnamed: 0,Term,Document,Term Frequency,Document Frequency
0,महाराष्ट्र,2132377.cms.txt,1,18542
1,महाराष्ट्र,869185.cms.txt,1,18542
2,महाराष्ट्र,857578.cms.txt,2,18542
3,महाराष्ट्र,1236199.cms.txt,1,18542
4,महाराष्ट्र,1933774.cms.txt,1,18542
...,...,...,...,...
16112203,इन्टॅक्‍टकड,Mumbai82A9804E43.htm.txt,1,1
16112204,स्वीकरा,Mumbai82A9804E43.htm.txt,1,1
16112205,संकल्पांचं,1367653.cms.txt,1,1
16112206,विण्डस्क्रीन,1367653.cms.txt,1,1


In [12]:
index_df.describe()

Unnamed: 0,Term Frequency,Document Frequency
count,16112210.0,16112210.0
mean,1.332784,4102.868
std,1.003118,7821.512
min,1.0,1.0
25%,1.0,159.0
50%,1.0,1138.0
75%,1.0,4722.0
max,153.0,68315.0


### Printing Posting List for a term

In [14]:
def print_posting_list(term):
    if term in index:
        print(f"Posting List for '{term}':")
        for doc, tf in index[term].items():
            df = index[term][doc]
            print(f"Document: {doc}, Term Frequency: {tf}, Document Frequency: {df}")
    else:
        print(f"Term '{term}' not found in the index.")

print_posting_list("फुलपाखरासारख") # ex

Posting List for 'फुलपाखरासारख':
Document: 2045406.cms.txt, Term Frequency: 1, Document Frequency: 1
Document: 1628829.cms.txt, Term Frequency: 1, Document Frequency: 1
Document: 1908563.cms.txt, Term Frequency: 1, Document Frequency: 1
Document: 2215300.cms.txt, Term Frequency: 1, Document Frequency: 1
Document: 1890723.cms.txt, Term Frequency: 1, Document Frequency: 1


## Boolean Retrieval

### AND Query

In [18]:
# AND Query: Retrieves documents that contain all specified terms
def and_query(*terms):
    result = set(index.get(terms[0], {}).keys())
    for term in terms[1:]:
        result &= set(index.get(term, {}).keys())
    return result
    
print("AND Query (फुलपाखरासारख AND स्वप्नं):", and_query("फुलपाखरासारख", "स्वप्नं"))

AND Query (फुलपाखरासारख AND स्वप्नं): {'2045406.cms.txt'}


### OR Query

In [16]:
# OR Query: Retrieves documents that contain either of the terms
def or_query(*terms):
    result = set()
    for term in terms:
        result |= set(index.get(term, {}).keys())
    return result

print("OR Query (फुलपाखरासारख OR स्वप्नं):", or_query("फुलपाखरासारख", "स्वप्नं"))

OR Query (फुलपाखरासारख OR स्वप्नं): {'1168532.cms.txt', '1093273.cms.txt', '1789525.cms.txt', '1529202.cms.txt', '1490744.cms.txt', '1153002.cms.txt', '1186453.cms.txt', '1135119.cms.txt', '1033615.cms.txt', '1209808.cms.txt', '917829.cms.txt', '1172198.cms.txt', '1224968.cms.txt', '858720.cms.txt', '1379903.cms.txt', '1319177.cms.txt', '1170828.cms.txt', '878965.cms.txt', '873237.cms.txt', '978406.cms.txt', '1539825.cms.txt', '1998351.cms.txt', '955204.cms.txt', '1313045.cms.txt', '1285564.cms.txt', '1372304.cms.txt', '1353895.cms.txt', '1238366.cms.txt', '1371328.cms.txt', '1102158.cms.txt', '957414.cms.txt', '1410664.cms.txt', 'Nagpur6F5D4D833D.htm.txt', '1281458.cms.txt', '1908563.cms.txt', 'MuktapithD334B5C234.htm.txt', '1081167.cms.txt', '1300405.cms.txt', '958667.cms.txt', '1289742.cms.txt', '1610764.cms.txt', '923434.cms.txt', '1514546.cms.txt', '1019641.cms.txt', '928253.cms.txt', '1139301.cms.txt', '1890723.cms.txt', '1136049.cms.txt', '462542.cms.txt', '1121499.cms.txt', '12

### NOT query

In [17]:
# NOT Query: Retrieves documents that do not contain the specified term
def not_query(term):
    all_docs = set(df["filename"].tolist())
    term_docs = set(index.get(term, {}).keys())  # Documents containing the term
    return all_docs - term_docs # union - given

print("NOT Query (NOT महाराष्ट्र):", not_query("महाराष्ट्र"))

NOT Query (NOT महाराष्ट्र): {'1294003.cms.txt', '1389887.cms.txt', 'MaharashtraDAC66E1549.htm.txt', 'Kolhapur57388CC845.htm.txt', '1294924.cms.txt', '905101.cms.txt', '1475505.cms.txt', '1260197.cms.txt', 'Goa9D2EE4797D.htm.txt', 'Maharashtra3E359D31F4.htm.txt', '1679104.cms.txt', 'Maharashtra72D89C74B8.htm.txt', 'TAJYABATMYASPECIALNEWSINTERNATIONAL2F0153D6C0.htm.txt', 'Goa1E783F0E42.htm.txt', '658472.cms.txt', '880235.cms.txt', '952309.cms.txt', '1172485.cms.txt', '863441.cms.txt', 'Sports6F8848D528.htm.txt', '1395113.cms.txt', '1031557.cms.txt', 'National2290CB58F2.htm.txt', '2007761.cms.txt', '1001381.cms.txt', 'PuneC8AC8AA915.htm.txt', '1702306.cms.txt', '345641.cms.txt', 'Goa95B7EAA53E.htm.txt', '1164412.cms.txt', 'Nasik16651ECF45.htm.txt', '888058.cms.txt', 'Nasik24A12C1063.htm.txt', 'AurangabadC39A550042.htm.txt', 'Solapur19CFA58EC5.htm.txt', 'Arthvishwa7D3E0D0AA5.htm.txt', '1324543.cms.txt', '1186116.cms.txt', '1859384.cms.txt', '1420403.cms.txt', '1173202.cms.txt', '1511590.cm