### Data Preprocessing

In [None]:
import os
import shutil

# Source tree to flatten, and the single folder that receives every .utf8 document.
root_dir = "/english/en.doc.2010/English-Data"
dest_dir = "/english/en.doc.2010/English-Data/All_Files"

os.makedirs(dest_dir, exist_ok=True)

# Source paths left where they are because dest_dir already holds a file with
# the same name; moving them would silently overwrite that earlier document.
skipped = []

for subdir, _, files in os.walk(root_dir, topdown=False):
    # dest_dir sits inside root_dir, so the walk reaches it as well; the files
    # already collected there must not be "moved" onto themselves.
    if os.path.samefile(subdir, dest_dir):
        continue
    for file in files:
        if file.endswith(".utf8"):
            file_path = os.path.join(subdir, file)
            target_path = os.path.join(dest_dir, file)
            if os.path.exists(target_path):
                skipped.append(file_path)
                continue
            shutil.move(file_path, target_path)

# Bottom-up walk: a child folder is removed before its parent is inspected, so a
# parent that only contained empty folders is found empty and removed as well.
for subdir, _, _ in os.walk(root_dir, topdown=False):
    if not os.listdir(subdir):
        os.rmdir(subdir)

if skipped:
    print(f"{len(skipped)} .utf8 file(s) were NOT moved because a file with the same name already exists in {dest_dir}:")
    for path in skipped:
        print("  ", path)
else:
    print("All .utf8 files moved successfully, and empty folders deleted!")

In [None]:
import os
import re

input_dir = "/home/achal/Downloads/english/documents"

# Patterns whose matches are deleted from every document, in this order:
# DOC/TEXT/DOCNO markup, bracketed timestamps, then runs of punctuation.
tags_pattern = r'</?(DOC|TEXT)>|<DOCNO>.*?</DOCNO>'
datetime_pattern = r'\[\s*[A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[apAP][mM]\s*\]'
punctuation_pattern = r'[।!?.,:;\'\-"()\[\]{}॥]+'

for filename in os.listdir(input_dir):
    if not filename.endswith(".utf8"):
        continue

    file_path = os.path.join(input_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()

    for pattern in (tags_pattern, datetime_pattern, punctuation_pattern):
        text = re.sub(pattern, '', text)

    # Lower-case, then collapse every whitespace run (including newlines) to one space.
    text = ' '.join(text.lower().split())

    # The cleaned text replaces the original file content.
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write(text)

print("All documents have been cleaned in-place!")

### Converting to a pandas DataFrame

In [None]:
import pandas as pd

input_dir = "/home/achal/Downloads/english/documents"
data = []

# os.listdir returns names in arbitrary order; sorting makes the row order
# (and therefore the written CSV) the same on every run.
for filename in sorted(os.listdir(input_dir)):
    if filename.endswith(".utf8"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        data.append({"filename": filename, "content": content})

# Explicit columns keep the two-column schema even when no .utf8 file was found.
df = pd.DataFrame(data, columns=["filename", "content"])
print(df.head())
df.to_csv("documents.csv", index=False, encoding="utf-8")

In [None]:
import pandas as pd

file_path = '/kaggle/input/eng-documents/eng_documents.csv'
# keep_default_na=False: an empty (or "nan"/"null"-looking) text cell stays a
# string instead of being parsed as a float NaN, which the later string
# processing would otherwise have to special-case.
# NOTE(review): assumes the CSV holds only text columns such as filename/content — confirm.
df = pd.read_csv(file_path, keep_default_na=False)
df

### Data Cleaning

In [None]:
# Lower-case prefix (including its trailing space) that is cut from the start of a document.
redundant_title = "the telegraph calcutta "


def clean_title(text):
    """Strip surrounding whitespace and a leading `redundant_title` from `text`.

    The prefix comparison is case-insensitive. When the prefix is present, the
    remainder is returned with its own surrounding whitespace stripped;
    otherwise the stripped text is returned. Non-string values are returned
    unchanged.
    """
    if isinstance(text, str):
        stripped = text.strip()
        has_prefix = stripped.lower().startswith(redundant_title)
        return stripped[len(redundant_title):].strip() if has_prefix else stripped
    return text

# Missing content is turned into an empty string first: astype(str) alone would
# convert NaN into the literal text "nan", which would later be tokenised and
# indexed as if it were a real word.
df["content"] = df["content"].fillna("").astype(str).apply(clean_title)
df

### Stopword Removal

In [None]:
import nltk
from nltk.corpus import stopwords

# Request NLTK's "stopwords" corpus, then keep the English list as a set so the
# per-token membership test in remove_stopwords is a hash lookup.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    """Drop every token of `text` whose lower-cased form is in `stop_words`.

    Tokens are the whitespace-separated pieces of `text`; the survivors are
    re-joined with single spaces. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return " ".join(token for token in text.split() if token.lower() not in stop_words)

# Overwrite the content column with its stop-word-free version, then display the frame.
df["content"] = df["content"].apply(remove_stopwords)
df

### Stemming

In [None]:
from nltk.stem import PorterStemmer

# One Porter stemmer instance, shared by every call to apply_stemming.
stemmer = PorterStemmer()

def apply_stemming(text):
    """Stem every whitespace-separated token of `text` with the shared `stemmer`.

    The stemmed tokens are re-joined with single spaces. Non-string values are
    returned unchanged.
    """
    if isinstance(text, str):
        return " ".join(map(stemmer.stem, text.split()))
    return text

# Replace each document's text with its stemmed form. The column is overwritten
# in place, so re-running this cell feeds already-stemmed text to the stemmer again.
df["content"] = df["content"].apply(apply_stemming)
df

### Calculating TF (Term-Frequency) and DF (Document-Frequency)

In [None]:
from collections import defaultdict
import pandas as pd

# Inverted index: term -> {document filename -> term frequency (TF) in that document}.
index = defaultdict(lambda: defaultdict(int))

# Fill the posting lists one document at a time; every occurrence of a token
# bumps that token's counter for the current document.
for filename, content in zip(df["filename"], df["content"]):
    for word in content.split():
        index[word][filename] += 1

# Flatten the index into one (term, document, TF, DF) record per posting, where
# DF is the number of documents in the term's posting list.
index_data = [
    (term, doc, tf, len(postings))
    for term, postings in index.items()
    for doc, tf in postings.items()
]

index_df = pd.DataFrame(index_data, columns=["Term", "Document", "Term Frequency", "Document Frequency"])
index_df

In [None]:
# Summary statistics of the flattened index; with pandas' defaults this covers
# the numeric columns, i.e. "Term Frequency" and "Document Frequency".
index_df.describe()

### Printing Posting List for a term

In [None]:
def print_posting_list(term):
    """Print the posting list of `term` from the global inverted `index`.

    One line is printed per document containing the term, showing the
    document, the term's frequency in that document (TF) and the term's
    document frequency (DF, the number of documents in the posting list).
    If the term is not a key of `index`, a "not found" message is printed.

    `term` is looked up as-is, so it must match the form of the index keys.
    """
    if term in index:
        postings = index[term]
        # DF belongs to the term, not to a single posting: it is the length of
        # the posting list, identical for every line printed below.
        doc_frequency = len(postings)
        print(f"Posting List for '{term}':")
        for doc, tf in postings.items():
            print(f"Document: {doc}, Term Frequency: {tf}, Document Frequency: {doc_frequency}")
    else:
        print(f"Term '{term}' not found in the index.")

# Example lookup. The index keys are the tokens of df["content"] as they were
# when the index was built, so the term has to be given in that same form.
print_posting_list("samurai")

## Boolean Retrieval

### AND Query

In [None]:
# AND query: retrieve the documents that contain every one of the given terms.
def and_query(*terms):
    """Return the set of documents whose posting lists include all `terms`.

    Terms are looked up as-is in the global inverted `index`; a term that is
    not indexed has an empty posting list, which makes the result empty.
    Calling the function without any term returns an empty set (as
    `or_query()` does) instead of raising IndexError.
    """
    if not terms:
        return set()
    # Iterating a posting dict yields its document keys.
    posting_sets = [set(index.get(term, {})) for term in terms]
    return set.intersection(*posting_sets)
    
# Example: documents containing both terms. "chines" is presumably the stemmed
# index form of "Chinese" — queries are not stemmed, so terms are passed pre-stemmed.
print("AND Query (Chinese AND Samurai):", and_query("chines", "samurai"))

### OR Query

In [None]:
# OR query: retrieve the documents that contain at least one of the given terms.
def or_query(*terms):
    """Return the set of documents appearing in the posting list of any of `terms`.

    Terms are looked up as-is in the global inverted `index`; terms that are
    not indexed contribute nothing. With no terms the result is an empty set.
    """
    # Iterating a posting dict yields its document keys, so union() collects them directly.
    return set().union(*(index.get(term, {}) for term in terms))

# Example: documents containing either term (terms are given in their stemmed index form).
# The label now says OR; it previously read "Chinese AND Samurai" for this OR query.
print("OR Query (Chinese OR Samurai):", or_query("chines", "samurai"))

### NOT Query

In [None]:
# NOT query: retrieve the documents that do not contain the given term.
def not_query(term):
    """Return the filenames in `df["filename"]` that are absent from `term`'s posting list.

    The term is looked up as-is in the global inverted `index`; for a term
    that is not indexed, every filename is returned.
    """
    every_doc = set(df["filename"].tolist())
    # Universe of documents minus the ones whose posting list entry exists.
    return every_doc.difference(index.get(term, {}))

# Example: every document without the term. The printed set holds all remaining
# filenames, so the output can be long.
print("NOT Query (NOT Chinese):", not_query("chines"))