### Data Preprocessing

In [3]:
import os
import shutil

root_dir = "/english/en.doc.2010/English-Data"
dest_dir = "/english/en.doc.2010/English-Data/All_Files"

os.makedirs(dest_dir, exist_ok=True)

for subdir, _, files in os.walk(root_dir, topdown=False):
    for file in files:
        if file.endswith(".utf8"):
            file_path = os.path.join(subdir, file)
            shutil.move(file_path, os.path.join(dest_dir, file))

for subdir, _, _ in os.walk(root_dir, topdown=False):
    if not os.listdir(subdir):
        os.rmdir(subdir)

print("All .utf8 files moved successfully, and empty folders deleted!")

All .utf8 files moved successfully, and empty folders deleted!


In [12]:
import os
import re

input_dir = "/home/achal/Downloads/english/documents"
tags_pattern = r'</?(DOC|TEXT)>|<DOCNO>.*?</DOCNO>'
datetime_pattern = r'\[\s*[A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[apAP][mM]\s*\]'
punctuation_pattern = r'[।!?.,:;\'\-"()\[\]{}॥]+'

for filename in os.listdir(input_dir):
    if filename.endswith(".utf8"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            content = re.sub(tags_pattern, '', content)
            content = re.sub(datetime_pattern, '', content)
            content = re.sub(punctuation_pattern, '', content)
            content = content.lower()
            content = ' '.join(content.split())
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

print("All documents have been cleaned in-place!")

All documents have been cleaned in-place!


### Converting to Pandas Dataframe

In [None]:
import pandas as pd

input_dir = "/home/achal/Downloads/english/documents"
data = []

for filename in os.listdir(input_dir):
    if filename.endswith(".utf8"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        data.append({"filename": filename, "content": content})

df = pd.DataFrame(data)
print(df.head())
df.to_csv("documents.csv", index=False, encoding="utf-8")

In [34]:
import pandas as pd

file_path = '/kaggle/input/eng-documents/eng_documents.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,filename,content
0,1040913_sports_story_3751474.utf8,the telegraph calcutta irina relents moscow ru...
1,1070118_business_index.utf8,the telegraph calcutta business fertiliser sub...
2,1050401_calcutta_story_4561404.utf8,the telegraph calcutta metro elegy to the empe...
3,1050820_frontpage_story_5135071.utf8,the telegraph calcutta frontpage puja date for...
4,1060604_sports_story_6310069.utf8,the telegraph calcutta sports highs amp lows d...
...,...,...
125581,1060609_calcutta_story_6326841.utf8,the telegraph calcutta metro timeout a desire ...
125582,1070406_frontpage_story_7613903.utf8,the telegraph calcutta frontpage some still ba...
125583,1060326_sports_story_6016578.utf8,the telegraph calcutta sports wi reach 95/1 on...
125584,1050914_sports_story_5236380.utf8,the telegraph calcutta sports indian women lif...


### Data Cleaning

In [35]:
redundant_title = "the telegraph calcutta "

def clean_title(text):
    if not isinstance(text, str):
        return text
    text = text.strip()
    if text.lower().startswith(redundant_title):
        return text[len(redundant_title):].strip()
    return text

df["content"] = df["content"].astype(str).apply(clean_title)
df

Unnamed: 0,filename,content
0,1040913_sports_story_3751474.utf8,irina relents moscow russias olympic shot put ...
1,1070118_business_index.utf8,business fertiliser subsidy for the current ye...
2,1050401_calcutta_story_4561404.utf8,metro elegy to the emperor azhar alam as shah ...
3,1050820_frontpage_story_5135071.utf8,frontpage puja date for gift of gas find sambi...
4,1060604_sports_story_6310069.utf8,sports highs amp lows day i following are stat...
...,...,...
125581,1060609_calcutta_story_6326841.utf8,metro timeout a desire to make the world appea...
125582,1070406_frontpage_story_7613903.utf8,frontpage some still bat for chappell gavaskar...
125583,1060326_sports_story_6016578.utf8,sports wi reach 95/1 on rainhit day wellington...
125584,1050914_sports_story_5236380.utf8,sports indian women lift title new delhi manga...


### Stopword Removal

In [36]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

df["content"] = df["content"].apply(remove_stopwords)
df

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,filename,content
0,1040913_sports_story_3751474.utf8,irina relents moscow russias olympic shot put ...
1,1070118_business_index.utf8,business fertiliser subsidy current year nearl...
2,1050401_calcutta_story_4561404.utf8,metro elegy emperor azhar alam shah jahan yado...
3,1050820_frontpage_story_5135071.utf8,frontpage puja date gift gas find sambit saha ...
4,1060604_sports_story_6310069.utf8,sports highs amp lows day following statistica...
...,...,...
125581,1060609_calcutta_story_6326841.utf8,metro timeout desire make world appear beautif...
125582,1070406_frontpage_story_7613903.utf8,frontpage still bat chappell gavaskars name bo...
125583,1060326_sports_story_6016578.utf8,sports wi reach 95/1 rainhit day wellington we...
125584,1050914_sports_story_5236380.utf8,sports indian women lift title new delhi manga...


### Stemming

In [37]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def apply_stemming(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

df["content"] = df["content"].apply(apply_stemming)
df

Unnamed: 0,filename,content
0,1040913_sports_story_3751474.utf8,irina relent moscow russia olymp shot put cham...
1,1070118_business_index.utf8,busi fertilis subsidi current year nearli doub...
2,1050401_calcutta_story_4561404.utf8,metro elegi emperor azhar alam shah jahan yado...
3,1050820_frontpage_story_5135071.utf8,frontpag puja date gift ga find sambit saha ca...
4,1060604_sports_story_6310069.utf8,sport high amp low day follow statist highligh...
...,...,...
125581,1060609_calcutta_story_6326841.utf8,metro timeout desir make world appear beauti p...
125582,1070406_frontpage_story_7613903.utf8,frontpag still bat chappel gavaskar name board...
125583,1060326_sports_story_6016578.utf8,sport wi reach 95/1 rainhit day wellington wes...
125584,1050914_sports_story_5236380.utf8,sport indian women lift titl new delhi mangal ...


In [5]:
from collections import defaultdict
import pandas as pd

index = defaultdict(lambda: defaultdict(int)) # inverted indexing

for _, row in df.iterrows(): # Building the posting list
    filename = row["filename"]
    words = row["content"].split()
    for word in words:
        index[word][filename] += 1  # Increase term frequency (TF) for the word in this document

index_data = []
for term, postings in index.items():
    doc_frequency = len(postings)  # Number of documents containing this term (DF)
    for doc, tf in postings.items():
        index_data.append((term, doc, tf, doc_frequency))

index_df = pd.DataFrame(index_data, columns=["Term", "Document", "Term Frequency", "Document Frequency"])
index_df

Unnamed: 0,Term,Document,Term Frequency,Document Frequency
0,irina,1040913_sports_story_3751474.utf8,3,107
1,irina,1061208_sports_story_7112259.utf8,2,107
2,irina,1051017_sports_story_5363583.utf8,1,107
3,irina,1051204_sports_story_5556871.utf8,1,107
4,irina,1060804_sports_story_6567505.utf8,1,107
...,...,...,...,...
21817647,centrallyaircondit,1050401_calcutta_story_4557661.utf8,1,1
21817648,reprograph,1050401_calcutta_story_4557661.utf8,1,1
21817649,prochappel,1070406_frontpage_story_7613903.utf8,1,1
21817650,letsgetsunni,1070406_frontpage_story_7613903.utf8,1,1


In [6]:
index_df.describe()

Unnamed: 0,Term Frequency,Document Frequency
count,21817650.0,21817650.0
mean,1.495464,7643.025
std,2.328547,10682.15
min,1.0,1.0
25%,1.0,603.0
50%,1.0,3401.0
75%,1.0,10444.0
max,293.0,76781.0


In [10]:
def print_posting_list(term):
    if term in index:
        print(f"Posting List for '{term}':")
        for doc, tf in index[term].items():
            df = index[term][doc]
            print(f"Document: {doc}, Term Frequency: {tf}, Document Frequency: {df}")
    else:
        print(f"Term '{term}' not found in the index.")

print_posting_list("samurai") # ex

Posting List for 'samurai':
Document: 1051205_opinion_story_5555401.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1040908_calcutta_story_3728670.utf8, Term Frequency: 2, Document Frequency: 2
Document: 1040913_calcutta_story_3752030.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1050515_foreign_story_4741808.utf8, Term Frequency: 7, Document Frequency: 7
Document: 1050105_calcutta_story_4215009.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1041206_sports_story_4092122.utf8, Term Frequency: 2, Document Frequency: 2
Document: 1061004_sports_index.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1061215_calcutta_story_7139005.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1070822_nation_story_8222142.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1060409_calcutta_story_6075372.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1060705_calcutta_story_6399268.utf8, Term Frequency: 1, Document Frequency: 1
Document: 1041021_fo

In [25]:
# AND Query: Retrieves documents that contain all specified terms
def and_query(*terms):
    result = set(index.get(terms[0], {}).keys())
    for term in terms[1:]:
        result &= set(index.get(term, {}).keys())
    return result
    
print("AND Query (chinese AND samurai):", and_query("chines", "samurai"))

AND Query (calcutta AND telegraph): {'1051212_foreign_story_5589906.utf8', '1050105_calcutta_story_4215009.utf8', '1051205_opinion_story_5555401.utf8', '1040913_calcutta_story_3752030.utf8'}


In [26]:
# OR Query: Retrieves documents that contain either of the terms
def or_query(*terms):
    result = set()
    for term in terms:
        result |= set(index.get(term, {}).keys())
    return result

print("AND Query (chinese AND samurai):", or_query("chines", "samurai"))

AND Query (calcutta AND telegraph): {'1060905_sports_story_6701660.utf8', '1041011_nation_story_3867682.utf8', '1050924_business_story_5276797.utf8', '1050920_foreign_story_5260499.utf8', '1040917_nation_story_3767218.utf8', '1070919_sports_story_8332517.utf8', '1070718_calcutta_hello.utf8', '1061126_business_index.utf8', '1070415_calcutta_hello.utf8', '1041020_sports_story_3904729.utf8', '1070212_foreign_story_7380828.utf8', '1041015_foreign_story_3885829.utf8', '1041129_opinion_story_4063244.utf8', '1070712_sports_story_8047072.utf8', '1070708_calcutta_story_8026106.utf8', '1060114_calcutta_story_5719540.utf8', '1051111_sports_story_5463482.utf8', '1060319_calcutta_story_5983329.utf8', '1060501_sports_story_6167440.utf8', '1050216_calcutta_story_4382676.utf8', '1070319_calcutta_restadd.utf8', '1060829_opinion_story_6657643.utf8', '1050128_nation_story_4306485.utf8', '1070524_calcutta_story_7815934.utf8', '1060421_foreign_story_6126452.utf8', '1040915_calcutta_story_3753281.utf8', '10

In [39]:
# NOT Query: Retrieves documents that do not contain the specified term
def not_query(term):
    all_docs = set(df["filename"].tolist())
    term_docs = set(index.get(term, {}).keys())  # Documents containing the term
    return all_docs - term_docs # union - given

print("AND Query (calcutta AND telegraph):", not_query("chines"))

AND Query (calcutta AND telegraph): {'1060706_sports_story_6442986.utf8', '1060821_calcutta_story_6626025.utf8', '1051231_nation_story_5665410.utf8', '1050524_sports_story_4778840.utf8', '1070628_foreign_story_7986107.utf8', '1060208_nation_story_5819359.utf8', '1060703_opinion_story_6408708.utf8', '1050226_calcutta_story_4424655.utf8', '1060912_calcutta_story_6731174.utf8', '1060610_business_story_6333276.utf8', '1041214_sports_story_4125977.utf8', '1041120_calcutta_story_4025073.utf8', '1060407_opinion_story_6056430.utf8', '1070412_bengal_story_7637876.utf8', '1050304_sports_story_4452329.utf8', '1041114_business_index.utf8', '1070509_calcutta_story_7750052.utf8', '1041025_foreign_story_3921517.utf8', '1050411_sports_story_4600476.utf8', '1050911_bengal_story_5224362.utf8', '1060909_business_story_6722445.utf8', '1060308_frontpage_story_5941755.utf8', '1060902_sports_story_6689643.utf8', '1050128_sports_story_4306709.utf8', '1070504_sports_story_7732495.utf8', '1070723_opinion_index.