In [13]:
import os
import nltk
import json
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this script relies on: the 'punkt' tokenizer
# models and the stop-word lists. Re-running is harmless; a package that is
# already installed is reported as up-to-date.
# NOTE(review): newer NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Directory holding the .txt documents to index -- edit this for your machine.
docs_folder = r"C:\Users\Admin\Desktop\my_text_files"

# Index state: each term maps to the set of ids of the documents containing
# it (a term seen for the first time starts with an empty set), plus a running
# count of the documents indexed so far.
inverted_index = defaultdict(set)
total_docs = 0

# Text-normalisation helpers shared by every document: a set of English stop
# words for fast membership tests, and a Porter stemmer.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Index every .txt document found directly inside docs_folder.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# first: document ids are then reproducible between runs and machines
# (id k is the k-th .txt file in sorted filename order).
for filename in sorted(os.listdir(docs_folder)):
    if not filename.endswith('.txt'):
        continue  # skip anything that is not a .txt document

    total_docs += 1
    doc_id = total_docs  # ids are 1-based and follow indexing order
    file_path = os.path.join(docs_folder, filename)

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Normalise the text: lowercase, tokenize, keep only alphanumeric tokens
    # that are not stop words, then stem what remains. Stop words are matched
    # against the unstemmed token.
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

    # Record the document once per distinct term.
    for word in set(tokens):
        inverted_index[word].add(doc_id)

print(f"Total documents indexed: {total_docs}")
print(f"Vocabulary size: {len(inverted_index)}")

# Sets are not JSON-serializable, so every posting set becomes a sorted list.
# sorted() accepts any iterable, so no intermediate list() copy is needed.
# The name is rebound to a plain dict that the code below keeps using.
inverted_index = {term: sorted(doc_ids) for term, doc_ids in inverted_index.items()}

# Terms were inserted in set-iteration order, which for strings can differ
# from one interpreter run to the next. sort_keys=True writes them
# alphabetically so the saved file is identical across runs and diffs cleanly;
# the in-memory dict order is left untouched.
with open('inverted_index.json', 'w', encoding='utf-8') as f:
    json.dump(inverted_index, f, indent=4, sort_keys=True)

print("Inverted index saved to 'inverted_index.json'.")

# Preview: print up to ten terms (in the dict's own order) with their
# posting lists. Iterating a dict yields its keys directly.
for term in list(inverted_index)[:10]:
    print(f"'{term}': {inverted_index[term]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


Total documents indexed: 1
Vocabulary size: 34
Inverted index saved to 'inverted_index.json'.
'rapidli': [1]
'industri': [1]
'drive': [1]
'essenti': [1]
'interconnect': [1]
'innov': [1]
'progress': [1]
'work': [1]
'dynam': [1]
'solv': [1]


[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
