Importing Libraries

In [18]:
import os
import docx
import pdfplumber
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Folder containing the input files

In [None]:
# Directory that is scanned for input documents; it is passed to load_documents()
# in the execution cell. A relative path is resolved against the kernel's current
# working directory.
INPUT_FOLDER = "input_files"

Function to Read .docx Files

In [20]:
def read_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Args:
        file_path: Path of the document, handed to ``docx.Document``.

    Returns:
        str: The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters (no trailing newline).
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


Function to Read .pdf Files

In [21]:
def read_pdf(file_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file_path: Path of the PDF, handed to ``pdfplumber.open``.

    Returns:
        str: The page texts concatenated in page order, each terminated by
        ``"\\n"``. A page without extractable text contributes only its
        newline; a PDF with no pages yields an empty string.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can return None for a page with no extractable text
        # (for example a scanned image; behaviour varies by pdfplumber
        # version). Fall back to "" so concatenation never raises TypeError.
        # The join is fully evaluated here, while the PDF is still open.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


Function to Read .txt Files

In [22]:
# Function to read text from a TXT file
def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The decoded file contents.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
    # byte-order mark, which some editors write. Without this the BOM would
    # survive as "\ufeff" glued to the first token of the document.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        return file.read()

Function to read a file based on its extension

In [23]:
def extract_text(file_path):
    """Read one file, choosing the reader from the file extension.

    Supported extensions are ``.txt``, ``.docx`` and ``.pdf``, matched
    case-insensitively so that names such as ``REPORT.PDF`` are handled too.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The extracted text, or the literal string
        ``"Unsupported file format."`` when the extension is not recognised.
    """
    lowered_path = file_path.lower()  # compare extensions without regard to case

    if lowered_path.endswith(".txt"):
        return read_txt(file_path)
    if lowered_path.endswith(".docx"):
        return read_docx(file_path)
    if lowered_path.endswith(".pdf"):
        return read_pdf(file_path)

    # Kept as a sentinel string (rather than raising) so existing callers that
    # rely on this return value keep working.
    return "Unsupported file format."


TEXT PRE-PROCESSING

Tokenization (Splitting Text into Words)

In [24]:
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``.

    NOTE(review): ``word_tokenize`` needs NLTK tokenizer data to be installed;
    no download step is visible in this notebook. Confirm it is available on a
    fresh kernel.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

Remove Stopwords (Unimportant Words)

In [25]:
def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    The comparison is an exact match, so tokens are expected to be lower-cased
    already (``tokenize_text`` does this). Token order is preserved.
    """
    english_stopwords = frozenset(stopwords.words("english"))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue  # stopword: drop it
        kept.append(token)
    return kept

In [26]:
import string

def remove_punctuation(tokens):
    """Drop tokens that consist solely of ASCII punctuation characters.

    A token is kept when at least one of its characters is not in
    ``string.punctuation``. Tokens such as ``"k-means"`` or ``".docx"``
    therefore survive, while ``","``, ``"''"``, ``"..."`` and ``"--"`` are
    removed. Empty tokens are removed as well.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The remaining tokens in their original order.
    """
    # Test character by character. A substring test against the
    # string.punctuation string would let multi-character punctuation tokens
    # (e.g. the quote tokens `` and '') slip through, because they are not
    # contiguous substrings of it.
    punctuation_chars = frozenset(string.punctuation)
    return [
        word
        for word in tokens
        if any(char not in punctuation_chars for char in word)
    ]


Lemmatization (Convert to Base Form)

In [27]:
from nltk.stem import WordNetLemmatizer

def lemmatize_tokens(tokens):
    """Return a list with each token passed through ``WordNetLemmatizer.lemmatize``.

    A fresh lemmatizer is created per call and each token is lemmatized with
    the lemmatizer's default arguments. Token order is preserved.
    """
    to_base_form = WordNetLemmatizer().lemmatize
    return list(map(to_base_form, tokens))

Text Preprocessing Pipeline

In [28]:
def preprocess_text(text):
    """Run the full cleaning pipeline on raw text and return the final tokens.

    Stages, in order: tokenize (which also lower-cases), remove stopwords,
    remove punctuation, lemmatize.
    """
    tokens = tokenize_text(text)
    # Each remaining stage takes the token list and returns a new one.
    for stage in (remove_stopwords, remove_punctuation, lemmatize_tokens):
        tokens = stage(tokens)
    return tokens


Function to load documents from a folder

In [29]:
def load_documents(folder_path):
    """Read every supported document directly inside ``folder_path``.

    Supported extensions are ``.txt``, ``.docx`` and ``.pdf``, matched
    case-insensitively. Entries with any other extension, and entries that are
    not regular files (for example sub-directories), are skipped. The folder is
    not searched recursively.

    Args:
        folder_path: Directory to scan.

    Returns:
        list[tuple[str, str]]: ``(filename, text)`` pairs sorted by filename,
        so the result is the same on every run and platform.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    # Extension -> reader, checked in this order.
    readers = (
        (".txt", read_txt),
        (".docx", read_docx),
        (".pdf", read_pdf),
    )

    documents = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue  # a directory named like "notes.txt" must not reach a reader

        lowered_name = filename.lower()
        for extension, reader in readers:
            if lowered_name.endswith(extension):
                documents.append((filename, reader(file_path)))  # Store (filename, text)
                break
    return documents

Program Execution

In [31]:
# Entry point: read every supported file in INPUT_FOLDER, run the preprocessing
# pipeline on each one and print the resulting token list.
# The guard makes this run only when the code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    # load_documents() returns a list of (filename, text) tuples.
    documents = load_documents(INPUT_FOLDER)

    # Print the preprocessed tokens of each document. The whole token list is
    # printed, so the output can be very long for large documents.
    for filename, text in documents:
        print(f"\n  Document : {filename}")
        preprocessed_text = preprocess_text(text)  # list of cleaned, lemmatized tokens
        print(f" --Preprocessed Text : {preprocessed_text}")



  Document : CCNLP readmy.docx
 --Preprocessed Text : ['input', 'user', 'upload', 'multiple', 'document', 'processing', 'extract', 'text', 'document', 'convert', 'text', 'meaningful', 'representation', 'cluster', 'document', 'based', 'similarity', 'generate', 'summary', 'cluster', 'output', 'clustered', 'document', 'summary', 'report', 'group', 'web', 'interface', 'user-friendly', 'ui', 'document', 'upload', 'retrieval', 'backend', 'python', 'fastapi', 'handling', 'request', 'efficiently', 'nlp', 'ml', 'nltk', 'text', 'processing', 'word', 'embeddings', 'vectorization', 'k-means', 'hierarchical', 'clustering', 'dbscan', 'clustering', 'provide', 'great', 'result', 'bert', 'textrank', 'summarization', 'database', 'postgresql', 'document', 'storage', 'retrieval', 'frontend', 'react.js', 'next.js', 'responsive', 'ui', 'requirement', 'installation', 'python-docx', '→', 'extracting', 'text', '.docx', 'file', 'microsoft', 'word', 'pdfplumber', '→', 'extracting', 'text', '.pdf', 'file', 'nltk