Importing Libraries

In [1]:
import os
import docx
import pdfplumber
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Folder Containing the files

In [2]:
INPUT_FOLDER = "input_files"

Function to Read .docx Files

In [3]:
def read_docx(file_path):
    doc = docx.Document(file_path)    # Open the .docx file
    full_text = []                    # List to store paragraphs
    for para in doc.paragraphs:       # Iterate through each paragraph
        full_text.append(para.text)   # Append text to the list
    return "\n".join(full_text)       # Join paragraphs into a single text block /n.join


Function to Read .pdf Files

In [4]:
def read_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:      # Open the PDF file
        for page in pdf.pages:                   # Iterate through each page
            text += page.extract_text() + "\n"   # Extract text and add newline
    return text


Function to Read .txt Files

In [5]:
# Function to read text from a TXT file
def read_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

Function to Read files.

In [6]:
def extract_text(file_path):
    if file_path.endswith(".docx"):   # Check if the file is .docx
        return read_docx(file_path)
    elif file_path.endswith(".pdf"):  # Check if the file is .pdf
        return read_pdf(file_path)
    else:
        return "Unsupported file format."


TEXT PRE - PROCESSING

Tokenization (Splitting Text into Words)

In [7]:
def tokenize_text(text):
    return word_tokenize(text.lower())  # Convert to lowercase and split into words

Remove Stopwords (Unimportant Words)

In [8]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))                # Load stopwords
    return [word for word in tokens if word not in stop_words]  # Remove stopwords

In [9]:
import string

def remove_punctuation(tokens):
    return [word for word in tokens if word not in string.punctuation]  # Remove punctuation marks


Lemmatization (Convert to Base Form)

In [10]:
from nltk.stem import WordNetLemmatizer

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()                        # Initialize lemmatizer
    return [lemmatizer.lemmatize(word) for word in tokens]  # Convert words to base form

Text Preprocessing Pipeline

In [11]:
def preprocess_text(text):
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens) 
    tokens = remove_punctuation(tokens)
    tokens = lemmatize_tokens(tokens)
    return tokens


Function to load documents from a folder

In [12]:
def load_documents(folder_path):
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Extract text based on file type
        if filename.endswith(".txt"):
            text = read_txt(file_path)
        elif filename.endswith(".docx"):
            text = read_docx(file_path)
        elif filename.endswith(".pdf"):
            text = read_pdf(file_path)
        else:
            continue  

        documents.append((filename, text))   # Store (filename, text)
    return documents

Program Execution

In [13]:
if __name__ == "__main__":
    # Load and process all documents
    documents = load_documents(INPUT_FOLDER)

    # Print preprocessed content for each document
    for filename, text in documents:
        print(f"\n  Document : {filename}")
        preprocessed_text = preprocess_text(text)
        print(f" --Preprocessed Text : {preprocessed_text}")


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'input_files'