# In [1]:
import re
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalize raw text for tokenization.

    Removes every character that is not an ASCII letter, digit, or
    whitespace, collapses each run of whitespace (spaces, tabs, newlines)
    into a single space, trims the ends, and lower-cases the result.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string with single-space separators.
    """
    # Strip punctuation first so that the whitespace collapse below also
    # removes the gaps left behind by deleted characters ("a - b" -> "a b").
    stripped = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # str.split() with no argument splits on any whitespace run, so tabs act
    # as word separators instead of being deleted (which would glue the
    # neighbouring words together).
    return ' '.join(stripped.split()).lower()

def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)

def stem_text(tokens):
    """Return a list with the Porter stem of each token, in input order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatize_text(tokens):
    """Return a list with the WordNet lemma of each token, in input order.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with the
    lemmatizer's default arguments.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

# In [2]:
# Batch-preprocess every .txt file in `folder` and write the lemmatized
# token list for each one to a same-named file in `output_folder`.
folder = 'Data/text_files/'
output_folder = 'Data/preprocessed_files'

# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(output_folder, exist_ok=True)

file_list = os.listdir(folder)

# Process each file
for filename in file_list:
    # Only plain-text files are preprocessed; anything else is skipped.
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(folder, filename), 'r', encoding='utf8') as file:
        text = file.read()

    # file.read() already returns str, so no extra conversion is needed.
    clean_texts = clean_text(text)

    # Tokenization
    tokens = tokenize_text(clean_texts)

    # Stemming
    # NOTE(review): the stemmed tokens are never written out below; the
    # variable is kept in case a later cell inspects it - confirm and drop
    # this step if it is unused.
    stemmed_tokens = stem_text(tokens=tokens)

    # Lemmatization
    lemmatized_tokens = lemmatize_text(tokens=tokens)

    # Save the preprocessed text to a new file. The output is the Python
    # list repr of the tokens. The encoding is given explicitly so the result
    # does not depend on the platform default and matches the input side.
    output_filename = os.path.join(output_folder, filename)
    with open(output_filename, 'w', encoding='utf8') as output_file:
        output_file.write(str(lemmatized_tokens))