In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import os
import sys

In [None]:
def preprocess_text(texts):
    """Clean raw text sentence by sentence and re-join the sentences with '.'.

    ``texts`` is split into sentences with ``sent_tokenize``. Each sentence
    then goes through these steps, in order:

    1. every non-word character (punctuation, symbols) becomes a space;
    2. single ASCII letters standing alone between whitespace are removed;
    3. a single ASCII letter at the very start of the sentence, followed by
       whitespace, is replaced by one space;
    4. runs of whitespace are collapsed to a single space;
    5. the sentence is lower-cased.

    Leading/trailing spaces produced by these steps are not stripped.

    Parameters
    ----------
    texts : str
        The text to clean.

    Returns
    -------
    str
        The cleaned sentences joined by '.' (no space after the period).
        An empty string when ``sent_tokenize`` yields no sentences.
    """
    processed_sentences = []
    for text in sent_tokenize(texts):

        # Removing all special characters
        processed_feature = re.sub(r'\W', ' ', text)

        # Remove all single characters. Lookarounds are used so the
        # surrounding whitespace is not consumed by the match; otherwise in a
        # run such as "a b c" only every other letter would be removed.
        processed_feature = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', '', processed_feature)

        # Remove a single character from the start. The caret must be
        # unescaped to act as the start-of-string anchor; an escaped caret
        # matches a literal '^', which cannot exist after the first step.
        processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)

        # Substituting multiple spaces with single space
        processed_feature = re.sub(r'\s+', ' ', processed_feature)

        # Converting to Lowercase
        processed_feature = processed_feature.lower()

        # Storing each processed sentence in the list
        processed_sentences.append(processed_feature)

    return '.'.join(processed_sentences)

In [None]:
#Removing stopwords

def remove_stopwords(text):
    """Return ``text`` with English stop words removed from every sentence.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    into tokens with ``word_tokenize``. A token is dropped when it appears in
    ``stopwords.words('english')``; the membership test is exact, so casing
    matters. Surviving tokens are joined with single spaces, and the
    sentences are joined with '.'.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The filtered sentences joined by '.'.
    """
    sentence_list = sent_tokenize(text)

    # A set gives constant-time lookups while filtering tokens.
    english_stops = set(stopwords.words('english'))

    return '.'.join(
        ' '.join(token for token in word_tokenize(sentence) if token not in english_stops)
        for sentence in sentence_list
    )

In [None]:
#Lemmatizing the words

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    into tokens with ``word_tokenize``. Every token is passed to
    ``WordNetLemmatizer().lemmatize`` with no extra arguments. Lemmas are
    joined with single spaces, and the sentences are joined with '.'.

    Parameters
    ----------
    text : str
        The text to lemmatize.

    Returns
    -------
    str
        The lemmatized sentences joined by '.'.
    """
    wordnet = WordNetLemmatizer()

    lemmatized_sentences = []
    for sentence in sent_tokenize(text):
        lemmas = map(wordnet.lemmatize, word_tokenize(sentence))
        lemmatized_sentences.append(' '.join(lemmas))

    return '.'.join(lemmatized_sentences)

In [None]:
# Batch driver: for every sub-directory of `path`, run each .txt file through
# the cleaning pipeline (preprocess -> stop-word removal -> lemmatization ->
# preprocess again), write the result next to it as '<name>_pp<ext>' and
# delete the original file.
path = r'C:\Users\Darshan\Music\raman_kannan\NLP\preprocessing\preprocessing_articles\2017'
os.chdir(path)

for d in os.listdir('.'):
    dir_path = os.path.join(path, d)

    # Only descend into real directories; os.chdir() on a stray file in the
    # root folder would raise and abort the whole run.
    if not os.path.isdir(dir_path):
        continue

    os.chdir(dir_path)
    print(f'Directory {d} is getting processed')
    for file in os.listdir('.'):
        # Splitting the basename and extension
        file_n, file_ext = os.path.splitext(file)

        # Compare the extension explicitly. str.find() returns -1 (truthy) on
        # a miss and 0 (falsy) on a match at index 0, so using its result as
        # a condition accepts almost every file name.
        if file_ext.lower() != '.txt' or not os.path.isfile(file):
            continue

        # Skip outputs of an earlier (possibly interrupted) run so that
        # re-running does not turn 'x_pp.txt' into 'x_pp_pp.txt'.
        if file_n.endswith('_pp'):
            continue

        # Reading data from the file. Only the first line is used.
        # NOTE(review): any further lines are discarded and the original is
        # deleted below - confirm every article is stored on a single line.
        with open(file, 'r', encoding='utf-8') as src:
            texts = src.readline()

        # Leave empty files (or files whose first line is blank) untouched.
        if not texts.strip():
            continue

        # Cleaning the text
        clean_sent = preprocess_text(texts)

        # Removing stopwords
        clean_stop = remove_stopwords(clean_sent)

        # Lemmatizing the words
        clean_lemma = lemmatize_text(clean_stop)

        # Second cleaning pass over the lemmatized text
        clean_file = preprocess_text(clean_lemma)

        # Writing it in a file; the context manager guarantees the handle is
        # flushed and closed before the original is removed.
        with open(f'{file_n}_pp{file_ext}', 'w', encoding='utf-8') as fp:
            fp.write(clean_file)

        # Deleting original file
        os.remove(file)
print('Request Completed')
