# Preprocessing


1. Case Folding (converting all document text to lowercase)

In [1]:
import os
# Every entry in the corpus directory, sorted so the processing order is
# predictable.
files = sorted(os.listdir('Abstracts'))

# Case folding: rewrite each document in place with its text lower-cased.
for file in files:
    path = 'Abstracts/'+file

    with open(path,'r') as src:
        lowered = src.read().lower()

    with open(path,'w') as dst:
        dst.write(lowered)

print("All Documents are converted to lowercase")


All Documents are converted to lowercase


2. Stopword Removal

In [2]:
stopWords = []

# The stopword file holds one entry per line; surrounding whitespace is
# stripped and blank lines are skipped.
with open('Stopword-List.txt','r') as f:
    stripped_lines = (line.strip() for line in f.read().split('\n'))
    stopWords.extend(entry for entry in stripped_lines if entry)

print("Stopwords array created!")

files = sorted(os.listdir('Abstracts'))
for file in files:
    path = 'Abstracts/'+file
    with open(path,'r') as f:
        text = f.read()

    # Only remove a stopword when it has a space on both sides. A bare
    # substring replace would also hit the inside of longer words: removing
    # "is" that way would turn "heuristic" into "heur tic".
    for stop in stopWords:
        text = text.replace(' '+stop+' ',' ')

    with open(path,'w') as f:
        f.write(text)

print("Stopwords removed from all documents!")



Stopwords array created!
Stopwords removed from all documents!


3. Handling the edge case where a stopword appears immediately before or after a full stop

In [3]:
#Going to use regex for removing stopwords at boundaries
import re

# Compile one whole-word pattern per stopword before touching any file.
# The patterns do not depend on the document being processed, so building
# them inside the per-file loop repeated the same work for every file.
#   - \b on each side restricts the match to word boundaries, so a stopword
#     next to punctuation or at the very start/end of the text is matched,
#     while the same letters inside a longer word are not.
#   - re.escape keeps any regex metacharacters in a stopword literal.
stopword_patterns = [
    re.compile(r'\b' + re.escape(word) + r'\b') for word in stopWords
]

for file in files:
    with open('Abstracts/'+file,'r') as f:
        data = f.read()

    # Apply the patterns one at a time, in stopword-list order. Each hit is
    # replaced by a single space so the neighbouring words do not fuse.
    for pattern in stopword_patterns:
        data = pattern.sub(' ',data)

    with open('Abstracts/'+file,'w') as f:
        f.write(data)

print("Edge Case Handled!")


Edge Case Handled!


4. Removing non-alphabetic characters using regex

In [4]:
# Collapse every run of characters outside A-Z / a-z (digits, punctuation,
# whitespace, newlines, ...) into a single space, rewriting each document
# in place.
for file in files:
    path = 'Abstracts/'+file

    with open(path,'r') as src:
        letters_only = re.sub('[^A-Za-z]+',' ',src.read())

    with open(path,'w') as dst:
        dst.write(letters_only)

print("Punctuations removed from all documents!")

Punctuations removed from all documents!


5. PorterStemmer Time

In [1]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import os

# Sorted listing of the corpus directory.
files = sorted(os.listdir('Abstracts'))

porterStemmer = PorterStemmer()
for file in files:
    path = 'Abstracts/'+file

    with open(path,'r') as f:
        tokens = word_tokenize(f.read())

    # Single-character tokens are dropped; every other token is stemmed.
    stems = [porterStemmer.stem(token) for token in tokens if len(token)>1]

    # Write the stems back over the original file, separated by single spaces.
    with open(path,'w') as f:
        f.write(" ".join(stems))

print("Porter Stemmer stemmed the data the way it needs to be stemmed!")
    

Porter Stemmer stemmed the data the way it needs to be stemmed!


In [46]:
from nltk.stem.porter import PorterStemmer

porterStemmer = PorterStemmer()

# NOTE(review): this cell stems the files in place. If the preceding stemming
# cell has already been run on the same files, their text gets stemmed a
# second time -- confirm that only one of the two cells is meant to be run.
for file in files:
    path = 'Abstracts/'+file

    with open(path,'r') as f:
        words = f.read().split() # whitespace tokenization

    # Stem every token and rebuild the document with single-space separators.
    modified_data = ' '.join(porterStemmer.stem(word) for word in words)

    with open(path,'w') as f:
        f.write(modified_data)

print("Porter Stemmer stemmed the data the way it needs to be stemmed!")
    

Porter Stemmer stemmed the data the way it needs to be stemmed!
