### Pre-processing of a Text Document: Stop Word Removal, Stemming, and Lemmatization

In [49]:
import nltk
from nltk.corpus import stopwords,PlaintextCorpusReader
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stop-word lookup table; a set gives constant-time membership tests.
# NOTE(review): stopwords.words() is called without a language, which appears
# to pull in the stop-word lists of every language NLTK ships. If the input is
# English only, stopwords.words('english') is probably intended -- confirm.
stopw = set(stopwords.words())

# Input: Filename (entered without the '.txt' extension, which is appended here)
print("Enter Filename : ")
name = input()+'.txt'

# Tokenizer to Read Words only and Ignore Punctuations
# (r'\w+' keeps runs of word characters and drops everything else)
tokenizer = RegexpTokenizer(r'\w+')
with open (name) as fin:
    tokens = tokenizer.tokenize(fin.read())

# Lower-case every token in a single pass. The previous loop looked each token
# up with tokens.index(item), which rescans the list for every element and is
# quadratic in the document length; the resulting list is the same.
tokens = [item.lower() for item in tokens]
print("\nTokens before Stop Word Removal\n",tokens)

# Stopword Removal
# Build a filtered list instead of calling tokens.remove() inside a loop over
# tokens: removing while iterating shifts the remaining elements left, so the
# token immediately after each removed stop word was skipped and consecutive
# stop words were left in the output. Order and duplicates of the kept tokens
# are preserved.
tokens = [item for item in tokens if item not in stopw]
print("\nTokens after Stop Word Removal\n",tokens)

# Stemming
# Reduce each remaining token to its Porter stem, keeping the original order.
ps = PorterStemmer()
stemmed = [ps.stem(word) for word in tokens]
print("\nTokens after Stemming\n",stemmed)

# Lemmatization
# Map each remaining token through the WordNet lemmatizer (default settings),
# keeping the original order.
lem = WordNetLemmatizer()
lemmatized = list(map(lem.lemmatize, tokens))
print("\nTokens after Lemmatization\n",lemmatized)

Enter Filename : 
a

Tokens before Stop Word Removal
 ['deep', 'learning', 'allows', 'computational', 'models', 'that', 'are', 'composed', 'of', 'multiple', 'processing', 'layers', 'to', 'learn', 'representations', 'of', 'data', 'with', 'multiple', 'levels', 'of', 'abstraction', 'these', 'methods', 'have', 'dramatically', 'improved', 'the', 'state', 'of', 'the', 'art', 'in', 'speech', 'recognition', 'visual', 'object', 'recognition', 'object', 'detection', 'and', 'many', 'other', 'domains', 'such', 'as', 'drug', 'discovery', 'and', 'genomics', 'deep', 'learning', 'discovers', 'intricate', 'structure', 'in', 'large', 'data', 'sets', 'by', 'using', 'the', 'backpropagation', 'algorithm', 'to', 'indicate', 'how', 'a', 'machine', 'should', 'change', 'its', 'internal', 'parameters', 'that', 'are', 'used', 'to', 'compute', 'the', 'representation', 'in', 'each', 'layer', 'from', 'the', 'representation', 'in', 'the', 'previous', 'layer', 'deep', 'convolutional', 'nets', 'have', 'brought', 'abou

In [50]:
# Saving the Output in Files
def _append_words(words, filename):
    """Append every entry of `words` to `filename`, one entry per line.

    The file is opened once and closed by the `with` block, so all lines are
    flushed to disk when the function returns. Opening the file anew for every
    word (as was done before) leaked one unclosed handle per word.

    NOTE(review): append mode is kept from the original code, so re-running
    this cell adds a second copy of the words to each file. Switch to "w" if
    the files should be overwritten instead -- confirm the intent.
    """
    with open(filename, "a") as fout:
        for word in words:
            print(word, file=fout)


# dict.fromkeys() drops duplicates while preserving first-seen order.
tokens = list(dict.fromkeys(tokens))
_append_words(tokens, "stopwordremoved.txt")

stemmed = list(dict.fromkeys(stemmed))
_append_words(stemmed, "stemmedword.txt")

lemmatized = list(dict.fromkeys(lemmatized))
_append_words(lemmatized, "lemmatizedword.txt")