# Text Processing 

In [2]:
from urllib.request import urlopen
import re
import numpy as np

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# opening dataset
data = urlopen("http://www.gutenberg.org/files/36/36-0.txt")

# list holds all the lines from text
text = []

for line in data:
    text.append(str(line))
    
text[1:10]

["b'\\r\\n'",
 'b"Project Gutenberg\'s The War of the Worlds, by H. G. Wells\\r\\n"',
 "b'\\r\\n'",
 "b'This eBook is for the use of anyone anywhere in the United States and most\\r\\n'",
 "b'other parts of the world at no cost and with almost no restrictions\\r\\n'",
 "b'whatsoever. You may copy it, give it away or re-use it under the terms of\\r\\n'",
 "b'the Project Gutenberg License included with this eBook or online at\\r\\n'",
 'b"www.gutenberg.org. If you are not located in the United States, you\'ll have\\r\\n"',
 "b'to check the laws of the country where you are located before using this ebook.\\r\\n'"]

In [4]:
# removing unnecessary characters

fixed_text = [re.sub("[^a-zA-Z]", " ", i.lower()) for i in text[100:]]

# removing single characters
fixed_text = [re.sub(r'\b\w{1,1}\b', '', i) for i in fixed_text]

# removing more than 1 spaces by splitting and rejoinig by 1 space 
fixed_text = [' '.join(fixed_text[i].split()) for i in range(len(fixed_text))]

# removing strings with length < 3
fixed_text = [i for i in fixed_text if len(i) >3]

fixed_text[1:10]

['upon them as impossible or improbable it is curious to recall some of',
 'the mental habits of those departed days at most terrestrial men',
 'fancied there might be other men upon mars perhaps inferior to',
 'themselves and ready to welcome missionary enterprise yet across the',
 'gulf of space minds that are to our minds as ours are to those of the',
 'beasts that perish intellects vast and cool and unsympathetic',
 'regarded this earth with envious eyes and slowly and surely drew their',
 'plans against us and early in the twentieth century came the great',
 'disillusionment']

In [5]:
# removing stopwords

# importing the stopwords list
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words("english")))

# function for removing stop words
def remove_stopwords(text):
    text = [word for word in text if not word in stopwords]
    return text

# removing stop words
for i in range(len(fixed_text)):
    fixed_text[i] = remove_stopwords(fixed_text[i].split())
    
fixed_text[1:10]

[['upon', 'impossible', 'improbable', 'curious', 'recall'],
 ['mental', 'habits', 'departed', 'days', 'terrestrial', 'men'],
 ['fancied', 'might', 'men', 'upon', 'mars', 'perhaps', 'inferior'],
 ['ready', 'welcome', 'missionary', 'enterprise', 'yet', 'across'],
 ['gulf', 'space', 'minds', 'minds'],
 ['beasts', 'perish', 'intellects', 'vast', 'cool', 'unsympathetic'],
 ['regarded', 'earth', 'envious', 'eyes', 'slowly', 'surely', 'drew'],
 ['plans', 'us', 'early', 'twentieth', 'century', 'came', 'great'],
 ['disillusionment']]

In [6]:
# creating vocabulary
vocab = list(set([word for line in fixed_text for word in line]))
# sorting alphabetically
vocab = sorted(vocab)

# appending token for null word, i.e., word for when there is no actual word in the word window
vocab.append("<null>")

# creating a word to index dictionary; will be useful in one_hot encoding
word_to_id = {word:i for i, word in enumerate(vocab)}

# creating a index to word dictionary; will be useful in one_hot decoding
id_to_word = {i:word for i, word in enumerate(vocab)}

## Summary
print("Summary")
print("No. of Sentences: ", len(fixed_text))
print("vocab length: ", len(vocab))

Summary
No. of Sentences:  5546
vocab length:  7013


In [0]:
# function for one hot encoding
def one_hot(text, vocab):
    hot_matrix = np.zeros([len(text), len(vocab)])
    for i in range(len(text)):
        if text[i] in word_to_id:
            hot_matrix[i, word_to_id[text[i]]] = 1

    return hot_matrix

In [0]:
def create_word_window(line):
    """
        line: a single sentence or list of words
        
        append words in the format [ center word, left word, right word] 
    """    
    
    # empty list for each word window
    window = []
    
    # loop over the sentence
    for i in range(len(line)):
        
        # if there's nothing on the left, <null> tag will be appended
        if i == 0 and i != len(line)-1:
            left = "<null>"
            right = line[i+1]
            window.append([line[i], left, right])

        elif i == 0 and i == len(line)-1:
            left = "<null>"
            right = "<null>"
            window.append([line[i], left, right])
            
        elif i == len(line)-1:
            left = line[i-1]
            right = "<null>"
            window.append([line[i], left, right])
        else: 
            left = line[i-1]
            right = line[i+1]
            window.append([line[i], left, right])
            
    return window

In [0]:
# list of word windows

final_text = []

for i in fixed_text:
    w_w = create_word_window(i)
    final_text = final_text + w_w
    
# converting to one hot
final_text = [one_hot(line, vocab) for line in final_text]

In [10]:
final_text_one_hot = np.asarray(final_text)

print("final one hot vector: ",final_text_one_hot[0].shape)

(33205, 3, 7013)