## Text Preprocessing for Keras Model

The first step in working with text data is to pre-process it. I cannot go straight from raw text to fitting a machine learning model. I must clean the text first, which means splitting it into words and handling punctuation and case.

After cleaning the text data, I used **Word Embedding**, which is a technique for representing words in a low-dimensional vector space. Each word is represented by a fixed-length vector, and semantic relations between words are captured by this technique.
There are many ways to build the word space; one of them is to use pre-built word embeddings such as [GloVe](https://nlp.stanford.edu/projects/glove/).

In [2]:
# import libs
import os
import re
import sys
import string
import zipfile
import pickle
import nltk
import urllib.request
import numpy as np
from collections import Counter

### Parameters

In [3]:
# Dimensionality of the GloVe vectors (selects which glove.6B.<size>d.txt file is read)
GLOVE_EMBEDDING_SIZE = 100

# Max number of words in each sentence (source)
# NOTE(review): the tokenizing code visible in this notebook truncates both source and
# target sentences with MAX_TARGET_SEQ_LENGTH; this constant is not read there.
MAX_INPUT_SEQ_LENGTH =  40

# Max number of words in each sentence (target)
MAX_TARGET_SEQ_LENGTH = 40

# vocabulary size
# NOTE(review): not read by any code visible in this notebook (the vocabulary is cut
# by word frequency instead) - confirm whether a size cap was intended.
MAX_VOCAB_SIZE = 14000 #1000

# Dataset files path
SOURCE_DATA_PATH = 'data/datasets/train.from'
TARGET_DATA_PATH = 'data/datasets/train.to'

# GloVe vectors file path, built from the embedding size chosen above
GLOVE_MODEL = "data/glove-data/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt"

### Download and load Glove files

In [4]:
def reporthook(block_num, block_size, total_size):
    '''
    Progress callback for urllib.request.urlretrieve; writes the progress to stderr
        *args:
            block_num: number of blocks transferred so far
            block_size: size of one block in bytes
            total_size: total size of the download in bytes (<= 0 when unknown)
    '''
    read_so_far = block_num * block_size
    if total_size > 0:
        # The final block is usually only partly filled, so block_num * block_size
        # overshoots the real size; clamp so the display never exceeds 100%.
        read_so_far = min(read_so_far, total_size)
        percent = read_so_far * 1e2 / total_size
        s = "\r%5.1f%% %*d / %d" % (
            percent, len(str(total_size)), read_so_far, total_size)
        sys.stderr.write(s)
        if read_so_far >= total_size:  # download finished
            sys.stderr.write("\n")
    else:  # total size is unknown
        sys.stderr.write("read %d\n" % (read_so_far,))

In [5]:
def download_glove():
    '''
    Function to download and unzip the GloVe files if GLOVE_MODEL does not exist
    '''
    if os.path.exists(GLOVE_MODEL):
        return

    glove_dir = 'data/glove-data'
    glove_zip = 'data/glove-data/glove.6B.zip'

    # exist_ok avoids failing when the folder is created between check and call
    os.makedirs(glove_dir, exist_ok=True)

    if not os.path.exists(glove_zip):
        print('glove file does not exist, downloading from internet')
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer is never mistaken for a complete archive later.
        partial_zip = glove_zip + '.part'
        urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=partial_zip,
                                   reporthook=reporthook)
        os.replace(partial_zip, glove_zip)

    print('unzipping glove file')
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)

In [6]:
def load_glove():
    '''
    Function to read gloVe files
        * return: dict of all words and their embedding vectors
    '''
    # make sure the gloVe files are on disk
    download_glove()

    # dict with key = word, value = embedding vector (float32 array)
    _word2em = {}
    # 'with' closes the file even if a line fails to parse
    with open(GLOVE_MODEL, mode='rt', encoding='utf8') as file:
        for line in file:
            words = line.strip().split()
            if not words:
                # skip blank lines instead of failing on words[0]
                continue
            _word2em[words[0]] = np.array(words[1:], dtype=np.float32)
    return _word2em

In [8]:
# load the GloVe vectors into memory (load_glove downloads and unzips them first if missing)
word2em = load_glove()

glove file does not exist, downloading from internet


100.0% 862183424 / 862182613


unzipping glove file


### Read Dataset Files

In [7]:
def load_data(path):
    ''' Read a whole text file into memory
            *args:
                path: file path as string
            *return:
                data: raw string text (file decoded as UTF-8)
    '''
    with open(os.path.join(path), 'r', encoding='utf-8') as handle:
        return handle.read()

In [9]:
# load the original training data (comment, reply) for Movies and lower-case the whole text
mov_source_text = load_data(SOURCE_DATA_PATH).lower()
mov_target_text = load_data(TARGET_DATA_PATH).lower()

### Clean Data and Create vocabulary

In [14]:
def vocab_accurance(text, vocab_counter, is_padding):
    '''
    Tokenize every line of the text, count word occurrences and collect the sentences
        *args:
            text: string, one sentence per line
            vocab_counter: Vocabulary counter, updated in place
            is_padding: boolean, wrap each sentence in 'start'/'end' tags or not
        *return:
            text_lst: list of all observation sentences (each one a list of words)
            vocab_counter: Vocabulary counter

    '''
    text_lst = []
    for line in text.split("\n"):
        # tokenize and keep at most MAX_TARGET_SEQ_LENGTH words
        tokens = list(nltk.word_tokenize(line))[:MAX_TARGET_SEQ_LENGTH]
        # count the kept words (the tags added below are not counted)
        for token in tokens:
            vocab_counter[token] += 1
        # wrap the sentence in start/end tags when requested
        if is_padding:
            tokens = ['start'] + tokens + ['end']
        text_lst.append(tokens)
    return text_lst, vocab_counter

In [15]:
# create one vocab counter shared by both sides of the training data (comment, reply)
vocab_counter = Counter()

# preprocess comments (no start/end tags)
input_texts, vocab_counter = vocab_accurance(mov_source_text, vocab_counter, is_padding=False)

# preprocess replies (each sentence wrapped in 'start' ... 'end' tags)
target_texts, vocab_counter = vocab_accurance(mov_target_text, vocab_counter, is_padding=True)

In [16]:
# check the last five data pairs after cleaning and adding the start and end tags
for idx, (input_words, target_words) in enumerate(zip(input_texts[-5:], target_texts[-5:])):
    print('Source:', input_words)
    print('Target:', target_words, '\n')


Source: ['colonel', 'durnford', '...', 'william', 'vereker', '.', 'i', 'hear', 'you', "'ve", 'been', 'seeking', 'officers', '?']
Target: ['start', 'good', 'ones', ',', 'yes', ',', 'mr', 'vereker', '.', 'gentlemen', 'who', 'can', 'ride', 'and', 'shoot', 'end'] 

Source: ['your', 'orders', ',', 'mr', 'vereker', '?']
Target: ['start', 'i', "'m", 'to', 'take', 'the', 'sikali', 'with', 'the', 'main', 'column', 'to', 'the', 'river', 'end'] 

Source: ['lord', 'chelmsford', 'seems', 'to', 'want', 'me', 'to', 'stay', 'back', 'with', 'my', 'basutos', '.']
Target: ['start', 'i', 'think', 'chelmsford', 'wants', 'a', 'good', 'man', 'on', 'the', 'border', 'why', 'he', 'fears', 'a', 'flanking', 'attack', 'and', 'requires', 'a', 'steady', 'commander', 'in', 'reserve', '.', 'end'] 

Source: ['well', 'i', 'assure', 'you', ',', 'sir', ',', 'i', 'have', 'no', 'desire', 'to', 'create', 'difficulties', '.', '45']
Target: ['start', 'and', 'i', 'assure', 'you', ',', 'you', 'do', 'not', 'in', 'fact', 'i', "'d"

In [17]:
# build the vocab from the words that occur more than 3 times
target_word2idx = {}
for idx, (token, count) in enumerate(vocab_counter.most_common()):
    # most_common() yields (word, count) pairs, most frequent first;
    # the stored index is the word's rank, starting at 1
    if count > 3:
        target_word2idx[token] = idx + 1

# reverse lookup: index -> word
target_idx2word = {index: token for token, index in target_word2idx.items()}

In [18]:
# give the 'unknown' token index 0 unless the vocabulary already holds that word
# NOTE(review): target_idx2word is built before this statement and is not updated
# here, so it gets no 0 -> 'unknown' entry - confirm downstream code does not need one.
if 'unknown' not in target_word2idx:
    target_word2idx['unknown'] = 0

In [19]:
# report how many entries the word -> index vocabulary holds
print('Vocabulary size:',len(target_word2idx))

Vocabulary size: 15856


### Embedding Inputs

In [20]:
# Embed the words of every comment (source sentence) with their GloVe vectors

input_texts_word2em = []

# longest sentence in the dataset (source side / target side)
encoder_max_seq_length = 0
decoder_max_seq_length = 0

# embedding input of encoder
for input_sentance, target_sentance in zip(input_texts, target_texts):
    # each sentence becomes a list with one vector per word
    encoder_input_wids = []
    for word in input_sentance:
        emb = word2em.get(word)
        if emb is None:
            # Words without a GloVe entry get a zero vector. It is created only when
            # needed and as float32, so it matches the dtype of the loaded GloVe
            # vectors (np.zeros would otherwise default to float64).
            emb = np.zeros(GLOVE_EMBEDDING_SIZE, dtype=np.float32)
        encoder_input_wids.append(emb)

    input_texts_word2em.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_sentance), decoder_max_seq_length)

### Save Parameters

In [21]:
# number of decoder tokens: one more than the number of index -> word entries
# (presumably the extra slot covers index 0, which the vocabulary ranks starting
# at 1 never use - TODO confirm)
num_decoder_tokens = len(target_idx2word)+1

In [22]:
# gather the vocabulary-size and sequence-length values in one dict
context = {
    'num_decoder_tokens': num_decoder_tokens,
    'encoder_max_seq_length': encoder_max_seq_length,
    'decoder_max_seq_length': decoder_max_seq_length,
}
print(context)

{'decoder_max_seq_length': 42, 'encoder_max_seq_length': 40, 'num_decoder_tokens': 15857}


In [23]:
# save dicts
# make sure the output folder exists so the cell also works on a fresh checkout
os.makedirs('models', exist_ok=True)
# 'with' flushes and closes the file once the dump finishes (or fails)
with open('models/preprocess.p', 'wb') as preprocess_file:
    pickle.dump((context,
                 input_texts_word2em,
                 target_texts,
                 word2em,
                 (target_word2idx, target_idx2word)), preprocess_file)