## Text Preprocessing for a TensorFlow Model

The first step in working with text data is to pre-process it. I cannot go straight from raw text to fitting a machine learning model. I must clean the text first, which means splitting it into words and handling punctuation and case.

In [2]:
# import libs
import os
import pickle
import re
import string
import nltk
import numpy as np
import pandas as pd
from string import digits
import matplotlib.pyplot as plt

In [3]:
### parameters

# Training file paths, one entry per dataset: index 0 is the movies data,
# index 1 is the reddit data. "source" files hold the comments and
# "target" files hold the replies.
source_path = [
    'data/train.from',
    'data/datasets/reddit/train1.from',
]
target_path = [
    'data/train.to',
    'data/datasets/reddit/train1.to',
]

# Special codes (reserved tokens)
CODES = ['<PAD>', '<EOS>', '<UNK>', '<GO>']

# Sentences with more tokens than this are skipped by vocab_accurance
# and text_to_index.
MAX_TARGET_SEQ_LENGTH = 25


In [4]:
# read file data
def load_data(path):
    ''' Function to read a whole text file into memory
            *args:
                path: file path as string
            *return:
                the complete file content as one string (decoded as UTF-8)
    '''
    file_path = os.path.join(path)
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [5]:
def vocab_accurance(text):
    '''
    Function to build word <-> integer lookup tables from a raw text

    The text is split into lines, each line is tokenized with
    nltk.word_tokenize, and word occurrences are counted. Lines with more
    than MAX_TARGET_SEQ_LENGTH tokens are not counted at all. Only words
    seen more than once receive a code; codes 0-3 are reserved for the
    special tokens and the remaining words are numbered from 4 in the
    order they were first seen.
        *args:
            text: raw string, one sentence per line
        *return:
            vocab_to_int: dictionary {word:int_code}
            int_to_vocab: dictionary {int_code:word}
    '''
    # count how often each token occurs in the sufficiently short lines
    word_counts = {}
    for line in text.split("\n"):
        tokens = list(nltk.word_tokenize(line))
        if len(tokens) > MAX_TARGET_SEQ_LENGTH:
            continue
        for token in tokens:
            word_counts[token] = word_counts.get(token, 0) + 1

    # reserved special tokens always occupy the first four codes
    vocab_to_int = {'<PAD>':0, '<EOS>':1, '<UNK>':2, '<GO>':3}

    # words seen only once are left out of the vocabulary
    frequent_words = [word for word, count in word_counts.items() if count > 1]
    for code, word in enumerate(frequent_words, start=4):
        vocab_to_int[word] = code

    # reverse lookup: code -> word
    int_to_vocab = {}
    for word, code in vocab_to_int.items():
        int_to_vocab[code] = word

    return vocab_to_int, int_to_vocab

In [6]:
def text_to_index(source_text, target_text, target_vocab_to_int, source_vocab_to_int,
                  max_seq_length=None):
    """
    Function to convert sentences (words) to lists of vocabulary indexes

    Both texts are split into sentences on newlines and paired by position.
    Each sentence is split into tokens on single spaces. A pair is kept only
    if neither sentence has more than max_seq_length tokens; empty strings
    produced by consecutive spaces count towards that length but are not
    encoded. Words missing from a lookup table are encoded as '<UNK>', and
    every kept target sentence is terminated with '<EOS>'.

    NOTE(review): vocab_accurance builds its tables with nltk.word_tokenize
    while this function splits on spaces, so the input is presumably already
    tokenized with spaces between tokens -- confirm against the data files.

        *args:
            source_text: raw string text for comments, one sentence per line
            target_text: raw string text for replies, one sentence per line
            target_vocab_to_int: lookup table {word:int_code} for replies
            source_vocab_to_int: lookup table {word:int_code} for comments
                (note the order: the target table comes BEFORE the source table)
            max_seq_length: maximum number of tokens allowed per sentence;
                defaults to the module constant MAX_TARGET_SEQ_LENGTH
        *return:
            source_text_id: A list of lists source_id_text converted
            target_text_id: A list of lists target_id_text converted
        *raises:
            IndexError: if target_text has fewer lines than source_text
            KeyError: if a lookup table lacks the '<UNK>' / '<EOS>' codes
    """
    if max_seq_length is None:
        max_seq_length = MAX_TARGET_SEQ_LENGTH

    # make a list of sentences (extraction)
    source_sentences = source_text.split("\n")
    target_sentences = target_text.split("\n")

    # every source sentence needs a partner; extra target lines are ignored
    if len(target_sentences) < len(source_sentences):
        raise IndexError(
            "target_text has fewer lines (%d) than source_text (%d)"
            % (len(target_sentences), len(source_sentences)))

    # look the special codes up once instead of once per token
    source_unk = source_vocab_to_int['<UNK>']
    target_unk = target_vocab_to_int['<UNK>']
    target_eos = target_vocab_to_int['<EOS>']

    # converted sentences
    source_text_id = []
    target_text_id = []

    for source_sentence, target_sentence in zip(source_sentences, target_sentences):
        # make a list of tokens/words (extraction) from the chosen sentences
        source_tokens = source_sentence.split(" ")
        target_tokens = target_sentence.split(" ")

        # drop the whole pair as soon as one side is too long
        if len(source_tokens) > max_seq_length or len(target_tokens) > max_seq_length:
            continue

        source_token_id = [source_vocab_to_int.get(token, source_unk)
                           for token in source_tokens if token != ""]
        target_token_id = [target_vocab_to_int.get(token, target_unk)
                           for token in target_tokens if token != ""]

        # put <EOS> token at the end of the chosen target sentence
        # this token suggests when to stop creating a sequence
        target_token_id.append(target_eos)

        source_text_id.append(source_token_id)
        target_text_id.append(target_token_id)

    return source_text_id, target_text_id

In [7]:
def preprocess_and_save_data(source_path, target_path, output_path='models/preprocess.p'):
    '''
    Function to preprocess data files and save the following
        1- source_text after encoding to index (comment)
        2- target_text after encoding to index (reply)
        3- vocab_to_int dict
        4- int_to_vocab dict
    The result is pickled as one tuple:
        ((source_ids, target_ids),
         (source_vocab_to_int, source_int_to_vocab),
         (target_vocab_to_int, target_int_to_vocab))
        *args:
            source_path: list of comment files path as string
            target_path: list of reply files path as string
            output_path: file the pickled result is written to
    '''
    # NOTE: only the first entry of each list (the movies data) is used;
    # the reddit files at index 1 are currently not loaded.
    mov_source_text = load_data(source_path[0]).lower()
    mov_target_text = load_data(target_path[0]).lower()

    # create lookup dict for (comment, reply) data
    source_vocab_to_int, source_int_to_vocab = vocab_accurance(mov_source_text)
    target_vocab_to_int, target_int_to_vocab = vocab_accurance(mov_target_text)

    # create list of sentences whose words are represented in index
    # (text_to_index expects the target table before the source table)
    mov_source_ids, mov_target_ids = text_to_index(mov_source_text,
                                                   mov_target_text,
                                                   target_vocab_to_int,
                                                   source_vocab_to_int)

    # make sure the destination folder exists before writing
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save data for later use; the with-block guarantees the file is
    # flushed and closed even if pickling fails
    with open(output_path, 'wb') as out_file:
        pickle.dump((
            (mov_source_ids, mov_target_ids),
            (source_vocab_to_int, source_int_to_vocab),
            (target_vocab_to_int, target_int_to_vocab)), out_file)

In [8]:
# Run the preprocessing pipeline on the configured source and target files;
# the pickled result is written to 'models/preprocess.p'.
preprocess_and_save_data(source_path, target_path)