# settings

In [1]:
import os
import sys
import time
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import re
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import json
import pickle
from sklearn.model_selection import train_test_split
%matplotlib inline

# Pre-trained word embeddings by GloVe
https://github.com/stanfordnlp/GloVe
...Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): glove.6B.zip
Given the nature of the WikiHow dataset, we choose the 'glove.6B' word-embedding files, which are pre-trained on the Wikipedia and Gigaword datasets. The archive contains four .txt files, one per embedding vector length: 50, 100, 200 and 300.

# load WikiHow data

In [2]:
%%time
data = pd.read_csv('../data/wikihowSep.csv')
data = data.astype(str)
rows, columns = data.shape

Wall time: 22.6 s


Expand contractions; adapted from https://github.com/khurram6968/NLP-Expand-Contraction-Python/blob/master/NLP.py

In [3]:
# Lookup table: English contraction -> expanded form.
#
# Corrections relative to the list this was copied from:
#   * "must've" expands to "must have" (was "might have").
#   * Misspelled keys that could never match real text were repaired:
#     "would't" -> "wouldn't", "would't've" -> "wouldn't've",
#     "will't've" -> "won't've", "shall'n't" -> "sha'n't".
#   * "y'all've" expands to "you all have" (was "you have all").
#   * The bare key "cause" was dropped: it is an ordinary English word (and a
#     substring of "because"), so expanding it corrupts normal text.
contraction_map = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd've": "how did have",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

def expand_contractions(sent, mapping):
    """Replace every contraction found in ``sent`` by its expansion.

    Parameters
    ----------
    sent : str
        Text to process.
    mapping : dict[str, str]
        Contraction -> expansion table. Matching is case-insensitive.

    Returns
    -------
    str
        ``sent`` with all contractions expanded. The case of the first letter
        of each matched contraction is carried over to its expansion.
    """
    if not mapping:
        # An empty alternation would match the empty string at every position.
        return sent

    # Longest keys first, so that "couldn't've" is tried before "couldn't";
    # regex alternation otherwise stops at the first alternative that matches.
    keys = sorted(mapping, key=len, reverse=True)
    # The look-arounds anchor a match to whole words, so a key such as "cause"
    # is not matched inside "because". They are used instead of \b because a
    # key may begin or end with an apostrophe, which is not a word character.
    pattern = re.compile(
        r'(?<!\w)(?:{})(?!\w)'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE | re.DOTALL,
    )

    def expand_map(contraction):
        match = contraction.group(0)
        # Exact spelling first, then the lowercase spelling.
        expansion = mapping.get(match)
        if expansion is None:
            expansion = mapping.get(match.lower())
        if expansion is None:
            # The pattern matched case-insensitively on a key that is not
            # stored in lowercase (e.g. only "I'd" is present and the text
            # contains "i'd"); fall back to a linear case-folded scan.
            folded = match.lower()
            expansion = next(
                (value for key, value in mapping.items() if key.lower() == folded),
                match,
            )
        if not expansion:
            return expansion
        # Transfer only the *case* of the first character. Splicing the
        # character itself breaks pairs whose first letters differ, e.g.
        # "ain't" -> "is not" would become "as not".
        first_char = match[0]
        if first_char.isupper():
            return expansion[0].upper() + expansion[1:]
        if first_char.islower():
            return expansion[0].lower() + expansion[1:]
        return expansion

    # expand_map is called once per non-overlapping occurrence of the pattern.
    return pattern.sub(expand_map, sent)

Simple pre-processing; adapted from https://towardsdatascience.com/nlp-building-text-cleanup-and-preprocessing-pipeline-eba4095245a0

In [4]:
def data_loader(dataframe, target_col):
    """Clean and tokenise one text column of ``dataframe``.

    Each entry goes through, in order: accent stripping (NFKD + ASCII),
    contraction expansion, removal of every character that is not an ASCII
    letter, digit, whitespace or one of ``. , ! ?``, repair of a stray comma
    that follows ".<newline>", strip, lowercase, and finally a whitespace
    split wrapped in the literal tokens 'sos' and 'eos'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table.
    target_col : str
        Name of the column to process.

    Returns
    -------
    numpy.ndarray
        1-D object array with one token list per row. Tokens are produced by
        ``str.split``, so retained punctuation stays attached to its word.
    """
    # Compiled once instead of being re-parsed for every sentence.
    # The upper-case range is A-Z: the previous A-z also let "[", "\", "]",
    # "^", "_" and "`" through, because they sit between "Z" and "a" in ASCII.
    special_chars = re.compile(r'[^a-zA-Z0-9.,!?\s]')
    # pat1 = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    stray_comma = re.compile(r'[.]+[\n]+[,]')

    tokenized = []
    # Index by column name: attribute access would return a DataFrame
    # attribute instead of the column when the two names collide.
    for sentence in dataframe[target_col]:
        # Removing Accented Characters
        sentence = unicodedata.normalize('NFKD', str(sentence)).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # Expanding Contractions (must run before apostrophes are stripped)
        sentence = expand_contractions(sentence, contraction_map)
        # Removing Special Characters
        sentence = special_chars.sub('', sentence)
        # Removing Extra Commas
        sentence = stray_comma.sub(".\n", sentence)
        # Removing surrounding whitespace, then lowercasing
        sentence = sentence.strip().lower()
        # tokenize
        tokenized.append(('sos ' + sentence + ' eos').split())

    # Fill a pre-allocated object array element by element: np.array(...,
    # dtype=object) would build a 2-D array of strings whenever every token
    # list happens to have the same length.
    result = np.empty(len(tokenized), dtype=object)
    for position, tokens in enumerate(tokenized):
        result[position] = tokens
    return result

In [5]:
%%time
text_data = data_loader(data, 'text')
headline_data = data_loader(data, 'headline')

Wall time: 24min 45s


Discard unnecessary examples because of limited computational resources.

In [6]:
text_length_threshold = 123 # mean=65.62, std=58.83
headline_ratio_threshold = 0.75

# An example is kept only when its text is shorter than text_length_threshold
# tokens AND its headline is shorter than headline_ratio_threshold times the
# text length; every other index is collected for deletion.
# The scan covers every example. The previous loop ran over data.shape[1],
# which is the number of columns, so only the first few rows were ever checked.
assert len(text_data) == len(headline_data), "text/headline arrays are misaligned"
del_idx = [
    i
    for i, (text_tokens, headline_tokens) in enumerate(zip(text_data, headline_data))
    if not (
        len(text_tokens) < text_length_threshold
        and len(headline_tokens) < headline_ratio_threshold * len(text_tokens)
    )
]
text_data, headline_data = np.delete(text_data, del_idx), np.delete(headline_data, del_idx)

train, test, validation split

In [8]:
# Two-stage split with a fixed seed: hold out 10% of all examples as the test
# set, then hold out 10% of the remainder as the dev set.
split_options = {'test_size': 0.1, 'random_state': 1}

text_train, text_test, headline_train, headline_test = train_test_split(
    text_data, headline_data, **split_options
)

text_train, text_dev, headline_train, headline_dev = train_test_split(
    text_train, headline_train, **split_options
)

Sort examples from longest to shortest text, for more efficient processing.

In [None]:
def data_sorter(text, headline):
    """Sort paired examples by descending text length.

    Parameters
    ----------
    text : sequence of token lists
        Texts; the sort key is ``len`` of each element.
    headline : sequence
        Headlines, aligned element-wise with ``text``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        1-D object arrays ``(text, headline)`` in the new order. The sort is
        stable, so examples with equal text length keep their relative order.
    """
    def as_object_array(items):
        # np.array() on token lists of unequal length raises ValueError on
        # recent NumPy releases, and builds a 2-D array when all lengths are
        # equal; filling a pre-allocated object array avoids both.
        out = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            out[position] = item
        return out

    # Sort the pairs once so text and headline cannot drift apart.
    pairs = sorted(zip(text, headline), key=lambda pair: len(pair[0]), reverse=True)
    sorted_text = [pair[0] for pair in pairs]
    sorted_headline = [pair[1] for pair in pairs]
    return as_object_array(sorted_text), as_object_array(sorted_headline)

In [None]:
# Reorder each split by descending text length; headlines follow their texts.
(text_train, headline_train), (text_test, headline_test), (text_dev, headline_dev) = [
    data_sorter(texts, headlines)
    for texts, headlines in (
        (text_train, headline_train),
        (text_test, headline_test),
        (text_dev, headline_dev),
    )
]

# save data

In [None]:
# test
np.save('../text_train.npy', text_train)
np.save('../headline_train.npy', headline_train)

# dev
np.save('../text_dev.npy', text_val)
np.save('../headline_dev.npy', headline_val)

# test
np.save('../text_test.npy', text_test)
np.save('../headline_test.npy', headline_test)

# Build Vocabulary
https://www.kdnuggets.com/2019/11/create-vocabulary-nlp-tasks-python.html

In [None]:
class Vocabulary:
    """Word <-> index vocabulary with per-word counts.

    Indices 0, 1 and 2 are reserved in ``index2word`` for the PAD, SOS and EOS
    markers; words added through ``add_word`` are numbered from 3 upwards.

    NOTE(review): the reserved markers are registered in ``index2word`` only.
    A literal token such as 'sos' or 'eos' passed to ``add_word`` is treated
    like any other word and receives its own new index - confirm this is the
    intended handling of sentence-boundary tokens.
    """

    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The reserved ids are class attributes and must be reached through
        # self; as bare names they are not in scope inside a method.
        self.index2word = {
            self.PAD_token: "PAD",
            self.SOS_token: "SOS",
            self.EOS_token: "EOS",
        }
        # Next free index; starts after the reserved entries.
        self.num_words = len(self.index2word)
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word):
        """Register ``word`` on first sight, otherwise bump its count."""
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        """Add every word of ``sentence`` and update the sentence statistics.

        ``sentence`` may be a string (split on single spaces) or an iterable
        of already-tokenised words.
        """
        if isinstance(sentence, str):
            words = sentence.split(' ')
        else:
            words = list(sentence)
        for word in words:
            self.add_word(word)
        if len(words) > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = len(words)
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at ``index``; raises KeyError if unknown."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of ``word``; raises KeyError if unknown."""
        return self.word2index[word]

In [None]:
# One vocabulary per side of the data, each built from its training split only.
text_vocab = Vocabulary('text')
headline_vocab = Vocabulary('headline')

for vocab, corpus in ((text_vocab, text_train), (headline_vocab, headline_train)):
    for sentence in corpus:
        vocab.add_sentence(sentence)
    
# leave <PAD>, add when using nn.Embedding(...,padding_idx=...)