# Generate training data

### ideas not yet done

1. if a,b are labeled as same question pair, we can easily gen new training data by

        a. for c!=a  =>  c!=b
        b. for c==b && d==a  =>  c==d
        

In [3]:
import numpy as np
import pandas as pd
import re
import pickle
import json

## Load data

In [5]:
df_train = pd.read_csv('../dataset/raw/train.csv', delimiter=',')
df_test = pd.read_csv('../dataset/raw/test.csv', delimiter=',')
df_train_size = 404290
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
all_questions = np.concatenate([np.array(df_train['question1']), np.array(df_train['question2'])])

## Parse words

Identify the words which are worth for embedding.

In [36]:
def clean_string(text):
    # Clean the text
    text = re.sub("what's", "what is ", text)
    text = re.sub("\'s", " ", text)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "cannot ", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am ", text)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub(" e.g.", " eg ", text)
    text = re.sub(" b.g.", " bg ", text)
    text = re.sub("(\d+)(kK)", "\g<1>000", text)
    text = re.sub("e-mail", "email", text)
    
    return text

def parse_single_words(question, idx=None):
    
    '''
    Usage:

        Use regex parse all single words without special characters and non-ascii terms

    Returns:

        List of string which is identified as a single word without special characters

    Testcases:

        # test first ten questions in dataset
        for i in range(10):
            print(i)
            print(all_questions[i])
            print(parse_single_words(all_questions[i]))
            
        # another good testcase
        print(parse_single_words(" '我是Hubert Lin, 這 code? 看起來還不錯ㄅ ? ha-ha-ha' "))

    '''
    
    ascii_only = False
    
    # identify special characters that separate words : (space) ' ! " ? @ ^ + * / . , ~ ( ) [ ] { } & | ` $ % = : ; < > -
    separator_include_ascii = '(?=[\s\'!"?@\^+*/\.,~\(\)\[\]\{\}\&\|`\$\%\=:;\<\>\-]|[^\x00-\x7F])'
    separator = '(?=[\s\'!"?@\^+*/\.,~\(\)\[\]\{\}\&\|`\$\%\=:;\<\>\-])'
    
    single_word = '[a-zA-Z0-9]+[a-zA-Z0-9_]*' # rule is at least one eng character or number
    
    try:
        question = clean_string(question)
        if not ascii_only:
            words = re.findall( single_word + separator_include_ascii + '|[^\x00-\x7F]+' , question)

        elif ascii_only:
            words = re.findall( single_word + separator , question)
    except:
        idx = idx if idx<df_train_size else idx-df_train_size
        print('Got execption while handling' , idx, '-th question: ', question)
        print('There are two question pairs has empty string, got to deal with that')
        words = []
        
    return words

In [37]:
# Can't do this since return values are lists with different lengths, 
# this will cause error while np tries to transform all results into np array
#
# parse_single_words = np.vectorize(parse_single_words)
# all_questions_parsed = parse_single_words(all_questions)

all_questions_parsed = [parse_single_words(question, idx=i) for i,question in enumerate(all_questions)]

Got execption while handling 105780 -th question:  nan
There are two question pairs has empty string, got to deal with that
Got execption while handling 201841 -th question:  nan
There are two question pairs has empty string, got to deal with that


In [38]:
flatten_word_vec = np.concatenate(all_questions_parsed)
unique_word_vec, unique_word_count = np.unique(flatten_word_vec, return_counts=True)

In [40]:
# Remove noise words, whom occurs less than threshold
# More words may increase performance, but increases size of word-embedding matrix
occurrence_threshold = 5
selected_words = unique_word_vec[unique_word_count>=occurrence_threshold]

print(len(unique_word_vec), ' unique words')
print(len(selected_words), 'words occurs more (or equal) than', occurrence_threshold , 'times')

json.dump(list(selected_words), open('../dataset/processed/selected_words.json', 'w'))

109755  unique words
35648 words occurs more (or equal) than 5 times


In [49]:
import random

trash_words = unique_word_vec[np.logical_and(unique_word_count>3, unique_word_count<5)]
' | '.join([unique_word_vec[random.randint(0,len(trash_words)-1)] for i in range(50)])

'0dev | 2f | 4d | 797 | 182cm | 002 | 6yr | 820s | 4mm | 1550nm | 1th | 1180 | 1m3 | A1c | 1985 | AFBC | 2000 | AA | 2cgpa | 8300 | 1267 | ANSYS | 7segment | 11808 | 37a | 12071 | 80000m | 5lbs | 3121 | AFFECTIVE | 63000gs | 200000ph | AGO | 692 | 100000m | ASTRONOMY | 7mo | 20cm | 800rpm | AUTOCAD | 7500u | 7engine | 291 | 5500U | 8440p | 09 | 1334 | 4167 | 5cm | 38000'

## Generate encode/decode mapping

Encode the question into integers lists, this can reduce time consumption while we do word embedding on training phase

In [50]:
def build_enc_dec_mapping(words):
    """
    enc_map: voc --encode--> id
    dec_map: id --decode--> voc
    """
    
    enc_map, dec_map = {}, {}
    
    # add special control characters
    # <ST>    : start of question
    # <ED>    : end of question
    # <RARE>  : rare words occurrence is lower than threshold
    # <EMPTY> : no word in such position
    special_characters = [ '<ST>', '<ED>', '<RARE>', '<EMPTY>']
    sp_len = len(special_characters)
    for i,sp in enumerate(special_characters):
        enc_map[sp] = i
        dec_map[i] = sp
    
    # add our words into mapping
    for i,word in enumerate(words):
        enc_map[word] = sp_len+i
        dec_map[sp_len+i] = word
    
    return enc_map, dec_map

In [52]:
enc_map, dec_map = build_enc_dec_mapping(selected_words)

# save enc/decoding map to disk
# json.dump(enc_map, open('dataset/processed/enc_map.json', 'w'))
# json.dump(dec_map, open('dataset/processed/dec_map.json', 'w'))
pickle.dump(enc_map, open('../dataset/processed/enc_map.pkl', 'wb'))
pickle.dump(dec_map, open('../dataset/processed/dec_map.pkl', 'wb'))
vocab_size = len(dec_map)

In [59]:
def enc_question(question, enc_map):
    
    if type(question)!=str or question=="":
        return [enc_map['<ST>'], enc_map['<ED>']]
    
    def map_wrod(word):
        return enc_map[word] if word in enc_map else enc_map['<RARE>']
         
    
    # identify special characters that separate words : (space) ' ! " ? @ ^ + * / . , ~ ( ) [ ] { } & | ` $ % = : ; < >  
    separator = '(?=[\s\'!"?@\^+*/\.,~\(\)\[\]\{\}\&\|`\$\%\=:;\<\>\-]|$)'
    single_word = '[^\s\-]+' # non-empty is enough here
    
    words_list = re.findall(single_word+separator, question)
    
    return [enc_map['<ST>']] + [map_wrod(word) for word in words_list] + [enc_map['<ED>']] 
    
    
def dec_question(question, dec_map):
    return [dec_map[enc_value] for enc_value in question]

In [60]:
enc_map = pickle.load(open('../dataset/processed/enc_map.pkl', 'rb'))
enc_question("two-year-old", enc_map)

[0, 34035, 35445, 27101, 1]

In [61]:
# testcase
question = 'glove is a good library'
enc = enc_question(question, enc_map)
dec = dec_question(enc, dec_map)

print('Original question: ', question)
print('Encoded: ', enc)
print('Decoded: ', dec)

Original question:  glove is a good library
Encoded:  [0, 22216, 24245, 14111, 22258, 24979, 1]
Decoded:  ['<ST>', 'glove', 'is', 'a', 'good', 'library', '<ED>']


## Encode original training data frame and store it

In [62]:
enc_map = pickle.load(open('../dataset/processed/enc_map.pkl','rb'))

In [63]:
def preprocess_df(df, enc_map):
    
    def process_frame(frame):
        frame['question1'] = enc_question(frame['question1'], enc_map)
        frame['question2'] = enc_question(frame['question2'], enc_map)
        return frame
    
    return df.apply(process_frame, axis=1)

In [64]:
df_preprocessed = preprocess_df(df_train, enc_map)

# remove these two useless columns
del df_preprocessed['qid1']
del df_preprocessed['qid2']
# df_preprocessed.to_csv('../dataset/processed/processed_training_data.csv', index=False)
pickle.dump(df_preprocessed, open('../dataset/processed/processed_training_data.pkl', 'wb'))

df_preprocessed.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,"[0, 13732, 24245, 33281, 32261, 16641, 32261, ...","[0, 13732, 24245, 33281, 32261, 16641, 32261, ...",0
1,1,"[0, 13732, 24245, 33281, 32345, 27057, 7357, 2...","[0, 13732, 35366, 22643, 23380, 33281, 6593, 2...",0
2,2,"[0, 6226, 16744, 6303, 23621, 33281, 31961, 27...","[0, 6226, 16744, 6691, 31961, 15774, 23622, 16...",0
3,3,"[0, 13767, 14758, 6303, 25805, 34670, 2, 6226,...","[0, 5093, 33281, 29916, 35126, 2, 24245, 19780...",0
4,4,"[0, 13747, 27137, 19720, 23551, 34985, 2, 2, 2...","[0, 13747, 21442, 35366, 32790, 23551, 30656, ...",0


In [69]:
df_preprocessed = preprocess_df(df_test, enc_map)
# df_preprocessed.to_csv('./dataset/processed/processed_testing_data.csv', index=False)
pickle.dump(df_preprocessed, open('../dataset/processed/processed_testing_data.pkl', 'wb'))

df_preprocessed.head()

Unnamed: 0,test_id,question1,question2
0,0,"[0, 6226, 19835, 33281, 12365, 10330, 22937, 6...","[0, 13767, 19438, 8587, 17288, 18329, 25335, 1..."
1,1,"[0, 11817, 6303, 22725, 14111, 22564, 33775, 1...","[0, 6226, 26367, 18389, 19835, 22564, 33775, 2..."
2,2,"[0, 13732, 16617, 24245, 33281, 15908, 35015, ...","[0, 13732, 35474, 31037, 26217, 33512, 2, 1]"
3,3,"[0, 13747, 21633, 26847, 2, 1]","[0, 13732, 21635, 2, 1]"
4,4,"[0, 6226, 2, 32194, 2, 1]","[0, 6226, 33291, 16744, 6303, 32194, 2, 1]"


## Generate embedding matrix for GloVe

Warning:

Don't run this if not necessary. This takes about 10 minutes.

In [70]:
def generate_embedding_matrix(w2v_path, dec_map, lang_dim):    
    
    out_vocab = []
    embeddings_index = {}
    
    print('Loading from GloVe file on hard disk ... ')
    f = open(w2v_path, 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except Exception as e:
            if e==KeyboardInterrupt:
                raise KeyBoardInterrupt
            print('exception word: ', values[0], ' ', values[1], ' ', values[2])
            continue
        embeddings_index[word] = coefs
    f.close()
    
    # prepare embedding matrix
    print('Load complete, start embedding ... ')
    embedding_matrix = np.random.rand(len(dec_map), lang_dim)
    for idx, wd in dec_map.items():
        if wd in embeddings_index.keys():
            embedding_matrix[idx] = embeddings_index[wd]
        else:
            out_vocab.append(wd)
    print('\nNot in pre-trained vocabulary list:\n', ','.join(out_vocab))
    return embedding_matrix

In [71]:
dec_map = pickle.load(open('../dataset/processed/dec_map.pkl', 'rb'))

EMBEDDING_SIZE = 300
embedding_matrix = generate_embedding_matrix('../dataset/raw/glove.840B.300d.txt', dec_map, EMBEDDING_SIZE)

pickle.dump(embedding_matrix, open('../dataset/processed/embedding_matrix.pkl', 'wb'))

Loading from GloVe file on hard disk ... 
exception word:  .   .   .
exception word:  at   name@domain.com   0.0061218
exception word:  .   .   .
exception word:  to   name@domain.com   0.33865
exception word:  .   .   0.035974
exception word:  .   .   .
exception word:  email   name@domain.com   0.33529
exception word:  or   name@domain.com   0.48374
exception word:  contact   name@domain.com   0.016426
exception word:  Email   name@domain.com   0.37344
exception word:  on   name@domain.com   0.037295
exception word:  At   Killerseats.com   -0.13854
exception word:  by   name@domain.com   0.6882
exception word:  in   mylot.com   -0.18148
exception word:  emailing   name@domain.com   0.39173
exception word:  Contact   name@domain.com   0.14933
exception word:  at   name@domain.com   0.44321
exception word:  •   name@domain.com   -0.13288
exception word:  at   Amazon.com   -0.5275
exception word:  is   name@domain.com   -0.1197
Load complete, start embedding ... 

Not in pre-trained voc

<br>
↑↑↑↑↑ WOW ↑↑↑↑↑

(TensorFlow is in this list LOL)

<br>

In [42]:
a = np.arange(9)
np.square(a)

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64], dtype=int32)

## Let's do a short experiment see if our embedding matrix works or not

Does vector : " London -> Paris " contains semantic relationship and can approximate relationship of " England -> France " ?

In [57]:
from math import sqrt

print(enc_map['London'], enc_map['Paris'], enc_map['England'])

London = embedding_matrix[enc_map['London']]
Paris = embedding_matrix[enc_map['Paris']]
England = embedding_matrix[enc_map['England']]
target = Paris - London + England

distances = np.zeros(embedding_matrix.shape[0])
for i,vec in enumerate(embedding_matrix):
    distances[i] = sqrt(np.sum(np.square(vec-target)))

bests = np.argsort(distances)[:5]
for best in bests:
    print(dec_map[best], '  dist = ', distances[best])

7265 9282 4135
France   dist =  5.64297987282
Paris   dist =  6.27527457426
England   dist =  7.1538742503
French   dist =  7.24948883275
Lyon   dist =  7.3931232693


It works !

## How to use this embedding matrix afterward (Example in Keras)

Note:

Since some weird words is not embeddable, so we random assign a random value vector to related word. Although this workaround might have some problems such as : the random assigned vector is probably being very close to other non-related word vector, which makes our model mis-understanding the meaning of such word. But at least our model has ability to identify if two words are same or not.

翻譯蒟蒻 (translation) :

因為我們選定的不少字不存在於 embedding file 裡面，而我們的處理方法是直接隨機給予一個 embedding vector ，這雖然會讓我們的 model 誤認為這些字與某些其他字有相關性，但至少這樣的做法讓我們的 model 有能力辨認兩個字是不是一樣的。

In [None]:
# a short example how to embed encoded text into word vector

def model():
    
    '''This model example is for Keras'''

    # hyper-parameters that should be passed as function argument
    
    vocab_size = len(enc_map)
    vocab_dim = EMBEDDING_SIZE # in my case is 300
    text_length = 1 # how many words are we going to feed in one time

    
    '----------------------------------------------------------------------------'

    
    # embed our encoded question to embedded vector

    encoded_question_input = Input(shape=(text_length,))

    x = Embedding(output_dim = vocab_dim, 
                  input_dim = vocab_size, 
                  init = 'glorot_uniform',       # to be honest, I don't know what is this
                  input_length = text_length, 
                  weights = [embedding_matrix]   # our embedding_matrix
                 )(encoded_question_input)

    text_embedded = Reshape((vocab_dim*text_length,))(x)

    # maybe other input source

    other_input = . . . 
    other_input_dim = . . .

    # feed to RNN model

    x = merge([text_embedded, other_input], mode='concat', concat_axis=-1)
    x = Reshape((1, vocab_dim*text_length + other_input_dim ))(x)
    x = GRU(128)(x)
    out = Dense(vocab_size, activation='softmax')(x)
    
    # compile the model
    
    model = Model(input=[encoded_question_input, other_input], output=out)
    # choose objective and optimizer
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=1e-3, clipnorm=clipnorm))
    
    return model