## Data augmentation

Since many question pairs has same qid involve, 

In [38]:
import numpy as np
import pandas as pd
import re
import pickle
import json

In [39]:
df_train = pd.read_csv('../dataset/raw/train.csv', delimiter=',')
df_test = pd.read_csv('../dataset/raw/test.csv', delimiter=',')
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


↓ Although I think this might be an issue, but things seems like working very well. So, I won't do this for now.

In [67]:
# since we are very probable to use a single question for several times, we should remove validation samples directly at this point
# Fro example, Q_a == Q_b and Q_b == Q_c ,
# In my method, we'll generate a new data Q_a == Q_c
# If we move this Q_a == Q_c sample to validation set,
#     it is very weird that our training set already has this kind of information (can be recognized from the Q_a == Q_b == Q_c relation).
import random

validation_size = 10000

def split_train_val(df):
    rnd_idxes = np.array([random.randint(0,len(df)-1) for i in range(validation_size)])
    
    trainin_idxes = np.repeat(True, len(df))
    trainin_idxes[rnd_idxes] = False
    trainin_idxes = np.where(trainin_idxes==True)
    
    return df.ix[trainin_idxes], df.ix[rnd_idxes]
    

In [68]:
df_train = pd.read_csv('../dataset/raw/train.csv', delimiter=',')
df_train, df_val = split_train_val(df_train)

In [90]:
pickle.dump(df_val, open('../dataset/processed/validation_df.pkl','wb'))

## Get all duplicated questions

We'll construct two data structures:
1. A dictionary records { qid : question_text } pair. 

         The question_text is only splited as a list of words, not yet transformed into encoded form. This gives us chances to do training more flexible.
         
2. A list of duplicated qid pairs 

        Recording only qid saves us data loading time (since we'll try to augment and enumerate a huge amount of duplicated question pairs).

Note:

I didn't record any single non-duplicated question relation. 
I decided to random assign any question pairs to be non-duplicated question pairs. This might causes some issue that similar to too-much-upsampling case, but this extremely increases the variety of non-duplicated question samples.

In [73]:
def get_max_qid(df):
    max_qid = 0
    for idx,frame in df.iterrows():
        qid1 = int(frame['qid1'])
        qid2 = int(frame['qid2'])
        if qid1>max_qid:
            max_qid = qid1
        elif qid2>max_qid:
            max_qid = qid2
    print('Max qid = ', max_qid)
    return max_qid

max_qid = get_max_qid(df_train)

Max qid =  537932


Forming groups of duplicated question pairs.

EX: 
```
if A==B and B==C:
    group A,B,C as a group, then we can enumerate all combinations, including A==C as a new sample
```

In [74]:
def group_questions(df):
    group_id = 0
    group_list = np.repeat(-1, max_qid)
    
    for idx,frame in df.iterrows():
        qid1 = int(frame['qid1'])
        qid2 = int(frame['qid2'])
        
        if int(frame['is_duplicate'])==1:
            # if both has no group, add new group
            if group_list[qid1]==-1 and group_list[qid2]==-1:
                group_list[qid1] = group_id
                group_list[qid2] = group_id
                group_id += 1

            # if both has group, join the group 
            elif group_list[qid1]!=-1 and group_list[qid2]!=-1 :
                idxes_to_be_joined = np.where(group_list==group_list[qid2])[0]
                group_list[idxes_to_be_joined] = group_list[qid1]

            # only q1 has group , than add q2 to q1's group
            elif  group_list[qid1]!=-1:
                group_list[qid2] = group_list[qid1]

            # only q2 has group , than add q1 to q2's group
            elif  group_list[qid2]!=-1:
                group_list[qid1] = group_list[qid2]
                
    return group_list
    
group_ids = group_questions(df_train)

In [75]:
sum(group_ids!=-1) # means these questions has group

147086

In [76]:
# Get all the group and store it as a dictionary
group_dict = {}
for i in range(np.max(group_ids)+1):
    group_members = np.where(group_ids==i)[0]
    if len(group_members)>0:
        group_dict[i] = group_members

In [77]:
import itertools

def enumerate_all_positive_cases(group_dict):
    
    def enumerate_inside_group(group):
        return list(itertools.combinations(group, 2))
    
    return np.vstack(enumerate_inside_group(group_dict[group_id]) for group_id in group_dict)

def duplicate_all(df):
    
    def get_qid_set():
        ids = set()
        for i,series in df.iterrows():
            if series['qid1'] not in ids:
                ids.add(series['qid1'])
            if series['qid2'] not in ids:
                ids.add(series['qid2'])
        return ids
    
    id_set = get_qid_set()
    return [[i,i] for i in id_set]

In [78]:
# Enumerate all cases of duplicated question pairs from each group
enumerate_pairs = enumerate_all_positive_cases(group_dict)

# The question pairs with itself is also a sample of duplicated question pair
duplicate_pairs = duplicate_all(df_train) 

all_pos_pairs = np.vstack([enumerate_pairs,duplicate_pairs])

In [79]:
print(len(all_pos_pairs)) # The total duplicated samples count 766481 about 5~6 times than original

754062


In [80]:
pickle.dump(all_pos_pairs, open('../dataset/processed/positive_question_id_pairs.pkl', 'wb'))

## Parse original training DataFrame to words list and store it

Note: 

Not encoded yet, we need to map rare words to same `<RARE_X>` special token in each question pair. This should be done in training phase.

In [81]:
enc_map = pickle.load(open('../dataset/processed/enc_map.pkl','rb'))

In [82]:
import re

def parse_wrod_list(question):
    
    if type(question)!=str:
        return []
    
    # identify special characters that separate words : (space) ' ! " ? @ ^ + * / . , ~ ( ) [ ] { } & | ` $ % = : ; < >  
    separator = '(?=[\s\'!"?@\^+*/\.,~\(\)\[\]\{\}\&\|`\$\%\=:;\<\>\-]|$)'
    single_word = '[^\s\-]+' # non-empty is enough here

    return re.findall(single_word+separator, question)
    

In [83]:
def gen_qid_question_dict(df):
    res = {}
    for i,frame in df.iterrows():
        
        qid1 = int(frame['qid1'])
        if qid1 not in res:
            res[qid1] = parse_wrod_list(frame['question1'])
        
        qid2 = int(frame['qid2'])
        if qid2 not in res:
            res[qid2] = parse_wrod_list(frame['question2'])
            
    return res

In [84]:
# def gen_all_pos_df(qid_dict, all_pos_pairs):
#     all_series = []
#     column_names = ['qid1','qid2','question1','question2', 'is_duplicate']
#     for i,pair in enumerate(all_pos_pairs):
#         series = pd.Series([ pair[0], pair[1], qid_dict[pair[0]], qid_dict[pair[1]], 1 ], name=i)
#         all_series.append(series)
#     ret = pd.DataFrame(all_series)
#     ret.columns = column_names
#     return ret

# all_pos_df = gen_all_pos_df(qid_dict, all_pos_pairs)
# all_pos_df.head(10)
# pickle.dump(all_pos_df, open('../dataset/processed/enumerate_all_positive_training_data.pkl','wb'))

In [85]:
training_question_dict = gen_qid_question_dict(df_train)

In [86]:
pickle.dump(training_question_dict, open('../dataset/processed/qid_question_wrods_list_dict.pkl', 'wb'))