In [185]:
import sys
import os
import hashlib
import struct
import subprocess
import collections
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from tensorflow.core.example import example_pb2

In [9]:
# from nltk.tokenize import word_tokenize
from nltk.parse import CoreNLPParser
# Tokenizer client used by tokenize_questions; it talks to a CoreNLP server at the URL below
parser = CoreNLPParser(url='http://localhost:8889')

### 1. Preliminary data filtering/analysis

In [190]:
def process_data(fpath):
    """Load the question-pair CSV at fpath and split it into train/validation/test sets.

    @fpath: path to a CSV file with "question1", "question2" and "is_duplicate" columns

    Returns X_train, y_train, X_val, y_val, X_test, y_test, where every X is a list of
    (question1, question2) string tuples and every y is the matching list of labels.
    """
    # Move to dataframe for text usage
    df = pd.read_csv(fpath)
    print("Total number of pairs:", df.shape[0])

    # Convert all questions to string and X, y format for splitting.
    # Everything is read from the frame loaded above (not from a notebook-level variable),
    # so the function works on a fresh kernel and honours fpath.
    # str() also covers cells that pandas did not parse as strings.
    q1 = [str(el) for el in df["question1"]]
    q2 = [str(el) for el in df["question2"]]
    X = list(zip(q1, q2))
    y = list(df["is_duplicate"])

    # Sample splitting 60% train, 20% validation and 20% test:
    # hold out 20% as test, then take 25% of the remaining 80% (= 20% overall) as validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [191]:
# Load the question-pair CSV and build the train/validation/test splits used by the rest of the notebook
X_train, y_train, X_val, y_val, X_test, y_test = process_data("/Users/dfirebanks/Projects/DRLParaphrase/quora/train.csv")

Total number of pairs: 404290


In [2]:
# Read the raw question-pair CSV into a DataFrame for the exploratory cells below
train_df = pd.read_csv("/Users/dfirebanks/Projects/DRLParaphrase/quora/train.csv")

In [155]:
# (rows, columns) of the raw data
train_df.shape

(404290, 6)

In [66]:
# Preview the first 20 rows to get a feel for the question pairs and labels
train_df.head(20)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [156]:
# Cast every question to str, pair them up as (question1, question2) samples and keep the labels in y
train_q1 = list(map(str, train_df["question1"]))
train_q2 = list(map(str, train_df["question2"]))
X = list(zip(train_q1, train_q2))
y = list(train_df["is_duplicate"])

In [93]:
# Number of single-space-separated words in each question, per column
q1_word_count = [len(words) for words in (str(q).split(" ") for q in train_df["question1"])]
q2_word_count = [len(words) for words in (str(q).split(" ") for q in train_df["question2"])]

In [157]:
# Sample splitting 60% train, 20% validation and 20% test
# First hold out 20% as the test set, then take 25% of the remaining 80% (= 20% overall) as validation.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [141]:
# For initial processing purposes, count how many questions in each column have more than 20 words
length_pairs = list(zip(q1_word_count, q2_word_count))
count1 = sum(1 for first_len, _ in length_pairs if first_len > 20)
count2 = sum(1 for _, second_len in length_pairs if second_len > 20)

print("Percentage of questions with more than 20 words for each column:")
100*count1/len(q1_word_count), 100*count2/len(q2_word_count)

Percentage of questions with more than 20 words for each column:


(6.365727571792525, 7.723169012342625)

In [154]:
# Samples of nonsensical questions: print every question1 that is exactly
# 1, 2, 4 or 6 characters long, together with its row position
suspicious_lengths = (1, 2, 4, 6)
for i, q in enumerate(train_q1):
    if len(q) in suspicious_lengths:
        print(q, i)

. 3306
? 13016
? 20794
HH 23884
Na 44619
I'm  54029
? 96725
ok ? 102512
? 104101
i 108978
What 109311
o 115347
? 134403
o 151922
A 158778
Nana 164553
I'm  175584
spam 180461
? 189659
‘ 190570
I 199110
? 208485
Can? 208798
? 213220
Aaas 216861
? 254161
hi 257077
Why? 260779
delete 263134
Ok 270146
? 273065
Why? 324777
My 325530
My 328601
Who is 329933
Q? 351788
H 357127
no 381124
? 402423


### 2. Tokenization

In [218]:
# Maximum number of entries write_to_bin puts in the vocab file (most frequent first)
VOCAB_SIZE = 5000

# Output directories: one per split for the per-pair token files, plus one for the .bin and vocab files
train_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/train_tokens"
val_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/val_tokens"
test_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/test_tokens"
finished_files_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files/"

CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data
# Directory that receives the chunked .bin files
chunks_dir = os.path.join(finished_files_dir, "chunked")

In [203]:
def tokenize_questions(questions):
    """Tokenize every question pair with the module-level parser.

    @questions: list of (question1, question2) string pairs

    Returns a list with one [question1_tokens, question2_tokens] entry per input pair.
    Every 10000th pair (starting with the first) is echoed as a progress indicator.
    """
    tokenized_qs = []
    for idx, pair in enumerate(questions):
        first, second = pair[0], pair[1]
        # Tokenizing both question1 and question2
        tokenized_qs.append([list(parser.tokenize(first)), list(parser.tokenize(second))])
        if idx % 10000 == 0:
            print(f"Tokenized {idx} questions!")
            print(f"Q1: {first} \nQ2: {second}")
    return tokenized_qs

In [204]:
def store_tokens(questions, outdir):
    """ Stores each pair of tokenized questions in its own file, the two questions separated by a new line

        @questions: iterable of pairs [question1_tokens, question2_tokens], each a list of token strings
        @outdir: directory that receives the files; it is created if it does not exist yet

        Pair number n (0-based) goes to "qpair<n>.tokens" as a "Q1: ..." line and a "Q2: ..." line,
        with the tokens joined by single spaces, lower-cased and stripped."""
    # The notebook re-points the output directories between cells, so make sure the target exists
    os.makedirs(outdir, exist_ok=True)

    for fnum, qs in enumerate(questions):
        e1 = ' '.join(qs[0]).lower().strip()
        e2 = ' '.join(qs[1]).lower().strip()
        # Explicit UTF-8: the questions contain non-ASCII text, which the platform default encoding may not cover
        with open(os.path.join(outdir, "qpair" + str(fnum) + ".tokens"), "w", encoding="utf-8") as f:
            f.write(f"Q1: {e1} \nQ2: {e2}\n")

In [205]:
# Create tokenized questions for each split (two tokenizer calls per question pair)
train_tokens = tokenize_questions(X_train)
val_tokens = tokenize_questions(X_val)
test_tokens = tokenize_questions(X_test)

Tokenized 0 questions!
Q1: How can I become a problem solver? 
Q2: How do I become a better thinker, innovator and a problem solver?
Tokenized 10000 questions!
Q1: Can I crack the JEE Mains without coaching in two months? 
Q2: Can I crack the JEE Mains in two months without any coaching?
Tokenized 20000 questions!
Q1: What is your opinion on PM Narendra Modi's decision to ban INR 500 and INR 1000 notes? 
Q2: What is your reaction about the ban on Rs. 500 and Rs. 1000 notes? Won't it create a chaos and harm the economy?
Tokenized 30000 questions!
Q1: How does the new change of elimination of master’s degree exemption impact the H-1B holders and H-1B applicants? 
Q2: Can H-1B visa holders do higher studies in the US?
Tokenized 40000 questions!
Q1: Why isn't Hillary Clinton in jail? 
Q2: Why could Hillary Clinton go to jail?
Tokenized 50000 questions!
Q1: What are the differences between a Fibonacci heap and a binomial heap? 
Q2: What is the difference between a Fibonacci heap and Binomia

In [206]:
# Store tokenized questions
# NOTE: these assignments replace the *_tokens directories from the configuration cell with *_tokens1 ones
train_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/train_tokens1"
val_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/val_tokens1"
test_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/test_tokens1"

# One .tokens file per question pair, written into the directory of its split
store_tokens(train_tokens, train_tokenized_dir)
store_tokens(val_tokens, val_tokenized_dir)
store_tokens(test_tokens, test_tokenized_dir)

### 3. Storing files for model (.bin, tf.Example())

In [212]:
def write_to_bin(tokenized_qs, is_duplicate, outfile, makevocab=False):
    """ Creates a bin file given pairs of tokenized questions, an outfile name and if applicable, creates a vocabulary file

        Each pair becomes one tf.Example with the bytes features 'question1', 'question2' and 'target',
        written as an 8-byte length (struct format 'q') followed by the serialized example.

        @tokenized_qs: list of tokenized question pairs [(question1_tokens, question2_tokens), ...],
                       where each element of a pair is a list of token strings
        @is_duplicate: label list, target variable (indexed in parallel with tokenized_qs)
        @outfile: path as string
        @makevocab: if True, count the words of every question and write the VOCAB_SIZE most common ones,
                    one "word count" line each, to the file "vocab" inside finished_files_dir"""

    if makevocab:
        vocab_counter = collections.Counter()

    with open(outfile, 'wb') as writer:

        for i, pair in enumerate(tokenized_qs):
            tok_q1 = pair[0]
            tok_q2 = pair[1]
            target = str(is_duplicate[i])
            # TODO Important note: max length of a question is 20 words. Open question: should symbols other than
            # question marks be cleaned up here, or in the actual program? The original program did no particular
            # data cleaning at this stage, so the assumption is that it happens afterwards.

            # Questions as strings: lowercase and strip them
            q1 = ' '.join(tok_q1).lower().strip()
            q2 = ' '.join(tok_q2).lower().strip()

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['question1'].bytes_list.value.extend([q1.encode()])
            tf_example.features.feature['question2'].bytes_list.value.extend([q2.encode()])
            tf_example.features.feature['target'].bytes_list.value.extend([target.encode()])

            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len)) # write length of string
            writer.write(struct.pack('%ds' % str_len, tf_example_str)) # write string of length noted earlier

            # Make the vocab to write, if applicable
            if makevocab:
                # Split the strings back into words before counting: concatenating q1 and q2 directly
                # would make the Counter iterate over single characters instead of words
                tokens = q1.split(' ') + q2.split(' ')
                tokens = [t.strip() for t in tokens] # strip
                tokens = [t for t in tokens if t != ""] # remove empty
                vocab_counter.update(tokens)

    print("Finished writing file %s\n" % outfile)

    # Write vocab to file
    if makevocab:
        print("Writing vocab file...")
        # UTF-8 to match the encoding used for the questions stored in the examples
        with open(os.path.join(finished_files_dir, "vocab"), 'w', encoding="utf-8") as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")
        

In [213]:
# NOTE: this replaces the finished_files_dir from the configuration cell (finished_files -> finished_files1);
# write_to_bin reads this variable when it writes the vocab file
finished_files_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/"
# Store all .bin files
# The vocabulary is built only from the training split (makevocab=True on the first call)
write_to_bin(train_tokens, y_train, os.path.join(finished_files_dir, "train.bin"), makevocab=True)
write_to_bin(val_tokens, y_val, os.path.join(finished_files_dir, "val.bin"))
write_to_bin(test_tokens, y_test, os.path.join(finished_files_dir, "test.bin"))

Finished writing file /Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/train.bin

Writing vocab file...
Finished writing vocab file
Finished writing file /Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/val.bin

Finished writing file /Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/test.bin



### 4. Chunk files in batches

In [227]:
# Chunking functions, kept to replicate the paper's data pipeline; it is unclear whether they are strictly necessary
def chunk_file(set_name, files_dir):
    """Split one .bin file into numbered chunk files holding at most CHUNK_SIZE examples each.

    @set_name: name of the split; substituted into files_dir and used as the chunk-file prefix
    @files_dir: path template with one %s placeholder for set_name, e.g. ".../%s.bin"

    Records are copied unchanged (8-byte length followed by the payload) into chunks_dir as
    "<set_name>_000.bin", "<set_name>_001.bin", ... A chunk file is only created when there is
    at least one record left to put in it, so no empty trailing chunk is produced.
    """
    in_file = files_dir % set_name
    chunk = 0
    # Context manager so the input handle is closed even if a read or unpack fails
    with open(in_file, "rb") as reader:
        # Read one length header ahead: an empty read means the input is exhausted
        len_bytes = reader.read(8)
        while len_bytes:
            chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
            with open(chunk_fname, 'wb') as writer:
                for _ in range(CHUNK_SIZE):
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    writer.write(struct.pack('q', str_len))
                    writer.write(struct.pack('%ds' % str_len, example_str))
                    # Fetch the next header; stop filling this chunk when the input ends
                    len_bytes = reader.read(8)
                    if not len_bytes:
                        break
            chunk += 1


def chunk_all(files_dir):
    """Chunk the train, val and test .bin files into chunks_dir.

    @files_dir: path template with one %s placeholder for the split name, passed on to chunk_file
    """
    # The chunk directory is created on first use
    chunks_dir_missing = not os.path.isdir(chunks_dir)
    if chunks_dir_missing:
        os.mkdir(chunks_dir)

    # One pass per split, always in the order train, val, test
    for split in ('train', 'val', 'test'):
        print("Splitting %s data into chunks..." % split)
        chunk_file(split, files_dir)

    print("Saved chunked data in %s" % chunks_dir)
    

In [229]:
# Point the chunking step at the directory holding the .bin files and chunk every split
finished_files_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/"
chunks_dir = os.path.join(finished_files_dir, "chunked")
bin_path_template = os.path.join(finished_files_dir, "%s.bin")
chunk_all(bin_path_template)

Splitting train data into chunks...
Splitting val data into chunks...
Splitting test data into chunks...
Saved chunked data in /Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/chunked


### 5. Combine the whole process

In [None]:
# Maximum number of entries write_to_bin puts in the vocab file (most frequent first)
VOCAB_SIZE = 5000

# Output directories used by main(): per-split token files, plus the .bin and vocab files
train_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/train_tokens"
val_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/val_tokens"
test_tokenized_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/test_tokens"
finished_files_dir = "/Users/dfirebanks/Projects/DRLParaphrase/quora/finished_files1/"

CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data
# Directory that receives the chunked .bin files
chunks_dir = os.path.join(finished_files_dir, "chunked")

def main():
    """ 1. Load the question-pair CSV and split it into train, validation and test sets
        2. Tokenize all of them and store the tokens, one file per question pair
        3. Create bin files (and the vocab file, from the training split)
        4. Chunk data

        Intended to be run as a script without arguments; with any extra command-line
        argument it prints the usage line and exits.
    """

    # Alternative use: input file path from command line
    if len(sys.argv) != 1:
        print("USAGE: python make_datafiles.py")
        sys.exit()

    fpath = "/Users/dfirebanks/Projects/DRLParaphrase/quora/train.csv"

    # Move to dataframe for text usage
    X_train, y_train, X_val, y_val, X_test, y_test = process_data(fpath)

    # Create the directories for the tokenized questions and the finished files if they are missing
    for directory in (train_tokenized_dir, val_tokenized_dir, test_tokenized_dir, finished_files_dir):
        os.makedirs(directory, exist_ok=True)

    # Run stanford tokenizer on all sets, then write the tokens to the tokenized questions directories
    train_tokens = tokenize_questions(X_train)
    val_tokens = tokenize_questions(X_val)
    test_tokens = tokenize_questions(X_test)

    store_tokens(train_tokens, train_tokenized_dir)
    store_tokens(val_tokens, val_tokenized_dir)
    store_tokens(test_tokens, test_tokenized_dir)

    # Take the tokenized questions, do a little postprocessing then write to bin files
    write_to_bin(train_tokens, y_train, os.path.join(finished_files_dir, "train.bin"), makevocab=True)
    write_to_bin(val_tokens, y_val, os.path.join(finished_files_dir, "val.bin"))
    write_to_bin(test_tokens, y_test, os.path.join(finished_files_dir, "test.bin"))

    # Chunk the data. This splits each of train.bin, val.bin and test.bin into smaller chunks, each containing
    # e.g. 1000 examples, and saves them in chunks_dir. The template is derived from finished_files_dir so the
    # chunker reads exactly the files written above instead of a separately hard-coded directory.
    chunk_all(os.path.join(finished_files_dir, "%s.bin"))