# Dependencies

In [145]:
import functools
import math
import time

import nltk
import numpy as np
import pandas as pd

#from multiprocessing import Pool
from pathos.multiprocessing import ProcessingPool as Pool
#from pathos.pools import ProcessPool as Pool
#from pathos.pools import ThreadPool as Pool
#from pathos.pools import ParallelPool as Pool

# Loading Data

In [74]:
def load_the_data(path="train.csv"):
    """Read the question-pair CSV and force both question columns to ``str``.

    Parameters
    ----------
    path : str
        Location of the CSV file. Defaults to ``"train.csv"`` in the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with every value in ``question1`` and ``question2``
        converted via ``str`` (so a missing question becomes the text
        ``'nan'``), which lets the feature functions call string methods on
        them unconditionally.
    """
    # Honour the caller-supplied path; it used to be ignored in favour of a
    # hard-coded "train.csv".
    frame = pd.read_csv(path)

    for column in ('question1', 'question2'):
        frame[column] = [str(value) for value in frame[column]]

    return frame

# Load the question pairs into the module-level frame that the cells below
# read from and add feature columns to.
data = load_the_data()

# Data Exploration

In [71]:
# NOTE(review): the saved output of this cell shows `<map object at ...>` in
# both question columns and its execution counter is lower than the loader
# cell's, so it was presumably produced by an earlier version of
# load_the_data -- re-run this cell to refresh the preview.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,<map object at 0x1a87d07ef0>,<map object at 0x1a8d456400>,0
1,1,3,4,<map object at 0x1a87d07ef0>,<map object at 0x1a8d456400>,0
2,2,5,6,<map object at 0x1a87d07ef0>,<map object at 0x1a8d456400>,0
3,3,7,8,<map object at 0x1a87d07ef0>,<map object at 0x1a8d456400>,0
4,4,9,10,<map object at 0x1a87d07ef0>,<map object at 0x1a8d456400>,0


# Feature Engineering

In [4]:
def get_common_unigrams(row):
    """Count the distinct characters that occur in both questions.

    The questions are handled as plain strings, so a "unigram" here is a
    single character, not a word.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'`` (for
        example a DataFrame row); both values are passed through ``str``.

    Returns
    -------
    int
        Number of distinct characters shared by the two questions.
    """
    question1 = str(row['question1'])
    question2 = str(row['question2'])

    # A string's set of 1-grams has exactly one entry per distinct character,
    # so builtin sets give the same count without materialising a list of
    # 1-tuples for every character first.
    return len(set(question1) & set(question2))

def get_common_unigram_ratio(row):
    """Return the share of distinct characters that both questions have in common.

    This is ``|shared characters| / |all characters in either question|``,
    with the denominator clamped to at least 1 so two empty questions give
    ``0.0`` instead of dividing by zero. "Unigram" means a single character
    here, because the questions are treated as plain strings.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'``; both
        values are passed through ``str``.

    Returns
    -------
    float
        A value between 0.0 and 1.0.
    """
    question1 = str(row['question1'])
    question2 = str(row['question2'])

    q1_unigrams = set(question1)
    q2_unigrams = set(question2)

    # Count the overlap here rather than reading a previously generated
    # 'unigrams_common_count' column: that made the result depend on the
    # order in which features were generated and raised KeyError when the
    # column was absent.
    common_count = len(q1_unigrams & q2_unigrams)

    return common_count / max(len(q1_unigrams | q2_unigrams), 1)

def get_common_bigrams(row):
    """Count the distinct adjacent character pairs that occur in both questions.

    The questions are handled as plain strings, so a "bigram" here is a pair
    of consecutive characters, not a pair of words.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'``; both
        values are passed through ``str``.

    Returns
    -------
    int
        Number of distinct character pairs shared by the two questions
        (0 when either question is shorter than two characters).
    """
    question1 = str(row['question1'])
    question2 = str(row['question2'])

    # zip(text, text[1:]) walks every pair of neighbouring characters and
    # yields nothing for strings shorter than two characters.
    q1_bigrams = set(zip(question1, question1[1:]))
    q2_bigrams = set(zip(question2, question2[1:]))

    return len(q1_bigrams & q2_bigrams)

def get_common_bigram_ratio(row):
    """Return the share of distinct adjacent character pairs common to both questions.

    This is ``|shared pairs| / |all pairs in either question|``, with the
    denominator clamped to at least 1 so questions too short to contain a
    pair give ``0.0`` instead of dividing by zero. "Bigram" means two
    consecutive characters here, because the questions are treated as plain
    strings.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'``; both
        values are passed through ``str``.

    Returns
    -------
    float
        A value between 0.0 and 1.0.
    """
    question1 = str(row['question1'])
    question2 = str(row['question2'])

    q1_bigrams = set(zip(question1, question1[1:]))
    q2_bigrams = set(zip(question2, question2[1:]))

    # Count the overlap here rather than reading a previously generated
    # 'bigrams_common_count' column: that made the result depend on the
    # order in which features were generated and raised KeyError when the
    # column was absent.
    common_count = len(q1_bigrams & q2_bigrams)

    return common_count / max(len(q1_bigrams | q2_bigrams), 1)

In [128]:
# Registry of engineered features: each key is the name of the column to
# create, each value a callable that computes that column's value from one
# row. Insertion order is the order in which the generation loops add the
# columns; the *_count entries are listed ahead of their *_ratio
# counterparts, so keep that order when adding entries.
feature_factory = {
    # Whitespace-separated token counts.
    'q1_word_count': lambda row: len(row['question1'].split()),
    'q2_word_count': lambda row: len(row['question2'].split()),
    # Raw character lengths.
    'q1_length': lambda row: len(row['question1']),
    'q2_length': lambda row: len(row['question2']),
    # 1 when the question text contains a '?', otherwise 0.
    'q1_has_question_mark': lambda row: int('?' in row['question1']),
    'q2_has_question_mark': lambda row: int('?' in row['question2']),
    # Overlap features implemented by the named functions defined earlier.
    'unigrams_common_count': get_common_unigrams,
    'unigrams_common_ratio': get_common_unigram_ratio,
    'bigrams_common_count': get_common_bigrams,
    'bigrams_common_ratio': get_common_bigram_ratio,
}

# Utility Functions

In [129]:
# Source: https://stackoverflow.com/questions/5478351/python-time-measure-function

def timing(f):
    """Decorator that prints how long each call to ``f`` takes.

    Parameters
    ----------
    f : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards positional and keyword arguments to ``f``,
        prints ``'<name> function took <seconds>s'`` after a successful call
        and returns whatever ``f`` returned. Nothing is printed if ``f``
        raises.
    """
    # functools.wraps keeps f's __name__ and docstring on the wrapper so the
    # decorated function still introspects as itself.
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring elapsed time; time.time can jump if the system clock is
        # adjusted mid-run.
        time_start = time.perf_counter()
        ret = f(*args, **kwargs)
        time_end = time.perf_counter()

        print('{} function took {:.4f}s'.format(
            f.__name__, (time_end - time_start))
        )
        return ret

    return wrap

# Traditional Approach

In [130]:
@timing
def traditional_feature_generation():
    """Add every feature in ``feature_factory`` to the global ``data`` frame.

    Runs in the current process, one feature after another, applying each
    transformation row by row. The frame is modified in place (one new or
    overwritten column per feature); nothing is returned.
    """
    for feature, transformation in feature_factory.items():
        # Leave raw at its default (False): the transformations look values
        # up by column label (row['question1']), which needs each row to be
        # a Series. With raw=True pandas is documented to pass bare ndarrays,
        # on which label lookups fail.
        data[feature] = data.apply(transformation, axis=1)

In [131]:
# Reload the CSV first so the timed run starts from a frame that has no
# generated feature columns yet.
data = load_the_data()
traditional_feature_generation()

traditional_feature_generation function took 239.4596s


In [132]:
# Show the first two rows to spot-check the columns added by the
# single-process run.
data.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,q1_length,q2_length,q1_has_question_mark,q2_has_question_mark,unigrams_common_count,unigrams_common_ratio,bigrams_common_count,bigrams_common_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,66,57,1,1,21,1.0,39,0.886364
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,51,88,1,1,20,0.625,36,0.455696


# Multiprocess Approach

In [146]:
def chunk(data, num):
    """Split ``data`` into at most ``num`` consecutive, non-empty slices.

    Parameters
    ----------
    data : sliceable sequence
        Anything supporting ``len`` and ``data[start:stop]`` (for example a
        list, or a DataFrame, where such a slice selects rows by position).
    num : int
        Maximum number of chunks; must be at least 1.

    Returns
    -------
    list
        Slices of ``data`` in original order, each of size
        ``ceil(len(data) / num)`` except possibly the last, which may be
        shorter. Fewer than ``num`` chunks are returned when that size does
        not divide the data into ``num`` non-empty pieces (for example 9
        items with ``num=4`` gives three chunks of 3). Empty ``data`` yields
        a single empty slice.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be a positive integer, got {!r}".format(num))

    total = len(data)
    if total == 0:
        # Keep the "list of slices" shape for callers without producing a
        # zero step for range() below.
        return [data[0:0]]

    chunk_size = math.ceil(total / num)
    # Stepping by chunk_size up to total never starts a slice past the end,
    # so no empty trailing chunks are produced.
    return [data[start:start + chunk_size] for start in range(0, total, chunk_size)]

def pool_apply(data, apply_func, proc_num=8):
    """Apply ``apply_func`` to every row of ``data`` using a pool of worker processes.

    The frame is cut into row chunks with ``chunk``, each chunk is handed to
    a worker that runs ``DataFrame.apply(apply_func, axis=1)`` on it, and the
    per-chunk results are joined back together in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are transformed.
    apply_func : callable
        Receives one row and returns the value for that row.
    proc_num : int
        Number of worker processes, also used as the number of chunks
        requested from ``chunk``.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``, in row order; an empty array when
        ``data`` has no rows.
    """

    def transform_data(frame_chunk):
        # raw is left at its default (False) because the row functions look
        # values up by column label, which needs each row to be a Series;
        # with raw=True pandas is documented to pass bare ndarrays instead.
        return frame_chunk.apply(apply_func, axis=1)

    # Drop empty slices: there is nothing to compute for them, and they would
    # have to be stacked together with the non-empty per-chunk results below.
    chunks = [piece for piece in chunk(data, proc_num) if len(piece) > 0]
    if not chunks:
        return np.array([])

    with Pool(processes=proc_num) as pool:
        processed_chunks = list(pool.map(transform_data, chunks))

    return np.hstack(tuple(processed_chunks))

In [147]:
@timing
def multiprocess_feature_generation(proc_num=4):
    """Add every feature in ``feature_factory`` to the global ``data`` frame via ``pool_apply``.

    Features are generated one after another; ``proc_num`` is forwarded to
    ``pool_apply`` for each of them. The frame is modified in place and
    nothing is returned.
    """
    for column_name in feature_factory:
        row_function = feature_factory[column_name]
        data[column_name] = pool_apply(data, row_function, proc_num)

In [148]:
# Reload the CSV so this run starts from a frame without the feature columns
# added by the previous run, then generate the features with 4 as proc_num.
data = load_the_data()
multiprocess_feature_generation(4)

multiprocess_feature_generation function took 165.3188s


In [149]:
# Show the first two rows so the values can be compared with the preview
# from the single-process run.
data.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,q1_length,q2_length,q1_has_question_mark,q2_has_question_mark,unigrams_common_count,unigrams_common_ratio,bigrams_common_count,bigrams_common_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,66,57,1,1,21,1.0,39,0.886364
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,51,88,1,1,20,0.625,36,0.455696
