# Dependencies

In [125]:
import time
import pandas as pd

#from multiprocessing import Pool
#from pathos.multiprocessing import ProcessingPool as Pool
#from pathos.pools import ProcessPool as Pool
from pathos.pools import ThreadPool as Pool
#from pathos.pools import ParallelPool as Pool

# Loading Data

In [2]:
# Load the training CSV into the module-level DataFrame `data` that the
# later cells read from and add columns to.
data = pd.read_csv("train.csv")

# Data Exploration

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Feature Engineering

In [None]:
def _word_ngrams(text, n):
    """Return the set of word-level n-grams of *text*.

    Tokens are the whitespace-separated pieces produced by ``str.split()``,
    the same tokenisation the word-count features in this notebook use.
    Each n-gram is a tuple of *n* consecutive tokens; a text with fewer than
    *n* tokens yields an empty set.
    """
    tokens = text.split()
    # Zipping n staggered views of the token list yields every run of n
    # consecutive tokens; zip stops at the shortest view, so no padding.
    return set(zip(*(tokens[offset:] for offset in range(n))))


def get_common_unigrams(row):
    """Count the distinct words that appear in both questions of *row*.

    Args:
        row: Any object indexable by ``'question1'`` and ``'question2'``.
            Values are coerced with ``str()`` before tokenising.

    Returns:
        int: Size of the intersection of the two questions' word sets.
    """
    # Tokenise first: feeding the raw string to an n-gram routine would
    # iterate over characters and count shared letters instead of words.
    q1_unigrams = _word_ngrams(str(row['question1']), 1)
    q2_unigrams = _word_ngrams(str(row['question2']), 1)
    return len(q1_unigrams & q2_unigrams)


def get_common_unigram_ratio(row):
    """Divide the shared-word count by the number of distinct words overall.

    Args:
        row: Any object indexable by ``'question1'``, ``'question2'`` and
            ``'unigrams_common_count'`` (the precomputed shared-word count).

    Returns:
        float: ``unigrams_common_count`` divided by the size of the union of
        both questions' word sets; the divisor is clamped to at least 1 so
        two empty questions do not divide by zero.
    """
    q1_unigrams = _word_ngrams(str(row['question1']), 1)
    q2_unigrams = _word_ngrams(str(row['question2']), 1)
    unigram_count = float(row["unigrams_common_count"])

    return unigram_count / max(len(q1_unigrams | q2_unigrams), 1)


def get_common_bigrams(row):
    """Count the distinct word pairs that appear in both questions of *row*.

    Args:
        row: Any object indexable by ``'question1'`` and ``'question2'``.
            Values are coerced with ``str()`` before tokenising.

    Returns:
        int: Size of the intersection of the two questions' sets of
        consecutive word pairs.
    """
    q1_bigrams = _word_ngrams(str(row['question1']), 2)
    q2_bigrams = _word_ngrams(str(row['question2']), 2)
    return len(q1_bigrams & q2_bigrams)


def get_common_bigram_ratio(row):
    """Divide the shared-bigram count by the number of distinct bigrams overall.

    Args:
        row: Any object indexable by ``'question1'``, ``'question2'`` and
            ``'bigrams_common_count'`` (the precomputed shared-bigram count).

    Returns:
        float: ``bigrams_common_count`` divided by the size of the union of
        both questions' bigram sets; the divisor is clamped to at least 1 so
        questions with fewer than two words do not divide by zero.
    """
    q1_bigrams = _word_ngrams(str(row['question1']), 2)
    q2_bigrams = _word_ngrams(str(row['question2']), 2)
    bigram_count = float(row["bigrams_common_count"])

    return bigram_count / max(len(q1_bigrams | q2_bigrams), 1)

In [4]:
def _count_words(value):
    """Number of whitespace-separated tokens in the text form of *value*."""
    return len(str(value).split())


def _count_chars(value):
    """Number of characters in the text form of *value*."""
    return len(str(value))


def _has_question_mark(value):
    """1 when the text form of *value* contains '?', otherwise 0."""
    return int('?' in str(value))


# Feature registry: output column name -> (list of input columns,
# transformation applied to each input value).
feature_factory = {
    'q1_word_count': (['question1'], _count_words),
    'q2_word_count': (['question2'], _count_words),
    'q1_length': (['question1'], _count_chars),
    'q2_length': (['question2'], _count_chars),
    'q1_has_question_mark': (['question1'], _has_question_mark),
    'q2_has_question_mark': (['question2'], _has_question_mark),
}

# Utility Functions

In [61]:
# Source: https://stackoverflow.com/questions/5478351/python-time-measure-function

def timing(f):
    """Decorate *f* so that every call prints how long it took.

    Args:
        f: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to *f*,
        prints ``'<name> function took <seconds>s'`` after *f* returns, and
        passes *f*'s return value through.  If *f* raises, the exception
        propagates and nothing is printed.
    """
    # Imported here so this block needs no change to the import cell.
    import functools

    @functools.wraps(f)  # keep f's __name__/__doc__ on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        time_start = time.perf_counter()
        ret = f(*args, **kwargs)
        time_end = time.perf_counter()

        print('{} function took {:.4f}s'.format(
            f.__name__, (time_end - time_start))
        )
        return ret

    return wrap

# Traditional Approach

In [99]:
@timing
def traditional_feature_generation():
    """Compute every registered feature sequentially.

    For each entry of the module-level ``feature_factory``, the values of
    its input column(s) in the module-level ``data`` frame are flattened
    into one array, the entry's transformation is applied to each value in
    turn, and the results are stored on ``data`` under the feature's name.
    """
    for column_name, (source_columns, transform) in feature_factory.items():
        raw_values = data[source_columns].values.flatten()
        data[column_name] = [transform(value) for value in raw_values]

In [116]:
# Re-read the CSV so `data` is a fresh frame before the timed run adds its
# feature columns.
data = pd.read_csv("train.csv")

# Sequential baseline; the @timing decorator prints the elapsed time.
traditional_feature_generation()

traditional_feature_generation function took 2.2049s


In [117]:
# Preview the frame after the sequential run to inspect the added columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,q1_length,q2_length,q1_has_question_mark,q2_has_question_mark
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,66,57,1,1
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,51,88,1,1
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,14,10,73,59,1,1
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,11,9,50,65,1,1
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,13,7,76,39,1,1


# Multiprocess Approach

In [134]:
@timing
def multiprocess_feature_generation(proc_num=4):
    """Compute every registered feature by mapping over a worker pool.

    Args:
        proc_num: Value passed to the ``Pool`` constructor.

    For each entry of the module-level ``feature_factory``, the values of
    its input column(s) in the module-level ``data`` frame are flattened
    into one array, the entry's transformation is mapped over them with
    ``pool.map``, and the results are stored on ``data`` under the
    feature's name.

    NOTE(review): the pool is not closed or joined here; the right cleanup
    depends on which pool class ``Pool`` is bound to in the import cell.
    """
    workers = Pool(proc_num)

    for column_name, (source_columns, transform) in feature_factory.items():
        raw_values = data[source_columns].values.flatten()
        mapped = workers.map(transform, raw_values)
        data[column_name] = list(mapped)

In [135]:
# Re-read the CSV so `data` is a fresh frame before the timed run adds its
# feature columns.
data = pd.read_csv("train.csv")

# Pool-based run with 4 workers; the @timing decorator prints the elapsed time.
multiprocess_feature_generation(4)

multiprocess_feature_generation function took 3.3703s


In [136]:
# Preview the frame after the pool-based run to inspect the added columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,q1_length,q2_length,q1_has_question_mark,q2_has_question_mark
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,66,57,1,1
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,51,88,1,1
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,14,10,73,59,1,1
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,11,9,50,65,1,1
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,13,7,76,39,1,1
