In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
# Blanket filter: no category or module is given, so every warning is ignored
# from this point on, including any raised by library calls in later cells.
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
# Bucket name passed to ps.get_file_stream / ps.get_file when fetching inputs.
INPUT_BUCKET: str = 'dq-data'
# Not referenced anywhere in the visible part of this notebook.
HASH_BUCKET: str = 'dq-hashed'

In [5]:
# Load the training set through the project persistence helper.
data: str = 'train.csv'
filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
# Expected column -> dtype schema of train.csv.
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
# Materialise the column names once as a list: dict key views are set-like,
# and read_csv expects an ordered list-like for `names`.
columns: List[str] = list(dtypes)
# header=0 discards the file's own header row in favour of `names`; the schema
# above is now actually enforced through `dtype` instead of being inferred.
df: pd.DataFrame = pd.read_csv(filestream,
                               header=0,
                               usecols=columns,
                               names=columns,
                               dtype=dtypes,
                               skipinitialspace=True,
                               skip_blank_lines=True,
                               encoding='utf-8').set_index('id')

In [6]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


#### Train-test split

In [8]:
from sklearn.model_selection import train_test_split
# Separate the label from the remaining columns, then hold out 33% of the rows
# with a fixed seed.
y = df['is_duplicate']
X = df.drop('is_duplicate', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [9]:
from gensim.parsing.preprocessing import preprocess_string
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tokens(process='train'):
    """Yield the preprocessed token list of every question in one split.

    Reads the module-level ``X_test`` frame when ``process == 'test'`` and
    ``X_train`` otherwise. All ``question1`` values are emitted first, followed
    by all ``question2`` values, so exactly ``2 * len(X)`` items are produced
    and callers can split the result back into two halves by position.

    Missing questions are tokenised as the empty string. They are deliberately
    NOT dropped: removing rows would break the positional alignment between
    the two halves.

    Args:
        process: ``'test'`` selects ``X_test``; any other value selects
            ``X_train``.

    Yields:
        The result of ``preprocess_string`` for each question.
    """
    frame = X_test if process == 'test' else X_train
    questions = pd.concat([frame['question1'], frame['question2']])
    # The previous ``series.dropna()`` call discarded its result, so missing
    # values reached the tokenizer (as NaN or the literal text "nan").
    # Blank them explicitly before the str cast instead.
    questions = questions.fillna('').astype(str)
    for question in questions:
        yield preprocess_string(question)
                
def pass_through(x):
    """Return the argument unchanged (used as a no-op analyzer)."""
    return x


# The documents fed to the vectorizer are the token lists yielded by
# get_tokens, so the analyzer hands them through as-is.
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [10]:
X_trfmd

<541748x50968 sparse matrix of type '<class 'numpy.float64'>'
	with 2524075 stored elements in Compressed Sparse Row format>

In [11]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
# Fit a 100-component truncated SVD on the TF-IDF matrix and time the fit.
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end = time.time()
print(f'created SVD transform in time {end - start}')

created SVD transform in time 18.192118167877197


In [12]:
X_svd.shape

(541748, 100)

In [13]:
# get_tokens emits every question1 before any question2, so the first
# len(X_train) rows belong to question1 and the remainder to question2.
n_train_pairs = len(X_train)
X1, X2 = X_svd[:n_train_pairs], X_svd[n_train_pairs:]

#### Word vectors (fastText)

In [14]:
import gzip
import shutil

# Ask the persistence helper for the gzipped fastText binary (presumably it is
# written to `filepath` -- confirm against utils.persistence), then stream-
# decompress it next to the archive.
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as compressed, \
        open('/tmp/cc.en.300.bin', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

In [15]:
import os
# Delete the compressed archive; only the extracted .bin is read below.
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
# NOTE(review): newer gensim releases replace load_fasttext_format with
# gensim.models.fasttext.load_facebook_model / load_facebook_vectors --
# confirm against the installed gensim version.
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [16]:
def get_ft_vectors(model, process):
    """Yield, per question, the list of word vectors of its tokens.

    Args:
        model: object exposing ``model.wv[token]`` lookups.
        process: forwarded unchanged to ``get_tokens`` to pick the split.

    Yields:
        One list per token list produced by ``get_tokens(process)``, holding
        ``model.wv[token]`` for every token that could be looked up. Tokens
        whose lookup raises ``KeyError`` are skipped, so a list may be shorter
        than its token list or empty.
    """
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vectors.append(model.wv[token])
            except KeyError:
                # No vector for this token. Only the lookup miss is tolerated;
                # any other error (or a keyboard interrupt) must surface
                # rather than silently shrinking the feature list.
                continue
        yield vectors

In [17]:
# Each question contributes a list of word vectors, and the lists differ in
# length. Letting np.array infer a shape from such ragged input raises on
# recent NumPy (and would silently build a 3-D array if all lengths matched),
# so fill a 1-D object array element by element instead.
ft_rows = list(get_ft_vectors(model, 'train'))
X_ft = np.empty(len(ft_rows), dtype=object)
for i, row in enumerate(ft_rows):
    X_ft[i] = row
del ft_rows
X_ft.shape

(541748,)

In [18]:
# First len(X_train) entries come from question1, the rest from question2.
n_train_pairs = len(X_train)
X1_ft, X2_ft = X_ft[:n_train_pairs], X_ft[n_train_pairs:]

In [19]:
X1_ft

array([list([array([ 5.15881181e-02,  5.81315532e-02, -9.70149972e-03,  9.21105593e-02,
       -5.69709465e-02, -8.62881020e-02,  6.37376457e-02,  4.20518853e-02,
        7.30852771e-04,  2.42972858e-02, -1.60080463e-01, -7.36736953e-02,
       -1.16030768e-01, -1.05162179e-02,  1.95952542e-02,  4.26273420e-02,
       -1.33283213e-01, -8.12865272e-02, -7.15422619e-04,  1.36178138e-03,
        2.70773284e-02,  2.75074206e-02, -6.31720584e-04, -1.58908224e-04,
        3.71515891e-03, -4.30366769e-02,  3.56693496e-03, -6.41848817e-02,
       -7.48521462e-02,  1.30827233e-01,  2.20603012e-02,  3.28433439e-02,
        9.25663859e-02, -7.83696212e-03,  6.80066198e-02,  3.21056657e-02,
       -2.79456321e-02, -2.52460055e-02,  5.32374568e-02, -1.92222878e-01,
        9.44242626e-02, -3.21247093e-02, -8.01256672e-02,  6.38195947e-02,
       -9.27609056e-02, -1.81211380e-03,  3.74464020e-02,  4.08083498e-02,
       -1.69307347e-02,  3.57952784e-03,  6.25777692e-02,  4.73536141e-02,
        1.01

#### Fuzzy string-matching features (fuzzywuzzy)

In [20]:
def compute_size_diff(row):
    """Return the absolute difference between the str lengths of both questions."""
    q1_len = len(str(row['question1']))
    q2_len = len(str(row['question2']))
    return abs(q1_len - q2_len)


# Character-length difference of each question pair.
X_train['size_diff'] = X_train.apply(compute_size_diff, axis='columns')
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17


In [21]:
from fuzzywuzzy import fuzz

In [22]:
def compute_ratio(row):
    """Return fuzz.ratio of the row's two questions, both cast to str."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.ratio(q1, q2)


# fuzz.ratio score of each question pair.
X_train['ratio'] = X_train.apply(compute_ratio, axis='columns')
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45


In [23]:
def compute_partial_ratio(row):
    """Return fuzz.partial_ratio of the row's two questions, both cast to str."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(q1, q2)


# fuzz.partial_ratio score of each question pair.
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis='columns')
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50


In [24]:
def compute_token_sort_ratio(row):
    """Return fuzz.token_sort_ratio of the row's two questions, both cast to str."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(q1, q2)


# fuzz.token_sort_ratio score of each question pair.
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis='columns')
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49,55
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63,82
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43,41
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70,55
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50,50


In [25]:
def compute_token_set_ratio(row):
    """Return fuzz.token_set_ratio of the row's two questions, both cast to str."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(q1, q2)


# fuzz.token_set_ratio score of each question pair.
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis='columns')
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49,55,55
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63,82,93
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43,41,41
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70,55,87
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50,50,50


In [26]:
# Wrap both SVD halves in frames indexed like X_train, label their columns
# q1_<i> / q2_<i>, and place them side by side.
X_train_temp = pd.concat(
    [pd.DataFrame(half, index=X_train.index).add_prefix(prefix)
     for prefix, half in (('q1_', X1), ('q2_', X2))],
    axis=1,
)
X_train_temp.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_90,q2_91,q2_92,q2_93,q2_94,q2_95,q2_96,q2_97,q2_98,q2_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316451,0.002739,0.001087,0.001754,-0.001864,0.000338,0.00039,-0.000555,0.000764,0.000265,0.0005,...,0.022237,-0.005225,0.004594,-0.023669,0.013882,-0.006077,-0.016482,-0.027223,0.00818,0.007138
398368,0.003055,0.001118,0.005409,-0.003024,8e-06,-0.001402,0.003404,0.000618,-0.001324,0.001924,...,0.012156,-0.016979,0.022933,-0.014093,-0.009837,0.034511,-0.008367,0.040154,0.029722,-0.02947
218253,0.002613,0.00085,0.003548,-0.001644,-0.000909,-0.000873,-0.001579,-0.00012,0.000446,0.000603,...,-0.03981,-0.014068,-0.019585,-0.007861,0.018909,-0.023025,0.006404,0.028432,0.043247,-0.002326
282919,0.000191,0.000285,0.000784,-0.000264,-0.000177,0.00028,1.7e-05,0.000499,-0.000187,0.000395,...,-0.000492,0.000196,-0.000359,-0.000419,0.000988,0.000123,-0.000226,0.000259,0.000506,0.000412
243365,0.00224,0.001951,0.009381,-0.005399,-0.000702,-0.001451,-0.008642,0.002907,-0.007932,-0.000873,...,0.027286,0.009972,0.020975,0.019271,-0.002636,-0.012734,-0.002169,0.000569,0.021168,-0.016095


In [27]:
# Put the SVD columns in front of the hand-built features and discard the
# identifier and raw-text columns. X_train is rebound here, so get_tokens('train')
# can no longer read question1/question2 from it after this cell.
X_train = (
    pd.concat([X_train_temp, X_train], axis=1)
    .drop(columns=['qid1', 'qid2', 'question1', 'question2'])
)
del X_train_temp
X_train.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_95,q2_96,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316451,0.002739,0.001087,0.001754,-0.001864,0.000338,0.00039,-0.000555,0.000764,0.000265,0.0005,...,-0.006077,-0.016482,-0.027223,0.00818,0.007138,4,49,49,55,55
398368,0.003055,0.001118,0.005409,-0.003024,8e-06,-0.001402,0.003404,0.000618,-0.001324,0.001924,...,0.034511,-0.008367,0.040154,0.029722,-0.02947,10,64,63,82,93
218253,0.002613,0.00085,0.003548,-0.001644,-0.000909,-0.000873,-0.001579,-0.00012,0.000446,0.000603,...,-0.023025,0.006404,0.028432,0.043247,-0.002326,26,35,43,41,41
282919,0.000191,0.000285,0.000784,-0.000264,-0.000177,0.00028,1.7e-05,0.000499,-0.000187,0.000395,...,0.000123,-0.000226,0.000259,0.000506,0.000412,20,52,70,55,87
243365,0.00224,0.001951,0.009381,-0.005399,-0.000702,-0.001451,-0.008642,0.002907,-0.007932,-0.000873,...,-0.012734,-0.002169,0.000569,0.021168,-0.016095,17,45,50,50,50


In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270874 entries, 316451 to 121958
Columns: 205 entries, q1_0 to token_set_ratio
dtypes: float64(200), int64(5)
memory usage: 425.7 MB


In [29]:
# Append the per-question word-vector lists as two object-dtype columns; each
# cell holds a Python list rather than a number.
X_train = pd.concat(
    [X_train] + [pd.Series(values, name=name, index=X_train.index)
                 for name, values in (('q1_ft', X1_ft), ('q2_ft', X2_ft))],
    axis=1,
)
X_train.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio,q1_ft,q2_ft
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316451,0.002739,0.001087,0.001754,-0.001864,0.000338,0.00039,-0.000555,0.000764,0.000265,0.0005,...,-0.027223,0.00818,0.007138,4,49,49,55,55,"[[0.051588118, 0.058131553, -0.0097015, 0.0921...","[[0.051588118, 0.058131553, -0.0097015, 0.0921..."
398368,0.003055,0.001118,0.005409,-0.003024,8e-06,-0.001402,0.003404,0.000618,-0.001324,0.001924,...,0.040154,0.029722,-0.02947,10,64,63,82,93,"[[0.070252635, -0.034428675, -0.04147598, 0.07...","[[-0.060567684, -0.013681983, -0.087162875, -0..."
218253,0.002613,0.00085,0.003548,-0.001644,-0.000909,-0.000873,-0.001579,-0.00012,0.000446,0.000603,...,0.028432,0.043247,-0.002326,26,35,43,41,41,"[[0.103581265, 0.0048466064, 0.05893848, 0.061...","[[-0.03352166, 0.08625177, 0.03402934, 0.03248..."
282919,0.000191,0.000285,0.000784,-0.000264,-0.000177,0.00028,1.7e-05,0.000499,-0.000187,0.000395,...,0.000259,0.000506,0.000412,20,52,70,55,87,"[[-0.20145771, -0.030789683, 0.066938974, 0.00...","[[-0.20145771, -0.030789683, 0.066938974, 0.00..."
243365,0.00224,0.001951,0.009381,-0.005399,-0.000702,-0.001451,-0.008642,0.002907,-0.007932,-0.000873,...,0.000569,0.021168,-0.016095,17,45,50,50,50,"[[0.00074437854, -0.06444594, -0.060729586, 0....","[[0.027902221, -0.040149886, -0.04252064, 0.05..."


#### Test set vectorization

In [30]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133416 entries, 8067 to 231389
Data columns (total 4 columns):
qid1         133416 non-null int64
qid2         133416 non-null int64
question1    133416 non-null object
question2    133415 non-null object
dtypes: int64(2), object(2)
memory usage: 5.1+ MB


In [31]:
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?


In [32]:
# Reuse the vectorizer already fitted on the training tokens: transform only,
# no refit on the held-out questions.
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [33]:
X_test_trfmd

<266832x50968 sparse matrix of type '<class 'numpy.float64'>'
	with 1228788 stored elements in Compressed Sparse Row format>

In [34]:
# Project the test TF-IDF matrix with the SVD fitted on the training data and
# time the projection.
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end = time.time()
print(f'created SVD transform in time {end - start}')

created SVD transform in time 0.18194079399108887


In [35]:
# First len(X_test) rows come from question1, the rest from question2.
n_test_pairs = len(X_test)
X1_test, X2_test = X_test_svd[:n_test_pairs], X_test_svd[n_test_pairs:]

In [36]:
# The per-question vector lists differ in length. Letting np.array infer a
# shape from such ragged input raises on recent NumPy (and would silently build
# a 3-D array if all lengths matched), so fill a 1-D object array element by
# element instead.
ft_rows_test = list(get_ft_vectors(model, 'test'))
X_ft_test = np.empty(len(ft_rows_test), dtype=object)
for i, row in enumerate(ft_rows_test):
    X_ft_test[i] = row
del ft_rows_test
X_ft_test.shape

(266832,)

In [37]:
# First len(X_test) entries come from question1, the rest from question2.
n_test_pairs = len(X_test)
X1_ft_test, X2_ft_test = X_ft_test[:n_test_pairs], X_ft_test[n_test_pairs:]

In [38]:
# Character-length difference of each test question pair, computed with the
# same callable as for the training split.
X_test['size_diff'] = X_test.apply(compute_size_diff, axis='columns')
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9


In [39]:
# fuzz.ratio score of each test question pair (same callable as for training).
X_test['ratio'] = X_test.apply(compute_ratio, axis='columns')
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63


In [40]:
# fuzz.partial_ratio score of each test question pair (same callable as for training).
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis='columns')
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59


In [41]:
# fuzz.token_sort_ratio score of each test question pair (same callable as for training).
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis='columns')
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88,81
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44,39
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68,89
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59,71


In [42]:
# fuzz.token_set_ratio score of each test question pair (same callable as for training).
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis='columns')
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88,81,90
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44,39,41
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68,89,96
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59,71,81


In [43]:
# Wrap both test SVD halves in frames indexed like X_test, label their columns
# q1_<i> / q2_<i>, and place them side by side.
X_test_temp = pd.concat(
    [pd.DataFrame(half, index=X_test.index).add_prefix(prefix)
     for prefix, half in (('q1_', X1_test), ('q2_', X2_test))],
    axis=1,
)
X_test_temp.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_90,q2_91,q2_92,q2_93,q2_94,q2_95,q2_96,q2_97,q2_98,q2_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.007231,0.00175,0.009705,-0.002815,0.004487,-0.000852,-0.001055,0.003104,1.5e-05,-0.003105,...,0.014516,0.009744,-0.029916,0.005495,0.017049,-0.003302,0.006653,-0.021045,0.000711,-0.015775
368101,0.173304,-0.040715,-0.060616,-0.026144,-0.047868,-0.004951,-0.002196,-0.002985,0.012573,-0.02599,...,0.003222,-0.000875,0.00605,-0.005266,-0.009295,-0.005788,0.006772,-0.009317,0.001217,0.005651
70497,0.007007,0.003853,0.012364,-0.004243,0.000265,0.009923,0.006139,-0.00389,-0.003884,-0.000306,...,0.005654,0.006192,0.003048,-0.000371,-0.001583,-0.002453,-0.001015,-0.000408,-0.00401,-4.7e-05
226567,0.060334,0.039251,0.004125,-0.01032,0.187215,0.039278,0.021367,0.010827,-0.106947,0.060422,...,0.021386,-0.061438,0.07408,0.04551,-0.074708,0.010919,0.008193,0.042662,-0.014512,0.046118
73186,0.024708,0.007696,0.057461,-0.027079,-0.025996,0.035996,-0.014842,0.037326,-0.009745,0.001573,...,0.01948,0.000393,-0.016767,0.011891,0.019128,0.002572,0.00538,-0.004818,0.005845,-0.000139


In [44]:
# Put the SVD columns in front of the hand-built features and discard the
# raw-text and identifier columns. X_test is rebound here, so get_tokens('test')
# can no longer read question1/question2 from it after this cell.
X_test = (
    pd.concat([X_test_temp, X_test], axis=1)
    .drop(columns=['question1', 'question2', 'qid1', 'qid2'])
)
del X_test_temp
X_test.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_95,q2_96,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.007231,0.00175,0.009705,-0.002815,0.004487,-0.000852,-0.001055,0.003104,1.5e-05,-0.003105,...,-0.003302,0.006653,-0.021045,0.000711,-0.015775,0,88,88,81,90
368101,0.173304,-0.040715,-0.060616,-0.026144,-0.047868,-0.004951,-0.002196,-0.002985,0.012573,-0.02599,...,-0.005788,0.006772,-0.009317,0.001217,0.005651,2,73,73,73,73
70497,0.007007,0.003853,0.012364,-0.004243,0.000265,0.009923,0.006139,-0.00389,-0.003884,-0.000306,...,-0.002453,-0.001015,-0.000408,-0.00401,-4.7e-05,51,40,44,39,41
226567,0.060334,0.039251,0.004125,-0.01032,0.187215,0.039278,0.021367,0.010827,-0.106947,0.060422,...,0.010919,0.008193,0.042662,-0.014512,0.046118,8,73,68,89,96
73186,0.024708,0.007696,0.057461,-0.027079,-0.025996,0.035996,-0.014842,0.037326,-0.009745,0.001573,...,0.002572,0.00538,-0.004818,0.005845,-0.000139,9,63,59,71,81


In [45]:
# Append the per-question word-vector lists as two object-dtype columns; each
# cell holds a Python list rather than a number.
X_test = pd.concat(
    [X_test] + [pd.Series(values, name=name, index=X_test.index)
                for name, values in (('q1_ft', X1_ft_test), ('q2_ft', X2_ft_test))],
    axis=1,
)
X_test.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio,q1_ft,q2_ft
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.007231,0.00175,0.009705,-0.002815,0.004487,-0.000852,-0.001055,0.003104,1.5e-05,-0.003105,...,-0.021045,0.000711,-0.015775,0,88,88,81,90,"[[0.0038416118, -0.14802723, -0.19296621, 0.11...","[[0.0038416118, -0.14802723, -0.19296621, 0.11..."
368101,0.173304,-0.040715,-0.060616,-0.026144,-0.047868,-0.004951,-0.002196,-0.002985,0.012573,-0.02599,...,-0.009317,0.001217,0.005651,2,73,73,73,73,"[[-0.11572341, 0.019766208, -0.17047717, 0.206...","[[-0.09213716, -0.0634383, 0.0017381323, 0.135..."
70497,0.007007,0.003853,0.012364,-0.004243,0.000265,0.009923,0.006139,-0.00389,-0.003884,-0.000306,...,-0.000408,-0.00401,-4.7e-05,51,40,44,39,41,"[[0.001946263, -0.06057124, -0.10881804, 0.037...","[[-0.11572341, 0.019766208, -0.17047717, 0.206..."
226567,0.060334,0.039251,0.004125,-0.01032,0.187215,0.039278,0.021367,0.010827,-0.106947,0.060422,...,0.042662,-0.014512,0.046118,8,73,68,89,96,"[[-0.043390375, -0.004501399, -0.044122953, 0....","[[-0.043390375, -0.004501399, -0.044122953, 0...."
73186,0.024708,0.007696,0.057461,-0.027079,-0.025996,0.035996,-0.014842,0.037326,-0.009745,0.001573,...,-0.004818,0.005845,-0.000139,9,63,59,71,81,"[[-0.014249172, -0.030450467, 0.005704186, 0.1...","[[-0.014249172, -0.030450467, 0.005704186, 0.1..."


#### Manifold embedding (MDS / LLE)

In [None]:
from sklearn.manifold import MDS, LocallyLinearEmbedding
def pairwise_embed(pc1, pc2, method):
    """Stack two point sets and embed them jointly in three dimensions.

    Args:
        pc1: array-like, converted with ``np.array``; forms the top rows.
        pc2: array-like, converted with ``np.array``; stacked below ``pc1``
            with ``np.vstack``, so both must be stackable row-wise.
        method: ``'LLE'``, ``'MLLE'`` or ``'Hessian'`` select
            ``LocallyLinearEmbedding`` with its default, ``'modified'`` or
            ``'hessian'`` method respectively; any other value (including
            ``'MDS'``) falls back to ``MDS``.

    Returns:
        Whatever ``fit_transform`` of the chosen estimator returns for the
        stacked array.
    """
    if method == 'LLE':
        embedding = LocallyLinearEmbedding(n_components=3, random_state=42)
    elif method == 'MLLE':
        embedding = LocallyLinearEmbedding(n_components=3, method='modified', random_state=42)
    elif method == 'Hessian':
        embedding = LocallyLinearEmbedding(n_components=3, method='hessian', random_state=42)
    else:  # method == 'MDS' (also the fallback for any unrecognised name)
        embedding = MDS(n_components=3, random_state=42)

    stacked = np.vstack((np.array(pc1), np.array(pc2)))
    return embedding.fit_transform(stacked)

### Modeling

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
# Candidate values sampled by the randomized search: C over 1e-2..1e7 and
# tol over 1e-5..1e-1, both on a log grid.
param_grid = {'C': np.logspace(-2, 7, 10),
              'tol': np.logspace(-5, -1, 5)}
# random_state pins which candidates are sampled, so reruns are comparable.
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5,
                             n_jobs=-1, random_state=42)
# X_train also carries the q1_ft / q2_ft object columns (lists of vectors),
# which cannot be converted to a float matrix; fit on the numeric features only.
logr_cv.fit(X_train.select_dtypes(include='number'), y_train)

In [None]:
logr_cv.best_params_

In [None]:
# Rebuild the classifier with the best C / tol found by the search.
logr_model = LogisticRegression(random_state=42,
                                C=logr_cv.best_params_['C'],
                                tol=logr_cv.best_params_['tol'],
                                n_jobs=-1)
# X_train also carries the q1_ft / q2_ft object columns (lists of vectors),
# which cannot be converted to a float matrix; fit on the numeric features only.
logr_model.fit(X_train.select_dtypes(include='number'), y_train)

In [None]:
# X_test also carries the q1_ft / q2_ft object columns (lists of vectors),
# which cannot be converted to a float matrix; predict from the numeric
# feature columns only.
logr_pred = logr_model.predict(X_test.select_dtypes(include='number'))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# Score the held-out predictions, then report the three metrics.
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print(f'accuracy score : {logr_acc_score}')
print(f'precision score : {logr_prec_score}')
print(f'recall score : {logr_rec_score}')