In [14]:
import os
import time 
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import pickle

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

Populating the interactive namespace from numpy and matplotlib


In [15]:
# Directory that the train/test CSV files are read from.
DATA_PATH = 'data/'
# Column-name postfixes of the question variants; columns are addressed as e.g. 'q1'+postfix, 'q2'+postfix.
postfixes = ['_src', '_stem', '_nostops']  
# Number of chunks the data is split into and number of worker processes used by the apply_parallel* helpers.
NUM_CORES = 6

In [16]:
import multiprocessing as mp
def apply_parallel(df, my_func):
    """
    Apply my_func to df in parallel and concatenate the partial results.

    Input: 
        df: pandas DataFrame or pandas Series
        my_func: custom function which will be applied to df. Must accept pandas DataFrame or Series as input
            and return a pandas object (the partial results are passed to pd.concat).
    Output: concatenated results of function application on DataFrame. Either pandas Series or pandas DataFrame.
    
    df is split into NUM_CORES parts and the function is applied to each part independently in a
    multiprocessing pool. Results are concatenated in the original part order and returned.
    """
    df_splitted = np.array_split(df, NUM_CORES)
    pool = mp.Pool(NUM_CORES)
    try:
        result = pd.concat(pool.map(my_func, df_splitted))
    finally:
        # Always release the worker processes, even when my_func raises in a worker;
        # otherwise every failed call leaves NUM_CORES processes behind in the kernel.
        pool.close()
        pool.join()
    return result

In [21]:
# Load the preprocessed train set (semicolon-separated) and replace missing values with the
# placeholder string 'xxx', so that every text column can be handled as a string downstream.
data = pd.read_csv(os.path.join(DATA_PATH, 'train_preprocessed.csv'), sep=';')
data.fillna('xxx', inplace=True)

# Same for the preprocessed test set.
kagg = pd.read_csv(os.path.join(DATA_PATH, 'test_preprocessed.csv'), sep=';')
kagg.fillna('xxx', inplace=True)

# TFIDF cosine distances

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy import sparse

In [None]:
def apply_parallel_sparse(DF, my_func):
    """
    Apply my_func to row-wise chunks of a sparse matrix in parallel.

    Input: 
        DF: scipy sparse matrix (must support row slicing, e.g. csr)
        my_func: custom function which will be applied to each chunk of DF. Must accept a scipy sparse
            matrix and return a list.
    Output: a single flat list with the results of all chunks, in row order.
    
    DF is split into NUM_CORES consecutive row blocks and the function is applied to each block
    independently. The last block also receives the remainder rows.
    """
    
    # Split indices: NUM_CORES blocks of (total_rows // NUM_CORES) rows; the final bound is total_rows
    total_rows = DF.shape[0]
    rows_per_part = int(total_rows/NUM_CORES)
    split_indices = [rows_per_part*i for i in range(NUM_CORES)]
    split_indices.append(total_rows)
    
    # List of NUM_CORES row blocks (kept in a separate name so the input argument is not rebound)
    parts = [DF[split_indices[i]:split_indices[i+1]] for i in range(NUM_CORES)]
    
    # Apply my_func to each block and chain the partial lists into the final result
    res = []
    pool = mp.Pool(NUM_CORES)
    try:
        for part_res in pool.map(my_func, parts):
            res += part_res
    finally:
        # Release the worker processes even when my_func raises in a worker.
        pool.close()
        pool.join()
    return res

def efficient_hstack_csr(matrices, batch_size=1e5):
    """
    Horizontally stack matrices in row batches and return the result in csr format.

    Input: 
        matrices: list of matrices to be hstacked. All must have the same number of rows. 
            Acceptable formats: csr matrix, coo matrix, pandas DataFrame, numpy array.
        batch_size: number of rows to hstack per batch (cast to int, so 1e5 is accepted). 
            The lower the batch, the lower the memory footprint. Recommended batch size 10-200K.
    Output: same content as sparse.hstack(matrices, format='csr').
    
    Each batch of rows is hstacked on its own and the resulting batches are then vstacked,
    which is the cheap direction for csr matrices.
    """
    
    step = int(batch_size)
    n_rows = matrices[0].shape[0]
    
    # One hstacked csr block per consecutive window of `step` rows (the last window may be shorter)
    row_blocks = [
        sparse.hstack([m[start:min(start + step, n_rows)] for m in matrices], format='csr')
        for start in range(0, n_rows, step)
    ]
    
    return sparse.vstack(row_blocks, format='csr')


def cosine_sim(DF):
    """
    Row-wise cosine similarity between the left and the right half of DF.

    Input: 
        DF: scipy sparse matrix. Must be constructed beforehand by hstacking two sparse matrices with the 
        same number of columns. Each of them is the tfidf sparse representation of one of the questions.
    Output: list with one cosine similarity per row (0.0 when either half of the row is all zeros)
    
    Usage example:
    tf_q1 = tfidf.transform(train['q1_src'])
    tf_q2 = tfidf.transform(train['q2_src'])
    train['cosine_similarities'] = cosine_sim(efficient_hstack_csr([tf_q1, tf_q2], 1e5))
    """
    
    split_halves_on = int(DF.shape[1]/2) # Get number of columns in each part
    left = DF[:, :split_halves_on].tocsr()
    right = DF[:, split_halves_on:].tocsr()
    
    # Vectorised over all rows at once instead of one pairwise call per row:
    # dot products and L2 norms are obtained from element-wise sparse products.
    dots = np.asarray(left.multiply(right).sum(axis=1), dtype=float).ravel()
    left_norms = np.sqrt(np.asarray(left.multiply(left).sum(axis=1), dtype=float).ravel())
    right_norms = np.sqrt(np.asarray(right.multiply(right).sum(axis=1), dtype=float).ravel())
    
    denominators = left_norms * right_norms
    similarities = np.zeros(dots.shape[0], dtype=float)
    has_norm = denominators > 0 # rows with an empty vector keep similarity 0
    similarities[has_norm] = dots[has_norm] / denominators[has_norm]
    return list(similarities)

def _add_cosine_columns(frame, tfidf, p, token, min_n, max_n):
    """Add the five pairwise tfidf cosine-similarity columns to frame (in place) using a fitted tfidf."""
    
    # Tfidf vectors for each question, their intersection and residual
    vectors = {name: tfidf.transform(frame[name+p]) for name in ('q1', 'q2', 'inter', 'extra')}
    
    column_suffix = token+str(min_n)+str(max_n)+p
    compared = (('q1', 'q2',    'q1_q2'),
                ('q1', 'inter', 'q1_in'),
                ('q1', 'extra', 'q1_ex'),
                ('q2', 'inter', 'q2_in'),
                ('q2', 'extra', 'q2_ex'))
    
    # Cosine similarities are calculated in parallel on the hstacked pair of matrices
    for left, right, tag in compared:
        stacked = efficient_hstack_csr([vectors[left], vectors[right]])
        frame['DF_cos_'+tag+'_'+column_suffix] = apply_parallel_sparse(stacked, cosine_sim)


def get_cosine_similarity_features(train, test, p, token, min_n, max_n):
    """
    Input: 
        train: pd DataFrame with questions in format 'q1'+postfix and 'q2'+postfix (plus 'inter'+postfix and
        'extra'+postfix columns). Tf-idf is trained on the corpus of q1 and q2 from this DataFrame
        test: pd DataFrame with the same columns
        p: postfix ('_src' or '_stem' or '_nostops')
        token: 'word' or 'char'. Analyzer in Tf-idf is set to token
        min_n, max_n: integers, used for ngram_range in Tf-idf
    Output: train and test pd DataFrames with cosine similarities columns added. The columns are added to the
        passed DataFrames themselves; the same objects are returned.
    """
    
    # Construct a corpus from all questions from train dataset and fit tfidf to it.
    corpus = train['q1'+p].tolist() + train['q2'+p].tolist()
    tfidf = TfidfVectorizer(max_df=0.8, min_df=3, analyzer=token, ngram_range=(min_n, max_n)).fit(corpus) 
    del corpus
    
    # Same feature construction for both sets; only train was used for fitting.
    _add_cosine_columns(train, tfidf, p, token, min_n, max_n)
    _add_cosine_columns(test,  tfidf, p, token, min_n, max_n)
    
    return train, test



In [79]:
%%time
# One feature set per position: postfix, tfidf analyzer and ngram range are read in parallel from these lists.
p_list =        ['_stem', '_stem', '_nostops']

token_list    = ['word',  'char',  'word'] # 'word' or 'char'
min_ngram_list =[  1,        3,      2]    # lower ngram range limit
max_ngram_list =[  1,        3,      2]    # upper ngram range limit

# token_list is the list defined above (the loop previously referenced an undefined name, analyzer_list).
for p, token, min_ngram, max_ngram in zip(p_list, token_list, min_ngram_list, max_ngram_list):
    t_start = time.time()
    print (p, token, min_ngram, max_ngram,)
    
    data, kagg = get_cosine_similarity_features(train=data, test=kagg,
                                p=p, token=token, min_n=min_ngram, max_n=max_ngram)
  
    print ('Done in {} minutes \n'.format(round((time.time()-t_start)/60,1) ))

_stem word 1 1
Done in 0.6 minutes 

_stem char 3 3
Done in 0.7 minutes 

_nostops word 2 2
Done in 0.5 minutes 

CPU times: user 15 s, sys: 2.98 s, total: 18 s
Wall time: 1min 46s


# Distances

In [10]:
from fuzzywuzzy import fuzz
import difflib, distance, datasketch
from simhash import Simhash
import nltk

In [37]:
def _minhash_ngram_similarity(q1, q2, n, num_perm=256):
    """Return WeightedMinHash.jaccard of two questions, hashed on their word n-grams."""
    signatures = []
    for question in (q1, q2):
        m = datasketch.MinHash(num_perm=num_perm)
        # Words of an n-gram are joined without a separator before hashing.
        for d in nltk.ngrams(question.split(' '), n):
            m.update("".join(d).encode('utf-8'))
        signatures.append(datasketch.WeightedMinHash(0, m.hashvalues))
    return signatures[0].jaccard(signatures[1])


def _minhash_sim_col(df, n):
    """Apply _minhash_ngram_similarity to the first two columns of df; df itself is left untouched."""
    cols = df.columns
    values = [_minhash_ngram_similarity(q1, q2, n) for q1, q2 in zip(df[cols[0]], df[cols[1]])]
    return pd.Series(values, index=df.index, name='result')


def minhash_sim_2gram (df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'result', indexed like df) with the MinHash-based Jaccard estimate 
    calculated on 2-word ngrams. NOTE(review): datasketch documents jaccard() as a similarity, 
    not a distance - confirm against the installed version.
    
    More information on MinHash similarity:
        Russian: https://habrahabr.ru/post/115147/
        English: https://en.wikipedia.org/wiki/MinHash
    """
    return _minhash_sim_col(df, 2)


def minhash_sim_3gram (df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'result', indexed like df) with the MinHash-based Jaccard estimate 
    calculated on 3-word ngrams
    """
    return _minhash_sim_col(df, 3)

def fuzzy_features(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions. Format needed: q1+postfix 
    and q2+postfix, eg: q1_src, q2_src.
    Output: pandas DataFrame indexed like df, containing 7 columns with different similarity scores from the 
    Fuzzywuzzy module. Column order is fixed (the order of the scorers below); df itself is not modified.
    
    In order to add the returned columns to the initial DataFrame, concatenate the output with it. For example 
    data = pd.concat((data, fuzzy_features(data[['q1_src', 'q2_src']])), axis=1) 
    """
    cols = df.columns
    p = cols[0][2:] # postfix, e.g. _src or _nostops ('q1' is cut off the first column name)
    
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    
    scorers = [
        ('f_part',       fuzz.partial_ratio),
        ('f_Qratio',     fuzz.QRatio),
        ('f_WRatio',     fuzz.WRatio),
        ('f_part_set',   fuzz.partial_token_set_ratio),
        ('f_part_sort',  fuzz.partial_token_sort_ratio),
        ('f_token_set',  fuzz.token_set_ratio),
        ('f_token_sort', fuzz.token_sort_ratio),
    ]
    
    # Built as a fresh frame: a set difference of column names (used before) has no stable order,
    # so the feature order could change between runs.
    features = pd.DataFrame(index=df.index)
    for name, scorer in scorers:
        features[name+p] = [scorer(q1, q2) for q1, q2 in zip(Q1, Q2)]
    return features

def add_simhash_dist(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'result', indexed like df) with Simhash distances between the questions, 
    computed on word or char X-grams (controlled by global variables NGRAM_TOKEN and NGRAM_VALUE, which must
    be set before the call). df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    
    def simhash_dist(q1, q2):
        # For 'word' the ngrams are built over words; otherwise the string is iterated char by char
        if NGRAM_TOKEN=='word':
            q1 = q1.split(' ')
            q2 = q2.split(' ')
        q1 = [' '.join(ngram) for ngram in nltk.ngrams(q1, NGRAM_VALUE)]
        q2 = [' '.join(ngram) for ngram in nltk.ngrams(q2, NGRAM_VALUE)]
        return Simhash(q1).distance(Simhash(q2))
    
    distances = [simhash_dist(q1,q2) for q1,q2 in zip(Q1, Q2)]
    return pd.Series(distances, index=df.index, name='result')

In [40]:
%%time
def add_distances(data, p):
    "compute all distance features for the question pair with the given postfix and return data with them added"
    
    global NGRAM_TOKEN, NGRAM_VALUE
    pair_cols = ['q1'+p, 'q2'+p]
    
    def report(template, started_at):
        # Print elapsed minutes (one decimal) for the finished block
        print (template.format(p, round((time.time()-started_at)/60,1) ))
    
    # fuzzywuzzy distances: several columns at once, hence the concat
    block_start_time = time.time()
    fuzzy_cols = apply_parallel(data[pair_cols], fuzzy_features)
    data = pd.concat((data, fuzzy_cols), axis=1)
    report('fuzzywuzzy dists for {} postfix are done in {} minutes', block_start_time)
    
    # simhash distances; add_simhash_dist reads its settings from the two globals
    block_start_time = time.time()
    for token, column in (('word', 'simhash_word3'), ('char', 'simhash_char3')):
        NGRAM_TOKEN = token
        NGRAM_VALUE = 3
        data[column+p] = apply_parallel(data[pair_cols], add_simhash_dist)
    report('simhash dists for {} postfix are done in {} minutes', block_start_time)
    
    # minhash distances (by far the slowest block; skip it to save most of the execution time)
    block_start_time = time.time()
    for column, minhash_func in (('mhash2', minhash_sim_2gram), ('mhash3', minhash_sim_3gram)):
        data[column+p] = apply_parallel(data[pair_cols], minhash_func)
    report('minhash dists for {} postfix are done in {} minutes', block_start_time)
    
    # additional basic distances, computed in the main process
    block_start_time = time.time()
    first, second = pair_cols
    data['diffl'+p] = [difflib.SequenceMatcher(None, a, b).ratio() for a, b in zip(data[first], data[second])]
    data['sor'+p]   = [1 - distance.sorensen(a, b) for a, b in zip(data[first], data[second])]
    data['jac'+p]   = [1 - distance.jaccard(a, b) for a, b in zip(data[first], data[second])]
    report('other dists for {} postfix are done in {} minutes', block_start_time)

    return data

# Add all distance features to train (data) and test (kagg) for every postfix and report the time per postfix.
# NOTE: the frames are rebound to the result, so running this cell twice adds the columns twice.
for p in postfixes:
    t_start = time.time()
    data = add_distances(data, p)
    kagg = add_distances(kagg, p)
    print ('{} postfix is done in {} minutes \n'.format(p, round((time.time()-t_start)/60,1) ))

fuzzywuzzy dists for _src postfix are done in 0.1 minutes
simhash dists for _src postfix are done in 0.0 minutes
minhash dists for _src postfix are done in 0.1 minutes
other dists for _src postfix are done in 0.0 minutes
fuzzywuzzy dists for _src postfix are done in 0.1 minutes
simhash dists for _src postfix are done in 0.0 minutes
minhash dists for _src postfix are done in 0.1 minutes
other dists for _src postfix are done in 0.0 minutes
_src postfix is done in 0.3 minutes 

fuzzywuzzy dists for _stem postfix are done in 0.1 minutes
simhash dists for _stem postfix are done in 0.0 minutes
minhash dists for _stem postfix are done in 0.1 minutes
other dists for _stem postfix are done in 0.0 minutes
fuzzywuzzy dists for _stem postfix are done in 0.1 minutes
simhash dists for _stem postfix are done in 0.0 minutes
minhash dists for _stem postfix are done in 0.1 minutes
other dists for _stem postfix are done in 0.0 minutes
_stem postfix is done in 0.3 minutes 

fuzzywuzzy dists for _nostops p

# Word mover distance on word2vec (GoogleNews) word embeddings

In [50]:
%%time
import gensim

# Read GoogleNews word2vec embeddings (the file name indicates 300-dimensional vectors for each word).
# The same file is loaded twice: the first model is used as is, the second one is normalized
# by init_sims(replace=True). NOTE(review): loading dominates the cell run time (see timing below).
w2v_model      = gensim.models.KeyedVectors.load_word2vec_format('GloVe/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GloVe/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_w2v_model.init_sims(replace=True)

CPU times: user 6min 39s, sys: 40.9 s, total: 7min 20s
Wall time: 7min 46s


In [51]:
def wmd(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'res', indexed like df) with the word mover distance between the questions,
    computed by the global w2v_model on the space-split words. df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    distances = [w2v_model.wmdistance(q1.split(' '), q2.split(' ')) for q1,q2 in zip(Q1,Q2)]
    return pd.Series(distances, index=df.index, name='res')

def wmd_norm(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'res', indexed like df) with the word mover distance between the questions,
    computed by the global norm_w2v_model (normalized embeddings) on the space-split words. 
    df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    distances = [norm_w2v_model.wmdistance(q1.split(' '), q2.split(' ')) for q1,q2 in zip(Q1,Q2)]
    return pd.Series(distances, index=df.index, name='res')

In [53]:
%%time
# Adding wordmover distances on GloVe embeddings
# Add word mover distances (plain and normalized embeddings) to train and test for every postfix.
# NOTE: postfixes already start with '_', so the resulting column names contain a double underscore,
# e.g. 'wmd__src'.
for p in postfixes:
       
    data['wmd_'+p]  = apply_parallel(data[['q1'+p, 'q2'+p]], wmd)
    data['wmdn_'+p] = apply_parallel(data[['q1'+p, 'q2'+p]], wmd_norm)
    
    kagg['wmd_'+p]  = apply_parallel(kagg[['q1'+p, 'q2'+p]], wmd)
    kagg['wmdn_'+p] = apply_parallel(kagg[['q1'+p, 'q2'+p]], wmd_norm)
    
    print (p)
    
# The embedding models are not used below; drop the references so the memory can be reclaimed.
del w2v_model, norm_w2v_model  

_src
_stem
_nostops
CPU times: user 640 ms, sys: 1.14 s, total: 1.78 s
Wall time: 1min 50s


# Custom features

In [17]:
def intersect_full_col(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series (named 'result', indexed like df) with the relative size of the longest word 
    sequence shared by the two questions (see description for intersect_words function below).
    df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    
    def word_sublist(w):
        """
        input: list of words
        returns: generator for all possible ordered word sequences from list. 
        E.g. ['a', 'b', 'c'] -> [('a',), ('a', 'b'), ('a', 'b', 'c'), ('b',), ('b', 'c'), ('c',)]
        """
        for i in range(len(w)):
            for j in range(i, len(w)):
                yield tuple(w[i:j+1])
    
    def intersect_words(q1, q2):
        """
        input: two strings with questions
        output: number of characters in the words of the longest shared word sequence divided by 
        the total number of characters (spaces included) in both questions. 0 if nothing is shared
        or both questions are empty.
        E.g. 
            input: q1 = 'a b c d e g'; q2 = 'h a b c a d e'
            max intersection = 'a b c'
            output: 3/24
        """
        total_chars = len(q1+q2)
        if total_chars == 0:
            # Two empty questions: nothing to compare and nothing to divide by
            return 0
        
        intersections = set(word_sublist(q1.split(' '))) & set(word_sublist(q2.split(' ')))
        if len(intersections) == 0:
            return 0
        
        # Longest sequence in words; ties are resolved by the character count so that the result 
        # does not depend on the iteration order of the set.
        max_intersection = max(intersections, key=lambda seq: (len(seq), sum(len(word) for word in seq)))
        char_sum = sum(len(word) for word in max_intersection)
        return char_sum/total_chars
    
    values = [intersect_words(q1,q2) for q1,q2 in zip(Q1, Q2)]
    return pd.Series(values, index=df.index, name='result')

def stopshare(q, q_nostops):
    """
    input: two strings: the full question and the same question with stop-words removed
    output: share of characters of the full question that was removed together with the stop-words. 
    -1 if the full question is empty
    """
    full_len = len(q)
    if full_len == 0:
        return -1 
    return (full_len-len(q_nostops))/full_len

def add_len_features(data, p):
    """
    input: DataFrame with text columns named q1+postfix, q2+postfix, inter+postfix and extra+postfix.
    output: the same DataFrame with word-count and char-count columns added for each of the four text columns
    (word count = number of pieces after splitting on a single space)
    """
    text_parts = ('q1', 'q2', 'inter', 'extra')
    
    # Word lengths first, char lengths second, to keep the column order stable
    for part in text_parts:
        data[part+'_word_len'+p] = data[part+p].apply(lambda q: len(q.split(' ')))
    
    for part in text_parts:
        data[part+'_char_len'+p] = data[part+p].apply(len)
    
    return data

def pos_diff_from_start_col(df):
    """
    Input: pandas DataFrame. Must have three columns: the two questions and their shared words
    (space-separated, or the placeholder 'xxx' when nothing is shared)
    Output: pandas Series (named 'result', indexed like df), containing custom word positional 
    difference metric (see description for positional_diff function below). df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    Inter = df[cols[2]]

    def positional_diff(q1,q2,inter):
        """
        Input:
            q1: string, text for the first question
            q2: string, text for the second question
            inter: shared words for questions
        Output: number, custom word positional difference metric (1 when inter is 'xxx')
        Calculation: for each word in intersection we find the index of its first occurrence in each 
        question. Then we average the absolute index differences over the shared words and divide the 
        result by the average word length of the two questions. 
        The higher the value, the more mixed the word order is.
        Raises ValueError if a shared word is missing from one of the questions.
        """
        if inter=='xxx':
            return 1

        inter = inter.split(' ')
        q1 = q1.split(' ')
        q2 = q2.split(' ')

        diff_sum = 0
        for word in inter:
            diff_sum += abs(q1.index(word)-q2.index(word))

        diff_sum /= len(inter)
        diff_sum /= (len(q1)+len(q2))/2
        return diff_sum
    
    values = [positional_diff(q1, q2, inter) 
                            for q1,q2,inter 
                            in zip(Q1,Q2,Inter)]
    return pd.Series(values, index=df.index, name='result')

def pos_diff_from_end_col(df):
    """
    Input: pandas DataFrame. Must have three columns: the two questions and their shared words
    (space-separated, or the placeholder 'xxx' when nothing is shared)
    Output: pandas Series (named 'result', indexed like df), containing custom word positional 
    difference metric (see description for positional_diff function below). df itself is not modified.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    Inter = df[cols[2]]

    def positional_diff(q1,q2,inter):
        """
        Input:
            q1: string, text for the first question
            q2: string, text for the second question
            inter: shared words for questions
        Output: number, custom word positional difference metric (1 when inter is 'xxx')
        Calculation: for each word in intersection we find its index COUNTED FROM THE QUESTION END in 
        each question (index of the first occurrence minus the question length). Then we average the 
        absolute index differences over the shared words and divide the result by the average word 
        length of the two questions. 
        The higher the value, the more mixed the word order is.
        Raises ValueError if a shared word is missing from one of the questions.
        NOTE(review): for repeated words the first occurrence is used, not the one closest to the end.
        """
        if inter=='xxx':
            return 1

        inter = inter.split(' ')
        q1 = q1.split(' ')
        q2 = q2.split(' ')
        len_q1 = len(q1)
        len_q2 = len(q2)

        diff_sum = 0
        for word in inter:
            q1_pos = q1.index(word) - len_q1
            q2_pos = q2.index(word) - len_q2
            diff_sum += abs(q1_pos-q2_pos)

        diff_sum /= len(inter)
        diff_sum /= (len_q1+len_q2)/2
        return diff_sum
    
    values = [positional_diff(q1, q2, inter) 
                            for q1,q2,inter 
                            in zip(Q1,Q2,Inter)]
    return pd.Series(values, index=df.index, name='result')



def add_customs(data, p):
    """
    Apply the custom feature functions to data (pd DataFrame) for the given postfix.
    
    Adds the full-intersection share, both positional difference metrics and the length features for
    postfix p. The stop-word share features do not depend on p (they compare the _src and _nostops
    versions of each question), so they are added once, on the first call where they are still missing.
    Returns data with the new columns.
    """
    
    data['full_inter'+p]   = apply_parallel(data[['q1'+p, 'q2'+p]], intersect_full_col)
    data['pos_diff'+p]     = apply_parallel(data[['q1'+p, 'q2'+p, 'inter'+p]], pos_diff_from_start_col)
    data['pos_diff_end'+p] = apply_parallel(data[['q1'+p, 'q2'+p, 'inter'+p]], pos_diff_from_end_col)
    
    # The previous condition compared the postfix with the column name 'q1_stopshare' and was never true,
    # so these two features were silently skipped.
    stopshare_inputs = ['q1_src', 'q1_nostops', 'q2_src', 'q2_nostops']
    if 'q1_stopshare' not in data.columns and all(col in data.columns for col in stopshare_inputs):
        data['q1_stopshare'] = [stopshare(q, q_nostops) for q, q_nostops in zip(data.q1_src, data.q1_nostops)]
        data['q2_stopshare'] = [stopshare(q, q_nostops) for q, q_nostops in zip(data.q2_src, data.q2_nostops)]
    data = add_len_features(data, p)
    return data
    
# Add the custom features to train (data) and test (kagg) for every postfix and report the time per postfix.
for p in postfixes:
    t_start = time.time()
    data = add_customs(data, p)
    kagg = add_customs(kagg, p)    
    print ('{} postfix is done in {} minutes \n'.format(p, round((time.time()-t_start)/60,1) ))
    

_src postfix is done in 0.0 minutes 

_stem postfix is done in 0.0 minutes 

_nostops postfix is done in 0.0 minutes 



In [18]:
%%time
def question_uniquness(q, word_count):
    """
    input:
        q - string, question (words separated by a single space)
        word_count - dictionary: word -> number of occurrences
    output: mean of 1/count over the words of the question whose count is at least 3. 
    Ranges from 1/3 (only rare words) down to values close to 0 (only very common words); 
    0.35 is returned when no word of the question has a count of at least 3.
    """
    
    inverse_count_total = 0 # sum of 1.0/count over the counted words
    counted_words = 0       # number of words with count >= 3
    
    for token in q.split(' '):
        occurrences = word_count.get(token, 0)
        if occurrences < 3:
            # Unknown and too rare words are ignored
            continue
        inverse_count_total += 1.0/occurrences
        counted_words += 1
    
    if counted_words == 0:
        return 0.35
    return inverse_count_total/counted_words

def add_uniquness_features(data, p, word_count):
    """
    input:
        data - DataFrame with all columns
        p - postfix
        word_count - dictionary: word -> number of occurrences
    output: the same DataFrame with question_uniquness applied to both questions, their intersection 
    and residual ('extra') for the given postfix
    """    
    
    for part in ('q1', 'q2', 'inter', 'extra'):
        data[part+'_uniq'+p] = data[part+p].apply(lambda q: question_uniquness(q, word_count))
    return data


for p in postfixes:
    t_start = time.time()
    
    # Compute word counts dictionary for train dataset. Recomputed for each postfix.
    # Only the two question columns are iterated (no per-row Series construction as with iterrows).
    word_count = dict()
    for q1, q2 in zip(data['q1'+p], data['q2'+p]):
        for w in q1.split(' ') + q2.split(' '):
            word_count[w] = word_count.get(w, 0)+1
    
    # Add uniquness features to DataFrames; test (kagg) uses the counts from train as well
    data = add_uniquness_features(data, p, word_count)
    kagg = add_uniquness_features(kagg, p, word_count)
        
    print ('{} postfix is done in {} minutes'.format(p, round((time.time()-t_start)/60,1) ))

_src postfix is done in 0.0 minutes
_stem postfix is done in 0.0 minutes
_nostops postfix is done in 0.0 minutes
CPU times: user 484 ms, sys: 8.18 ms, total: 492 ms
Wall time: 495 ms


# Custom features: positional differences matrix

In [120]:
%%time
def pos_diff_matrix(df, max_len, prefix, from_end):
    """
    input: 
        df: pd DataFrame, that contains two question columns. Must not contain Nulls
        max_len: int, number of words of the second question that are looked at
        prefix: string, column name prefix.
        from_end: boolean, if False - words are taken from the question start. Else - from the end.
    output: pd DataFrame with max_len columns named prefix+'1' .. prefix+str(max_len) and a fresh 
        0..n-1 index. Values are positional differences (may be negative), max_len*5 minus the word
        position for words missing from the first question, and max_len*10 as padding.
    """
    
    cols = df.columns
    
    def text_intersect(q1, q2, max_len):
        """
        input: two questions in strings, number of words to which the second question is truncated
        output: list of max_len elements, that represent ordinal word differences (see comments in code)
        """
        first_words = q1.split(' ')
        second_words = q2.split(' ')
        if from_end:
            first_words = first_words[::-1]
            second_words = second_words[::-1]
        
        # Only the second question is truncated
        second_words = second_words[:max_len]
        
        # Rank of every distinct word of the first question, in order of first appearance.
        # Repeated words do not advance the rank, so it counts distinct words seen so far.
        rank_in_first = {}
        for word in first_words:
            rank_in_first.setdefault(word, len(rank_in_first))
        
        # Difference between the rank in the first question and the index in the second one.
        # Words that are absent from the first question get the rank 5*max_len.
        diffs = [rank_in_first.get(word, max_len*5) - i for i, word in enumerate(second_words)]
        
        # Pad with max_len*10 when the second question has fewer than max_len words
        diffs += [max_len*10]*(max_len-len(diffs))
        return diffs
    
    # One row of differences per question pair
    rows = [text_intersect(q1, q2, max_len) for q1, q2 in zip(df[cols[0]], df[cols[1]])]
    return pd.DataFrame(rows, columns = [prefix+str(i) for i in range(1, max_len+1)])

assert data.dropna().shape == data.shape
MAX_WORDS_IN_QUESTION = 10
pos_diff_postfixes = ['_nostops', '_stem']

# Calculate posdiff matrices for each postfix in pos_diff_postfixes from start and from end. 
# All matrices of one dataset are concatenated into a single pd DataFrame. Resulting number of columns is:
# MAX_WORDS_IN_QUESTION*len(pos_diff_postfixes)*2 = 40 for current parameters.
# The blocks are collected in lists and concatenated once; previously kagg_posdiff was overwritten
# on every iteration, so it only kept the columns of the last postfix.
data_blocks = []
kagg_blocks = []
for p in pos_diff_postfixes:
    for blocks, frame in ((data_blocks, data), (kagg_blocks, kagg)):
        questions = frame[['q1'+p, 'q2'+p]]
        blocks.append(pos_diff_matrix(questions, MAX_WORDS_IN_QUESTION, 
                                      'posdiff'+p, from_end=False))
        blocks.append(pos_diff_matrix(questions, MAX_WORDS_IN_QUESTION, 
                                      'posdiff_end'+p, from_end=True))

data_posdiff = pd.concat(data_blocks, axis=1)
kagg_posdiff = pd.concat(kagg_blocks, axis=1)

assert data_posdiff.dropna().shape == data_posdiff.shape
assert list(kagg_posdiff.columns) == list(data_posdiff.columns)

#data_posdiff.to_csv('data/features/train_pos_diff_matrix.csv', index=False)
#kagg_posdiff.to_csv('data/features/test_pos_diff_matrix.csv', index=False)

CPU times: user 1.47 s, sys: 23.3 ms, total: 1.49 s
Wall time: 1.49 s


# Proper nouns

In [74]:
# reload unprocessed questions. We'll need to get all words starting with capital letter, and sentences' endings
# Raw train questions: two columns are selected by position and renamed to q1/q2
# (presumably the two question text columns of train.csv - verify against the file header).
data_qs = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), usecols=[3,4])
data_qs.columns = ['q1', 'q2']
data_qs.fillna('xxx', inplace=True)

# Raw test questions: the columns sit at different positions in test.csv.
kagg_qs = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'),  usecols=[1,2])
kagg_qs.columns = ['q1', 'q2']
kagg_qs.fillna('xxx', inplace=True)

Populating the interactive namespace from numpy and matplotlib
40000 40000 37295 37255


In [64]:
%%time
import re
import nltk.data
# Punkt sentence tokenizer for English, loaded from the NLTK data files.
# get_proper_nouns_list uses it to split every question into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def sentence_regex(text):
    """
    input: text string
    output: string with some common English contractions expanded ("can't" -> "cannot", "'ve" -> "have", ...)
    and every run of non-alphanumeric characters replaced by a single space. Letter case is kept.
    """
    # Applied strictly in this order ("can't" has to be handled before the generic "n't").
    # NOTE(review): inside the first character class "+-=" is read as a range, so "/", ":", ";" and "<"
    # also survive the first pass; the last substitution turns all remaining punctuation into spaces anyway.
    substitutions = (
        (r"[^A-Za-z0-9^,!.\'+-=]", " "),
        (r"\'s", " 's "),
        (r"\'ve", " have "),
        (r"can't", " cannot "),
        (r"n't", " not "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r";", " "),
        (r"!", " ! "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"\s{2,}", " "),
        ('[^0-9a-zA-Z]+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def get_proper_nouns_list(data):
    """
    input: pd DataFrame with a single column, in which questions are stored in string format
    output: pd Series (same index as the input, named '<column>_cap') holding, for each question,
    the list of words that start with a capital letter in the middle of a sentence

    The input frame is left untouched.
    """
    q = data.columns[0]

    def capitalised_words(question):
        found = []
        # Split the question into sentences and clean every sentence with the regexes
        for sentence in tokenizer.tokenize(question):
            # split() (not split(' ')) so that a leading space left by the clean-up does not turn
            # the real first word of the sentence into a "mid-sentence" word
            words = sentence_regex(sentence).split()
            # Skip the first word of the sentence; keep words longer than one character
            # whose first character is upper case
            found.extend(word for word in words[1:] if len(word) > 1 and word[0].isupper())
        return found

    return data[q].apply(capitalised_words).rename(q + '_cap')

# Capitalised (proper-noun candidate) words of every question, computed in parallel for train ...
data_qs['q1_cap'], data_qs['q2_cap'] = (
    apply_parallel(data_qs[[col]], get_proper_nouns_list) for col in ('q1', 'q2')
)

# ... and for test
kagg_qs['q1_cap'], kagg_qs['q2_cap'] = (
    apply_parallel(kagg_qs[[col]], get_proper_nouns_list) for col in ('q1', 'q2')
)

CPU times: user 118 ms, sys: 99.7 ms, total: 218 ms
Wall time: 1.66 s


In [65]:
%%time
def add_propernouns_features(data):
    """
    input: pd DataFrame, that contains following columns:
        'q1' and 'q2' - questions, either as raw strings or as lists of already cleaned sentences
        'q1_cap' and 'q2_cap' - lists of words, that start with a capital letter (proper nouns)
    output: pd DataFrame with 6 features: number of proper nouns in each question, size of their
    intersection and of their difference; relative size of intersection and difference
    (both relative values are -1 when neither question has a proper noun)

    The lists stored in the input frame are not modified.
    """
    columns = ['PN_q1_count', 'PN_q2_count', 'PN_match_count', 'PN_mismatch_count',
               'PN_match_relative', 'PN_mismatch_relative']

    def first_words(question):
        # A raw string is split into cleaned sentences first; iterating it directly would
        # walk over single characters and never find a first word.
        if isinstance(question, str):
            sentences = [sentence_regex(S) for S in tokenizer.tokenize(question)]
        else:
            sentences = list(question)
        words = []
        for S in sentences:
            tokens = S.split()
            if tokens:
                words.append(tokens[0])
        return words

    results = []
    for index, row in data.iterrows():
        # Work on copies so that the caller's lists stay as they were
        q1_cap = list(row['q1_cap'])
        q2_cap = list(row['q2_cap'])

        # First words of sentences were excluded when the proper nouns were collected.
        # Add the first word of a q1 sentence back if q2 lists it as a proper noun ...
        q2_known = set(q2_cap)
        q1_cap.extend(word for word in first_words(row['q1']) if word in q2_known)
        # ... and vice versa (checked against the already extended q1 list)
        q1_known = set(q1_cap)
        q2_cap.extend(word for word in first_words(row['q2']) if word in q1_known)

        # Proper nouns shared by both questions / present in only one of them
        proper_matches = set(q1_cap).intersection(q2_cap)
        proper_mismatches = set(q1_cap).symmetric_difference(q2_cap)
        total = len(q1_cap) + len(q2_cap)

        # Dict, from which the row will be constructed at the return
        row_dict = {}
        row_dict['PN_q1_count'] = len(q1_cap)
        row_dict['PN_q2_count'] = len(q2_cap)
        row_dict['PN_match_count'] = len(proper_matches)
        row_dict['PN_mismatch_count'] = len(proper_mismatches)

        if total > 0:
            row_dict['PN_match_relative'] = len(proper_matches) / total
            row_dict['PN_mismatch_relative'] = len(proper_mismatches) / total
        else:
            row_dict['PN_match_relative'] = -1
            row_dict['PN_mismatch_relative'] = -1

        results.append(row_dict)
    # Explicit columns keep the schema stable even for an empty chunk
    return pd.DataFrame(results, columns=columns)

# Proper-noun features for train and test, computed chunk-wise in parallel
data_PN, kagg_PN = (
    apply_parallel(questions[['q1', 'q2', 'q1_cap', 'q2_cap']], add_propernouns_features)
    for questions in (data_qs, kagg_qs)
)

CPU times: user 97.3 ms, sys: 74.2 ms, total: 171 ms
Wall time: 2.16 s


In [83]:
# Build regex patterns of countries and cities. We will search for shared locations

locations = pd.read_csv("locations/cities.csv")

def _location_pattern(names):
    """
    input: iterable of location names
    output: regex pattern string that matches any of the names as a whole word

    Names are escaped, so characters such as '.' or '(' are matched literally, and sorted
    longest first, so "New York" wins over "York" and the pattern is the same on every run
    (joining a set directly depends on its arbitrary iteration order).
    """
    cleaned = {str(name).strip() for name in names}
    cleaned.discard('')
    if not cleaned:
        return r"(?!)"  # matches nothing
    ordered = sorted(cleaned, key=lambda name: (-len(name), name))
    # Lookarounds instead of \b: they also work for names that start or end with punctuation
    return r"(?<!\w)(?:" + "|".join(re.escape(name) for name in ordered) + r")(?!\w)"

countries = _location_pattern(locations['Country'].dropna())
cities    = _location_pattern(locations['City'].dropna())

print ('unique countries:', locations['Country'].nunique())
print ('unique cities:', locations['City'].nunique())

del locations

unique countries: 175
unique cities: 3489


In [67]:
%%time
def add_country_features(data, pattern=None):
    """
    input:
        data: pd DataFrame, that contains following columns:
            'q1' and 'q2' - questions in string format
        pattern: regex pattern string used to find country names (case-insensitive);
            defaults to the notebook-level `countries` pattern
    output: pd DataFrame with 6 features: number of countries for both questions, number of countries
    they share and number of countries mentioned in only one of them; the last two relative to the
    total number of mentions (-1 when neither question mentions a country)
    """
    if pattern is None:
        pattern = countries
    # Compile once per call instead of looking the (large) pattern up for every row
    matcher = re.compile(pattern, flags=re.IGNORECASE)
    columns = ['loc_q1_country_num', 'loc_q2_country_num', 'loc_country_match_num',
               'loc_country_mismatch_num', 'loc_country_match_relative',
               'loc_country_mismatch_relative']

    results = []
    for _, row in data.iterrows():
        # Get lists of mentioned countries for each question
        q1_countries = [country.lower() for country in matcher.findall(row['q1'])]
        q2_countries = [country.lower() for country in matcher.findall(row['q2'])]
        # Countries in both questions / in exactly one of them. The mismatch is symmetric,
        # so swapping q1 and q2 gives the same value.
        country_matches = set(q1_countries).intersection(q2_countries)
        country_mismatches = set(q1_countries).symmetric_difference(q2_countries)
        total = len(q1_countries) + len(q2_countries)

        # Dict, from which the row will be constructed at the return
        row_dict = {}
        row_dict['loc_q1_country_num'] = len(q1_countries)
        row_dict['loc_q2_country_num'] = len(q2_countries)
        row_dict['loc_country_match_num'] = len(country_matches)
        row_dict['loc_country_mismatch_num'] = len(country_mismatches)

        if total > 0:
            row_dict['loc_country_match_relative'] = len(country_matches) / total
            row_dict['loc_country_mismatch_relative'] = len(country_mismatches) / total
        else:
            row_dict['loc_country_match_relative'] = -1
            row_dict['loc_country_mismatch_relative'] = -1

        results.append(row_dict)
    # Explicit columns keep the schema stable even for an empty chunk
    return pd.DataFrame(results, columns=columns)

# Append the country features column-wise to the proper-noun features of train and test
data_PN, kagg_PN = (
    pd.concat([pn_feats, apply_parallel(questions[['q1', 'q2']], add_country_features)], axis=1)
    for pn_feats, questions in ((data_PN, data_qs), (kagg_PN, kagg_qs))
)

CPU times: user 70.4 ms, sys: 59.7 ms, total: 130 ms
Wall time: 5.21 s


In [69]:
%%time
def add_city_features(data, pattern=None):
    """
    input:
        data: pd DataFrame, that contains following columns:
            'q1' and 'q2' - questions in string format
        pattern: regex pattern string used to find city names (case-insensitive);
            defaults to the notebook-level `cities` pattern
    output: pd DataFrame with 6 features: number of cities for both questions, number of cities
    they share and number of cities mentioned in only one of them; the last two relative to the
    total number of mentions (-1 when neither question mentions a city)
    """
    if pattern is None:
        pattern = cities
    # Compile once per call instead of looking the (large) pattern up for every row
    matcher = re.compile(pattern, flags=re.IGNORECASE)
    columns = ['loc_q1_city_num', 'loc_q2_city_num', 'loc_city_match_num',
               'loc_city_mismatch_num', 'loc_city_match_relative',
               'loc_city_mismatch_relative']

    results = []
    for _, row in data.iterrows():
        # Get lists of mentioned cities for each question
        q1_cities = [city.lower() for city in matcher.findall(row['q1'])]
        q2_cities = [city.lower() for city in matcher.findall(row['q2'])]
        # Cities in both questions / in exactly one of them. The mismatch is symmetric,
        # so swapping q1 and q2 gives the same value.
        city_matches = set(q1_cities).intersection(q2_cities)
        city_mismatches = set(q1_cities).symmetric_difference(q2_cities)
        total = len(q1_cities) + len(q2_cities)

        # Dict, from which the row will be constructed at the return
        row_dict = {}
        row_dict['loc_q1_city_num'] = len(q1_cities)
        row_dict['loc_q2_city_num'] = len(q2_cities)
        row_dict['loc_city_match_num'] = len(city_matches)
        row_dict['loc_city_mismatch_num'] = len(city_mismatches)

        if total > 0:
            row_dict['loc_city_match_relative'] = len(city_matches) / total
            row_dict['loc_city_mismatch_relative'] = len(city_mismatches) / total
        else:
            row_dict['loc_city_match_relative'] = -1
            row_dict['loc_city_mismatch_relative'] = -1

        results.append(row_dict)
    # Explicit columns keep the schema stable even for an empty chunk
    return pd.DataFrame(results, columns=columns)

# Append the city features column-wise to the features collected so far
data_PN, kagg_PN = (
    pd.concat([pn_feats, apply_parallel(questions[['q1', 'q2']], add_city_features)], axis=1)
    for pn_feats, questions in ((data_PN, data_qs), (kagg_PN, kagg_qs))
)

# Persist proper-noun and location features, then release the frames of this section
data_PN.to_csv('data/features/train_PN.csv', index=False)
kagg_PN.to_csv('data/features/test_PN.csv',  index=False)
del data_PN, kagg_PN, data_qs, kagg_qs

CPU times: user 292 ms, sys: 195 ms, total: 487 ms
Wall time: 1min 22s


# Jaccard char ngrams (Mephistphel's features)

In [92]:
# Jaccard similarities for char ngrams
# Source: https://www.kaggle.com/c/quora-question-pairs/discussion/32313

def _char_jaccard_scores(m_q1, m_q2, guard_zero_den=False):
    """
    input:
        m_q1, m_q2: sparse count matrices (rows = questions, columns = the selected char n-grams)
        guard_zero_den: if True, a zero denominator is replaced by 1, so the score becomes 0
                        instead of NaN
    output: list of (column_suffix, scores) pairs, one score per row:
        '_jaccard'         - shared distinct n-grams / distinct n-grams of either question
        '_all_jaccard'     - sum of per-n-gram minimum counts / total count of both questions
        '_all_jaccard_max' - sum of per-n-gram minimum counts / sum of per-n-gram maximum counts
    """
    def ratio(v_num, v_den):
        if guard_zero_den:
            v_den[np.where(v_den == 0)] = 1
        return np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

    seen_q1, seen_q2 = m_q1 > 0, m_q2 > 0
    shared_counts = m_q1.minimum(m_q2).sum(axis=1)
    return [
        ('_jaccard', ratio(seen_q1.minimum(seen_q2).sum(axis=1), seen_q1.maximum(seen_q2).sum(axis=1))),
        ('_all_jaccard', ratio(shared_counts, m_q1.sum(axis=1) + m_q2.sum(axis=1))),
        ('_all_jaccard_max', ratio(shared_counts, m_q1.maximum(m_q2).sum(axis=1))),
    ]

ngram_sizes = (('unigram', 1), ('bigram', 2), ('trigram', 3))

for p in postfixes:
    print (p)
    #kagg = pd.read_csv(os.path.join(DATA_PATH, 'kagg_feat'+p+'.csv'), sep=';')

    # The vocabulary of char 1-3-grams is fitted on the train questions only
    cv_char = CountVectorizer(ngram_range=(1, 3), analyzer='char')
    ch_freq = np.array(cv_char.fit_transform(data['q1'+p].tolist() + data['q2'+p].tolist()).sum(axis=0))[0, :]

    # Column indices of the vocabulary, grouped by n-gram length
    ngram_columns = {}
    for name, size in ngram_sizes:
        ngram_columns[name] = np.sort([ix for gram, ix in cv_char.vocabulary_.items() if len(gram) == size])

    # Train first, then test; every question set is vectorised once and reused for all n-gram sizes
    for frame in (data, kagg):
        m_q1 = cv_char.transform(frame['q1'+p].values)
        m_q2 = cv_char.transform(frame['q2'+p].values)
        for name, size in ngram_sizes:
            ix = ngram_columns[name]
            # As before, only the trigram scores guard against an empty denominator
            scores = _char_jaccard_scores(m_q1[:, ix], m_q2[:, ix], guard_zero_den=(name == 'trigram'))
            for suffix, v_score in scores:
                frame[name + suffix + p] = v_score

    jaccard_feat = [name + suffix + p
                    for name, size in ngram_sizes
                    for suffix in ('_jaccard', '_all_jaccard', '_all_jaccard_max')]

    # Undefined scores (0/0) become -1. The result is assigned back: calling fillna(inplace=True)
    # on frame[jaccard_feat] only changes a temporary copy, which left the NaNs in place (and a
    # later kagg.fillna('xxx') then wrote strings into these numeric columns).
    kagg[jaccard_feat] = kagg[jaccard_feat].fillna(-1)
    data[jaccard_feat] = data[jaccard_feat].fillna(-1)

    print (p)

_src
_src
_stem
_stem
_nostops
_nostops


# Synonyms analysis

In [4]:
import nltk
import gensim
import goslate
from PyDictionary import PyDictionary

def apply_parallel_fill_dict(my_list, my_func):
    """
    Input: 
        my_list: list of words
        my_func: custom function which will be applied to my_list. Must accept list as input and return a dict.
    Output: combined dictionary from results of my_func.
    
    my_list is split into NUM_CORES parts and the function is applied to each part independently.
    If a key is returned for more than one part, the value from the later part wins.
    """
    list_splitted = np.array_split(my_list, NUM_CORES)
    pool = mp.Pool(NUM_CORES)
    try:
        partial_dicts = pool.map(my_func, list_splitted)
    finally:
        # Release the worker processes even when my_func raises in one of them
        pool.close()
        pool.join()

    result = dict()
    for dictionary in partial_dicts:
        result.update(dictionary)
    return result

def get_antonyms_dict(words):
    """
    input: list of words
    output: dictionary {word: list_of_antonyms}; words whose lookup returns nothing are left out
    """
    lookup = PyDictionary()
    # One lookup per word, in input order
    candidates = ((word, lookup.antonym(word)) for word in words)
    return {word: antonyms for word, antonyms in candidates if antonyms}

def get_synonyms_dict(words):
    """
    input: list of words
    output: dictionary {word: list_of_synonyms}; words whose lookup returns nothing are left out
    """
    lookup = PyDictionary()
    # One lookup per word, in input order
    candidates = ((word, lookup.synonym(word)) for word in words)
    return {word: synonyms for word, synonyms in candidates if synonyms}

# Get array of lists, where each list contains all the words from one question pair
# (q1 and q2 joined with a space, then split on spaces).
# Only the first 1000 rows of train and the first 1000 rows of test are used here.
all_sentences = np.append(
                     data[['q1_src', 'q2_src']].iloc[:1000]
                     .apply(lambda row: ' '.join((row['q1_src'], row['q2_src'])).split(' '), axis=1)
                     .values,
    
                     kagg[['q1_src', 'q2_src']].iloc[:1000]
                     .apply(lambda row: ' '.join((row['q1_src'], row['q2_src'])).split(' '), axis=1)
                     .values)

# Fill the words dictionary with an empty list for each unique word in the text.
# `flatten` is not defined in this cell - presumably it comes from the %pylab namespace; TODO confirm
words_dict = dict ([(word, []) for word in set(flatten(all_sentences))])
print ('total unique words:', len(words_dict))
# 16.5/6479 looks like a measured minutes-per-word rate from an earlier run - TODO confirm
print ('estimated time to end, min:', round(len(words_dict)*(16.5/6479),1))

ImportError: No module named 'core'

In [None]:
t_start = time.time()
# Antonym lookup, currently disabled; it would be produced and stored the same way as the synonyms below
#antonym_dict = apply_parallel_fill_dict(list(words_dict), get_antonyms_dict)
#with open('data/dics/antonyms.pkl', 'wb') as F:
#    pickle.dump(antonym_dict, F)    

# Look up synonyms for every collected word in parallel and store the result on disk,
# where the synonym-feature cell reads it back
synonym_dict = apply_parallel_fill_dict(list(words_dict), get_synonyms_dict)
with open('data/dics/synonyms.pkl', 'wb') as F:
    pickle.dump(synonym_dict, F)

In [None]:
%%time
def syn(q1, q2, synonyms=None):
    """
    input:
        q1, q2: two questions in string format (words separated by single spaces)
        synonyms: dictionary {word: list_of_synonyms}; defaults to the notebook-level `syn_dict`
    output: pd Series with three features:
        'nosyn_share' - number of words (counted over both questions) that have neither themselves nor
            a synonym in the other question, divided by the mean question length;
        'syn_unknown_share' - number of words (counted over both questions) for which no synonyms
            are defined, divided by the mean question length;
        'syn_jaccard' - jaccard similarity between the questions after every word was replaced
            (if possible) by the first of [word] + synonyms that is present in the other question.
    Because the counts cover both questions while the divisor is the mean length, the two shares
    range from 0 to 2.
    """
    if synonyms is None:
        synonyms = syn_dict

    q1 = q1.split(' ')
    q2 = q2.split(' ')
    q1_words = set(q1)
    q2_words = set(q2)

    def replace_with_shared_synonyms(words, other_words):
        # Returns (new word list, number of words without a match, number of words without synonyms)
        replaced = []
        no_match = 0
        unknown = 0
        for w in words:
            candidates = [w] + list(synonyms.get(w, []))  # current word plus its synonyms
            # Only the first candidate found in the other question is used
            match = next((c for c in candidates if c in other_words), None)
            if match is None:
                # Nothing found in the other question: leave the word untouched
                replaced.append(w)
                no_match += 1
            else:
                replaced.append(match)
            # The candidate list holds only the word itself: no synonyms are known for it
            if len(candidates) == 1:
                unknown += 1
        return replaced, no_match, unknown

    q1_new, q1_no_syn, q1_unknown = replace_with_shared_synonyms(q1, q2_words)
    q2_new, q2_no_syn, q2_unknown = replace_with_shared_synonyms(q2, q1_words)
    no_syn_count = q1_no_syn + q2_no_syn
    unknown_words_count = q1_unknown + q2_unknown

    # NOTE(review): both questions are rewritten towards each other, so a synonym pair such as
    # big/large is swapped rather than unified; comparing q1_new with the original q2 may be what
    # was intended - confirm before relying on this feature.
    jaccard_similarity = len(set(q1_new).intersection(set(q2_new))) / len(set(q1_new+q2_new))

    mean_length = (len(q1) + len(q2))/2
    return pd.Series([no_syn_count/mean_length, unknown_words_count/mean_length, jaccard_similarity],
                     index = ['nosyn_share', 'syn_unknown_share', 'syn_jaccard'])

# Synonym dictionary produced by the lookup cell above; syn() reads it as `syn_dict`
with open('data/dics/synonyms.pkl', 'rb') as F:
    syn_dict = pickle.load(F)

syn_features = []
p = '_nostops'
# Append the synonym features column-wise to train and test
data, kagg = (
    pd.concat((frame,
               frame.apply(lambda row: syn(row['q1'+p], row['q2'+p]), axis=1)),
              axis=1, copy=False)
    for frame in (data, kagg)
)


# Save dense features

In [None]:
# Write the selected feature columns of train and test to CSV files in DATA_PATH.
# NOTE(review): `features` is not defined in the cells shown here - presumably a list of column
# names built earlier in the notebook; confirm it exists on a fresh kernel.
data[features].to_csv(os.path.join(DATA_PATH, 'train_NLP_features.csv'), index=False)
kagg[features].to_csv(os.path.join(DATA_PATH,  'test_NLP_features.csv'), index=False)

# Save sparse matrix

For the reasons this sparse representation was chosen over the other possible ones, see additional_1_choosing_sparse_format.ipynb

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse 

In [28]:
%%time
# For each text variant a TF-IDF model is fitted on all train questions and then applied to the
# 'inter' and 'extra' columns of train and test. The resulting blocks are stacked side by side in
# the order: stem inter, stem extra, tags inter, tags extra.
data_blocks, kagg_blocks = [], []
for p_text, tfidf_params in (('_stem', {'max_df': 0.8, 'min_df': 3}), ('_tags', {})):
    corpus = data['q1'+p_text].tolist() + data['q2'+p_text].tolist()
    tfidf = TfidfVectorizer(**tfidf_params).fit(corpus)
    for part in ('inter', 'extra'):
        data_blocks.append(tfidf.transform(data[part+p_text]))
        kagg_blocks.append(tfidf.transform(kagg[part+p_text]))

data_sparse = sparse.hstack(data_blocks, format='csr')
kagg_sparse = sparse.hstack(kagg_blocks, format='csr')
del corpus, tfidf, tfidf_params, part, data_blocks, kagg_blocks

CPU times: user 3min 50s, sys: 25.6 s, total: 4min 16s
Wall time: 5min 29s


In [29]:
def save_sparse_csr(filename, sparce_matrix):
    "Writes the CSR components (data, indices, indptr, shape) of a scipy sparse matrix to an .npz file"
    csr_parts = {field: getattr(sparce_matrix, field)
                 for field in ('data', 'indices', 'indptr', 'shape')}
    np.savez(filename, **csr_parts)

def load_sparse_csr(filename):
    "Loads a scipy sparse matrix in csr format from an .npz file written by save_sparse_csr"
    # `sparse.csr_matrix` instead of a bare `csr_matrix`: only `from scipy import sparse` is
    # imported before this cell. The archive is closed once its arrays have been read.
    with np.load(filename) as loader:
        return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                                 shape=loader['shape'])

# Store the stacked TF-IDF matrices of train and test
save_sparse_csr('data/data_tfidf_stem_tags.npz', data_sparse)
save_sparse_csr('data/kagg_tfidf_stem_tags.npz', kagg_sparse)

# ----- END ------

## wordnet similarity

In [None]:
import pandas as pd
import numpy as np
import datetime
import os
import multiprocessing as mp

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MaxAbsScaler

from scipy.sparse import csr_matrix, hstack, vstack

DATA_PATH = 'data/'
# Only the target and the stop-word-free questions are needed for the wordnet experiments;
# missing questions become the placeholder 'xxx'
data = pd.read_csv(os.path.join(DATA_PATH, 'data_feat1.csv'), sep=';',
                   usecols =['target', 'q1_nostops', 'q2_nostops']).fillna('xxx')
# kagg = pd.read_csv(os.path.join(DATA_PATH, 'kagg_feat_nostops.csv'), sep=';', 
#                    usecols =['q1_nostops', 'q2_nostops'])
# kagg.fillna('xxx', inplace=True)

NUM_CORES = 6
def apply_parallel(df, my_func):
    """
    Input: 
        df: pandas DataFrame or pandas Series
        my_func: custom function which will be applied to df. Must accept pandas DataFrame or Series as input.
    Output: concatenated results of function application. Either pandas Series or pandas DataFrame.
    
    df is split into NUM_CORES parts and the function is applied to each part independently.
    """
    df_splitted = np.array_split(df, NUM_CORES)
    pool = mp.Pool(NUM_CORES)
    try:
        parts = pool.map(my_func, df_splitted)
    finally:
        # Release the worker processes even when my_func raises in one of them
        pool.close()
        pool.join()
    return pd.concat(parts)

In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Information-content data loaded from 'ic-brown.dat'; find_similarity passes it to res_similarity
brown_ic = wordnet_ic.ic('ic-brown.dat')

def find_similarity(w1,w2, nouns=False, CUT_VALUE=14.50):
    """
    input:
        w1, w2: words to compare
        nouns: if True, only the noun synsets of both words are used
        CUT_VALUE: upper cap applied to every single res_similarity value
    output: tuple (max_similarity, mean_similarity, max_lch_similarity, mean_lch_similarity,
                   max_res_similarity, mean_res_similarity), each taken over all pairs of synsets
    of the two words. A (max, mean) pair is (0, 0) when no synset pair yields a value.

    path_similarity is evaluated for every synset pair, lch_similarity only for pairs with the same
    part of speech, res_similarity only for such pairs whose part of speech is not 'a', 's' or 'r'.
    """
    if nouns:
        lst1=wn.synsets(w1,pos=wn.NOUN)
        lst2=wn.synsets(w2,pos=wn.NOUN)
    else:
        lst1=wn.synsets(w1)
        lst2=wn.synsets(w2)

    def max_and_mean(values):
        if len(values)==0:
            return 0, 0
        return max(values), np.mean(values)

    similarities_list = []
    lch_similarities_list = []
    res_similarities_list = []
    # Single pass over all synset pairs; every measure is computed once per pair
    # (it used to be computed twice: once in the filter and once for the value)
    for item1 in lst1:
        pos1 = item1.pos()
        for item2 in lst2:
            path = item1.path_similarity(item2)
            if path is not None:
                similarities_list.append(path)

            if pos1 != item2.pos():
                continue
            lch = item1.lch_similarity(item2)
            if lch is not None:
                lch_similarities_list.append(lch)

            if pos1 not in ['a','s','r']:
                res = item1.res_similarity(item2,brown_ic)
                if res is not None:
                    res_similarities_list.append(min(CUT_VALUE, res))

    max_similarity, mean_similarity = max_and_mean(similarities_list)
    max_lch_similarity, mean_lch_similarity = max_and_mean(lch_similarities_list)
    max_res_similarity, mean_res_similarity = max_and_mean(res_similarities_list)

    return max_similarity, mean_similarity, max_lch_similarity, mean_lch_similarity, max_res_similarity, mean_res_similarity

In [None]:
%%time
from time import time

# NOTE(review): find_similarity is also defined in the previous cell; this copy shadows it.
# Consider keeping only one definition.
def find_similarity(w1,w2, nouns=False, CUT_VALUE=14.50):
    """
    input:
        w1, w2: words to compare
        nouns: if True, only the noun synsets of both words are used
        CUT_VALUE: upper cap applied to every single res_similarity value
    output: tuple (max_similarity, mean_similarity, max_lch_similarity, mean_lch_similarity,
                   max_res_similarity, mean_res_similarity), each taken over all pairs of synsets
    of the two words. A (max, mean) pair is (0, 0) when no synset pair yields a value.

    path_similarity is evaluated for every synset pair, lch_similarity only for pairs with the same
    part of speech, res_similarity only for such pairs whose part of speech is not 'a', 's' or 'r'.
    """
    if nouns:
        lst1=wn.synsets(w1,pos=wn.NOUN)
        lst2=wn.synsets(w2,pos=wn.NOUN)
    else:
        lst1=wn.synsets(w1)
        lst2=wn.synsets(w2)

    def max_and_mean(values):
        if len(values)==0:
            return 0, 0
        return max(values), np.mean(values)

    similarities_list = []
    lch_similarities_list = []
    res_similarities_list = []
    # Single pass over all synset pairs; every measure is computed once per pair
    # (it used to be computed twice: once in the filter and once for the value)
    for item1 in lst1:
        pos1 = item1.pos()
        for item2 in lst2:
            path = item1.path_similarity(item2)
            if path is not None:
                similarities_list.append(path)

            if pos1 != item2.pos():
                continue
            lch = item1.lch_similarity(item2)
            if lch is not None:
                lch_similarities_list.append(lch)

            if pos1 not in ['a','s','r']:
                res = item1.res_similarity(item2,brown_ic)
                if res is not None:
                    res_similarities_list.append(min(CUT_VALUE, res))

    max_similarity, mean_similarity = max_and_mean(similarities_list)
    max_lch_similarity, mean_lch_similarity = max_and_mean(lch_similarities_list)
    max_res_similarity, mean_res_similarity = max_and_mean(res_similarities_list)

    return max_similarity, mean_similarity, max_lch_similarity, mean_lch_similarity, max_res_similarity, mean_res_similarity


# Trial run on a small slice of train (.loc slicing includes the row labelled 100)
test = data.loc[:100]
target = test.target
# State shared with wordnet_sim through `global`:
wordnet_sim_dict = dict()  # cache: (w1, w2) -> result of find_similarity
t_start = time()           # start time for the progress printout
counter = 0                # number of wordnet_sim calls so far

def wordnet_sim(q1,q2):
    """
    input: two questions in string format (words separated by single spaces)
    output: pd Series with the six values returned by find_similarity, each averaged over
    all pairs (word of q1, word of q2) of distinct words.

    Word-pair results are cached in the global wordnet_sim_dict (stored for both orders of the pair).
    The global counter is incremented on every call and progress is printed every 1000 calls.
    """
    global wordnet_sim_dict, counter
    counter += 1

    labels = ['max_similarity', 'mean_similarity',
              'max_lch_similarity', 'mean_lch_similarity',
              'max_res_similarity', 'mean_res_similarity']

    words_q2 = set(q2.split(' '))
    results = []
    for w1 in set(q1.split(' ')):
        for w2 in words_q2:
            pair = (w1, w2)
            if pair not in wordnet_sim_dict:
                scores = find_similarity(w1, w2)
                wordnet_sim_dict[pair] = scores
                wordnet_sim_dict[(w2, w1)] = scores
            results.append(wordnet_sim_dict[pair])

    # Fallback for an empty pair list: all features become 1
    if len(results)==0:
        results = np.array([[1]*6,[1]*6])
    else:
        results = np.array(results)

    if counter % 1000 == 0:
        print (counter, len(wordnet_sim_dict), round((time()-t_start)/60,1) ,'minutes')

    return pd.Series([np.mean(results[:, column]) for column in range(len(labels))],
                     index = labels)
# Replace the slice by its six wordnet features and put the target back for the correlation check
test = test.apply(lambda row: wordnet_sim(row.q1_nostops,row.q2_nostops), axis=1)
test['target'] = target

In [None]:
# Pairwise correlations between the wordnet features and the target on the trial slice
test.corr()

In [None]:
# NOTE(review): same expression as the previous cell
test.corr()

In [None]:
# Scratch check: synset lookup for a multi-word phrase
wn.synsets('on the hill')

In [None]:
# Store the word-pair similarity cache on disk.
# NOTE(review): `pickle` is not among the imports of this section's import cell; this relies on
# the `import pickle` in the first cell of the notebook.
with open('data/wordnet_sim_dict.pkl', 'wb') as F:
    pickle.dump(wordnet_sim_dict, F)

In [None]:
%%time
from time import time
#test = data.loc[:10]
# Reset the state shared with wordnet_sim through `global` before the full run:
wordnet_sim_dict = dict()  # cache: (w1, w2) -> result of find_similarity
t_start = time()           # start time for the progress printout
counter = 0                # number of wordnet_sim calls so far

def wordnet_sim(q1,q2):
    """
    input: two questions in string format (words separated by single spaces)
    output: pd Series with the six values returned by find_similarity, each averaged over
    all pairs (word of q1, word of q2) of distinct words.

    Word-pair results are cached in the global wordnet_sim_dict (stored for both orders of the pair).
    The global counter is incremented on every call and progress is printed every 1000 calls.
    """
    global wordnet_sim_dict, counter
    counter += 1

    labels = ['max_similarity', 'mean_similarity',
              'max_lch_similarity', 'mean_lch_similarity',
              'max_res_similarity', 'mean_res_similarity']

    words_q2 = set(q2.split(' '))
    results = []
    for w1 in set(q1.split(' ')):
        for w2 in words_q2:
            pair = (w1, w2)
            if pair not in wordnet_sim_dict:
                scores = find_similarity(w1, w2)
                wordnet_sim_dict[pair] = scores
                wordnet_sim_dict[(w2, w1)] = scores
            results.append(wordnet_sim_dict[pair])
    results = np.array(results)

    if counter % 1000 == 0:
        print (counter, len(wordnet_sim_dict), round((time()-t_start)/60,1) ,'minutes')

    return pd.Series([np.mean(results[:, column]) for column in range(len(labels))],
                     index = labels)
# NOTE(review): this rebinds `data` to the frame of six wordnet features, so the question columns
# are gone afterwards and the cell cannot be re-run without reloading `data`.
data = data.apply(lambda row: wordnet_sim(row.q1_nostops,row.q2_nostops), axis=1)