In [0]:
import numpy as np
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces, remove_stopwords, stem_text
from gensim.models import Word2Vec
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
from fastdtw import fastdtw
#import similaritymeasures
from scipy.spatial import procrustes
import sys
import os

def isCollab():
  """Report whether the ``COLAB_GPU`` environment variable is set.

  The notebook uses the presence of this variable (whatever its value,
  including an empty string) as the signal that it runs on Google Colab.

  :return: True when ``COLAB_GPU`` exists in the environment, else False
  """
  # Identity comparison is the idiomatic (PEP 8) way to test against None.
  return os.environ.get('COLAB_GPU') is not None


In [20]:
# Default (local) storage location: a `storage` directory next to the
# current working directory.
path_to_storage = os.path.abspath(os.path.join(os.getcwd(), '../storage'))

if isCollab():
  # On Google Colab: mount Google Drive and point the storage path at the
  # project folder inside it.
  from google.colab import drive
  drive.mount('/content/gdrive')
  path_to_storage = '/content/gdrive/My Drive/UCU-2019-final-project-storage'
  # Workaround: the storage folder is added to sys.path so that project
  # modules stored there can be imported.
  sys.path.append(path_to_storage)
else:
  # Locally: make the parent directory importable instead.
  sys.path.append('..')

# NOTE(review): this imports from `utils.func.functions`, while a later cell
# imports from `func.functions` — confirm which package path is correct.
from utils.func.functions import pickle_and_remove
  
  
# Folders (with trailing slash) used by the rest of the notebook.
data_folder = path_to_storage+'/data/'
serialization_objects_folder = path_to_storage+'/serialization_objects/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Preprocessing

In [0]:
from func.functions import build_x

In [0]:
# gensim preprocessing steps applied, in this order, to every question string.
filters = [strip_tags, strip_multiple_whitespaces, remove_stopwords, stem_text]
def tokenize(data_type='train'):
    """Lazily yield the preprocessed token list of every question in a split.

    The pickled frame for the split is loaded, all ``question1`` entries are
    stacked on top of all ``question2`` entries, and each entry is passed
    through ``preprocess_string`` with the module-level ``filters``.

    One item is yielded per stacked entry (``2 * len(X)`` in total) and no
    entry is ever dropped. The earlier ``series.dropna()`` call discarded its
    result and therefore removed nothing; it has been deleted rather than
    "fixed" so the output stays aligned row-for-row with the stacked input.

    :param data_type: ``'test'`` loads ``X_test.p``; any other value loads
        ``X_train.p``
    :return: generator of token lists
    """
    file_name = 'X_test.p' if data_type == 'test' else 'X_train.p'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The `with` block closes the file handle as soon as the frame is loaded.
    with open(serialization_objects_folder + file_name, 'rb') as fh:
        X = pickle.load(fh)
    series = pd.Series(pd.concat([X['question1'], X['question2']]), dtype=str)
    for question in series:
        yield preprocess_string(question, filters)

In [0]:
def tokenize_for_model(model, data_type):
    """
    Tokenize every question of a split, keeping only tokens the model knows.

    :param model: object whose ``wv`` attribute supports ``wv[token]`` lookup
        and raises ``KeyError`` for tokens outside its vocabulary
    :param data_type: forwarded unchanged to ``tokenize``
    :return: generator yielding one ndarray of token strings per question
        (an empty array when no token of the question is known)
    """
    for question in tokenize(data_type=data_type):
        known_tokens = []
        for token in question:
            try:
                # The lookup is performed only to test vocabulary membership.
                model.wv[token]
            except KeyError:
                # Unknown token: skip it. Any other error is a real problem
                # and is allowed to propagate instead of being swallowed.
                continue
            known_tokens.append(token)
        yield np.array(known_tokens)

## Feature Extraction

#### Embedding: word2vec (Transfer-train on training data)

In [0]:
# Materialize the token lists of all train questions so they can be iterated more than once.
tokenized_questions = list(tokenize(data_type='train'))

In [0]:
# Build a word2vec model with 300-dimensional vectors from the tokenized train questions.
# NOTE(review): no `seed` is passed, so results may differ between runs — confirm whether reproducibility matters.
# NOTE(review): `size` looks like the gensim 3.x keyword (gensim 4 uses `vector_size`) — confirm the pinned version.
model_w2v = Word2Vec(tokenized_questions, size=300)

In [0]:
# Merge the pre-trained GoogleNews vectors (binary word2vec format) into the model.
# NOTE(review): per the gensim documentation only words already in the model's
# vocabulary adopt the file's vectors, and lockf=1.0 leaves them trainable — confirm.
model_w2v.intersect_word2vec_format(data_folder+'GoogleNews-vectors-negative300.bin',
                                lockf=1.0,
                                binary=True)

In [0]:
# Train for 10 more epochs on the train questions, after the pre-trained vectors were merged in.
model_w2v.train(tokenized_questions,total_examples=model_w2v.corpus_count, epochs=10)

(28526660, 35550940)

### Feature Sets 1 - Pairwise Distance & word2vec Vectors

#### Train - TF-IDF Vectorizer and TF-IDF Weights + word2vec Vectors


In [0]:
# Load the train frame; the `with` block closes the file handle right away.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(serialization_objects_folder+'X_train.p', 'rb') as fh:
    X_train = pickle.load(fh)

In [0]:
# The questions arrive already tokenized, so the analyzer hands each token
# sequence back unchanged instead of re-tokenizing it.
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
# Fit on every train question; tokenize() emits all question1 entries first,
# followed by all question2 entries.
X_tfidf_all_q = tfidf.fit_transform(tokenize_for_model(model=model_w2v,data_type='train'))
# Split the stacked matrix back into its question1 and question2 halves.
X_q1_tfidf = X_tfidf_all_q[:len(X_train)]
X_q2_tfidf = X_tfidf_all_q[len(X_train):]

In [0]:
#X1_q1_tfidf[0] - sparsed vector with float (tfidf)
X_q1_tfidf.shape, X_q2_tfidf.shape, X_train.shape, X_q1_tfidf[0,X_q1_tfidf[0].todense().nonzero()[1]].todense()

((270872, 31889),
 (270872, 31889),
 (270872, 4),
 matrix([[0.56637731, 0.66474651, 0.14063621, 0.46642286]]))

In [0]:
def _to_object_array(items):
    """Pack a list of per-document arrays into a 1-D ``dtype=object`` array."""
    packed = np.empty(len(items), dtype=object)
    # Element-wise assignment keeps every item intact; letting NumPy infer
    # the shape would fail (or merge dimensions) depending on item lengths.
    for idx, item in enumerate(items):
        packed[idx] = item
    return packed


def get_weights_and_w2vectors(tfidf_matrix, tfidf_vectorizer, model):
    """Collect, per document, the TF-IDF weights and word2vec vectors of its tokens.

    For every row of ``tfidf_matrix`` the columns holding a non-zero value are
    looked up; the values become that document's weights and the matching
    vocabulary tokens are converted to vectors through ``model.wv``.

    Documents usually contain different numbers of tokens, so the per-document
    arrays are ragged. Both results are therefore always returned as 1-D
    object arrays (one element per document); calling ``np.array`` on the
    ragged lists raises ``ValueError`` on recent NumPy versions.

    :param tfidf_matrix: 2-D sparse TF-IDF matrix (documents x vocabulary)
    :param tfidf_vectorizer: fitted vectorizer exposing ``vocabulary_``
        (token -> column index)
    :param model: object whose ``wv[token]`` returns the token's vector
    :return: tuple ``(weights, vectors)``; ``weights[i]`` holds the non-zero
        TF-IDF values of document ``i`` and ``vectors[i]`` the vectors of the
        same tokens in the same order (both empty for a document without
        non-zero entries)
    """
    weights = []
    vectors = []
    # Column index -> token, the inverse of the vectorizer's vocabulary.
    inverse_vocab_dict = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}
    for doc in range(tfidf_matrix.shape[0]):
        features = tfidf_matrix[doc, :].nonzero()[1]
        weights.append(np.array([tfidf_matrix[doc, x] for x in features]))
        vectors.append(np.array([model.wv[inverse_vocab_dict[x]] for x in features]))
    return _to_object_array(weights), _to_object_array(vectors)

##### Train set

In [0]:
# Replace the train TF-IDF matrices by per-question weight arrays and collect the matching word2vec vectors.
# NOTE(review): X_q1_tfidf / X_q2_tfidf are rebound from the matrices to the
# weight arrays, so re-running this cell alone would feed the weight arrays
# back in as if they were matrices.
X_q1_tfidf, X_q1_w2v_vect = get_weights_and_w2vectors(X_q1_tfidf, tfidf, model_w2v)
X_q2_tfidf, X_q2_w2v_vect = get_weights_and_w2vectors(X_q2_tfidf, tfidf, model_w2v)

In [0]:
#first row
X_q1_tfidf[0].shape, X_q1_w2v_vect[0].shape

((4,), (4, 300))

In [0]:
# Persist the train-set TF-IDF weights and word2vec vectors. Each file is
# written inside a `with` block so it is flushed and closed immediately
# instead of whenever the handle happens to be garbage-collected.
for _file_name, _obj in (
    ('X_train_q1_tfidf.p', X_q1_tfidf),
    ('X_train_q2_tfidf.p', X_q2_tfidf),
    ('X_train_q1_w2v_vect.p', X_q1_w2v_vect),
    ('X_train_q2_w2v_vect.p', X_q2_w2v_vect),
):
    with open(serialization_objects_folder+_file_name, 'wb') as _fh:
        pickle.dump(_obj, _fh)

In [0]:
del X_q1_tfidf, X_q1_w2v_vect, X_q2_tfidf, X_q2_w2v_vect, X_train

##### Test set

In [0]:
# Load the test frame; the `with` block closes the file handle right away.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(serialization_objects_folder+'X_test.p', 'rb') as fh:
    X_test = pickle.load(fh)

In [0]:
# Transform the test questions with the vectorizer fitted on the train set
# (transform only, no re-fitting).
X_tfidf_all_q = tfidf.transform(tokenize_for_model(model=model_w2v,data_type='test'))
# Split the stacked matrix back into its question1 and question2 halves;
# tokenize() emits all question1 entries before all question2 entries.
X_q1_tfidf = X_tfidf_all_q[:len(X_test)]
X_q2_tfidf = X_tfidf_all_q[len(X_test):]

In [0]:
#X1_q1_tfidf[0] - sparsed vector with float (tfidf)
X_q1_tfidf.shape, X_q2_tfidf.shape, X_test.shape, X_q1_tfidf[0,X_q1_tfidf[0].todense().nonzero()[1]].todense()

((133415, 31889),
 (133415, 31889),
 (133415, 4),
 matrix([[0.41021448, 0.3659877 , 0.61066633, 0.56996817]]))

In [0]:
# Replace the test TF-IDF matrices by per-question weight arrays and collect the matching word2vec vectors.
# NOTE(review): X_q1_tfidf / X_q2_tfidf are rebound from the matrices to the
# weight arrays, so re-running this cell alone would feed the weight arrays
# back in as if they were matrices.
X_q1_tfidf, X_q1_w2v_vect = get_weights_and_w2vectors(X_q1_tfidf, tfidf, model_w2v)
X_q2_tfidf, X_q2_w2v_vect = get_weights_and_w2vectors(X_q2_tfidf, tfidf, model_w2v)

In [0]:
#first row
X_q1_tfidf[0].shape, X_q1_w2v_vect[0].shape

((4,), (4, 300))

In [0]:
# Persist the test-set TF-IDF weights and word2vec vectors. Each file is
# written inside a `with` block so it is flushed and closed immediately
# instead of whenever the handle happens to be garbage-collected.
for _file_name, _obj in (
    ('X_test_q1_tfidf.p', X_q1_tfidf),
    ('X_test_q2_tfidf.p', X_q2_tfidf),
    ('X_test_q1_w2v_vect.p', X_q1_w2v_vect),
    ('X_test_q2_w2v_vect.p', X_q2_w2v_vect),
):
    with open(serialization_objects_folder+_file_name, 'wb') as _fh:
        pickle.dump(_obj, _fh)

In [0]:
!ls "$serialization_objects_folder"

X_test.p              X_train.p             readme
X_test_q1_tfidf.p     X_train_q1_tfidf.p    y_test.p
X_test_q1_w2v_vect.p  X_train_q1_w2v_vect.p y_train.p
X_test_q2_tfidf.p     X_train_q2_tfidf.p
X_test_q2_w2v_vect.p  X_train_q2_w2v_vect.p


In [0]:
del X_q1_tfidf, X_q1_w2v_vect, X_q2_tfidf, X_q2_w2v_vect, X_test

In [0]:
del model_w2v

#### Pairwise Distances & Weighted Means

In [0]:
def create_nan_array(r,c):
    """Return a float array of shape ``(r, c)`` in which every entry is NaN."""
    return np.full((r, c), np.nan)
def compute_pairwise_kernel(pc1, pc2, w1, w2, method='linear'):
    """Weighted average of a pairwise kernel between two sets of vectors.

    :param pc1: array of vectors for the first question (one row per token)
    :param pc2: array of vectors for the second question
    :param w1: weights matching the rows of ``pc1``
    :param w2: weights matching the rows of ``pc2``
    :param method: ``'polynomial'`` (degree 2), ``'rbf'``, ``'sigmoid'`` or
        ``'laplacian'``; any other value selects the linear kernel
    :return: NaN when either vector set is empty, otherwise the average of
        the kernel matrix where entry ``(i, j)`` is weighted by
        ``w1[i] * w2[j]``
    """
    if 0 in (pc1.size, pc2.size):
        return np.nan

    if method == 'polynomial':
        # Third positional argument is the polynomial degree.
        kernel_values = polynomial_kernel(pc1, pc2, 2)
    else:
        kernel_fn = {
            'rbf': rbf_kernel,
            'sigmoid': sigmoid_kernel,
            'laplacian': laplacian_kernel,
        }.get(method, linear_kernel)
        kernel_values = kernel_fn(pc1, pc2)

    # Outer product gives one weight per (row of pc1, row of pc2) pair.
    pair_weights = np.outer(w1, w2)
    return np.average(kernel_values, weights=pair_weights)

def compute_pairwise_dist(pc1, pc2, w1, w2, method='euclidean'):
    """Distance between two sets of vectors.

    :param pc1: array of vectors for the first question (one row per token)
    :param pc2: array of vectors for the second question
    :param w1: weights matching the rows of ``pc1``
    :param w2: weights matching the rows of ``pc2``
    :param method: ``'hausdorff'`` or a metric name forwarded to
        ``pairwise_distances``
    :return: NaN when either vector set is empty; for ``'hausdorff'`` the
        directed Hausdorff distance from ``pc1`` to ``pc2`` (weights are not
        used); otherwise the average of the pairwise distance matrix where
        entry ``(i, j)`` is weighted by ``w1[i] * w2[j]``
    """
    if not (pc1.size and pc2.size):
        return np.nan

    if method == 'hausdorff':
        # Only the distance is needed, not the indices of the contributing points.
        hausdorff_distance = directed_hausdorff(pc1, pc2)[0]
        return hausdorff_distance

    distance_matrix = pairwise_distances(pc1, pc2, metric=method)
    pair_weights = np.outer(w1, w2)
    return np.average(distance_matrix, weights=pair_weights)

def compute_pairwise_for_dataset(X1, X2, X1_w, X2_w, method):
    """Compute one pairwise metric for every question pair of a dataset.

    The four inputs are iterated in lockstep (iteration stops at the shortest
    one). Kernel names are routed to ``compute_pairwise_kernel``, every other
    name to ``compute_pairwise_dist``; empty questions are turned into NaN by
    those functions.

    :param X1: per-question vector arrays of the first questions
    :param X2: per-question vector arrays of the second questions
    :param X1_w: per-question weight arrays matching ``X1``
    :param X2_w: per-question weight arrays matching ``X2``
    :param method: kernel or distance name
    :return: list with one value per question pair
    """
    # Loop-invariant, so decided once. The former `if q_tuple:` test was
    # removed: zip always yields a non-empty 4-tuple, which is always truthy,
    # so its else-branch could never run.
    use_kernel = method in ('polynomial', 'rbf', 'sigmoid', 'laplacian', 'linear')
    results = []
    for q1_rd, q2_rd, q1_w, q2_w in zip(X1, X2, X1_w, X2_w):
        if use_kernel:
            results.append(compute_pairwise_kernel(q1_rd, q2_rd, q1_w, q2_w, method))
        else:
            results.append(compute_pairwise_dist(q1_rd, q2_rd, q1_w, q2_w, method))
    return results

def compute_pairwise_for_dataset_wmean(X, X_w, file, store_folder, vector_size=300):
    """Compute the weighted mean vector of every question and store the result.

    Questions whose weights sum to zero (including questions without any
    weight) get a ``(1, vector_size)`` array of NaN instead of a mean.

    NOTE(review): ``compute_weighted_mean`` is not defined in the visible part
    of the notebook — confirm it is available before this function is called.

    :param X: per-question vector arrays
    :param X_w: per-question weight arrays matching ``X``
    :param file: name passed on to ``pickle_and_remove``
    :param store_folder: folder passed on to ``pickle_and_remove``
    :param vector_size: width of the NaN placeholder row; defaults to 300, the
        value that used to be hard-coded
    :return: None; the stacked result is handed to ``pickle_and_remove``
    """
    rows = []
    # zip always yields a non-empty 2-tuple, so the former `if q_tuple:` test
    # (and its unreachable else-branch) has been dropped.
    for q_rd, q_w in zip(X, X_w):
        if np.sum(q_w) != 0:
            rows.append(compute_weighted_mean(q_rd, q_w))
        else:
            rows.append(create_nan_array(1, vector_size))
    pickle_and_remove(np.array(rows), file, store_folder)

# Computes the pairwise metric for every question pair and saves the result to store_folder.
def compute_and_save(X1, X2, X1_w, X2_w, method, file, store_folder):
    """Compute ``method`` for every question pair and hand the result to ``pickle_and_remove``.

    :param X1: per-question vector arrays of the first questions
    :param X2: per-question vector arrays of the second questions
    :param X1_w: per-question weight arrays matching ``X1``
    :param X2_w: per-question weight arrays matching ``X2``
    :param method: kernel or distance name
    :param file: name passed on to ``pickle_and_remove``
    :param store_folder: folder passed on to ``pickle_and_remove``
    """
    pickle_and_remove(
        compute_pairwise_for_dataset(X1, X2, X1_w, X2_w, method),
        file,
        store_folder,
    )

In [0]:
# Metric names computed for every question pair. 'hausdorff' is handled by
# scipy's directed_hausdorff; every other name here is forwarded to
# pairwise_distances as its `metric`.
# NOTE(review): several names look like aliases of the same metric
# ('cityblock' / 'l1' / 'manhattan', 'euclidean' / 'l2') — confirm the
# duplicated features are intended.
distances = ['chebyshev','braycurtis', 'cosine', 'correlation', 'canberra', 'hausdorff', 'cityblock',
            'euclidean', 'l1', 'l2', 'manhattan', 'minkowski', 'sqeuclidean']

def compute_and_save_for_all(X1, X2, X1_w, X2_w, distance, data_type, store_folder):
    """Compute one metric for a split and store it as ``<distance>_<data_type>_w``.

    The metric name is printed as a progress marker before the computation
    starts.

    :param X1: per-question vector arrays of the first questions
    :param X2: per-question vector arrays of the second questions
    :param X1_w: per-question weight arrays matching ``X1``
    :param X2_w: per-question weight arrays matching ``X2``
    :param distance: metric name, also used as the first part of the stored name
    :param data_type: split label used as the second part of the stored name
    :param store_folder: folder passed on to ``compute_and_save``
    """
    target_name = "%s_%s_w" % (distance, data_type)
    print(distance)
    compute_and_save(
        X1, X2, X1_w, X2_w,
        distance,
        target_name,
        store_folder,
    )

##### Train set

In [0]:
# Load the train-set TF-IDF weights (X1_w, X2_w) and word2vec vectors (X1, X2).
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(serialization_objects_folder+'X_train_q1_tfidf.p','rb') as fh:
    X1_w = pickle.load(fh)
with open(serialization_objects_folder+'X_train_q2_tfidf.p','rb') as fh:
    X2_w = pickle.load(fh)
with open(serialization_objects_folder+'X_train_q1_w2v_vect.p','rb') as fh:
    X1 = pickle.load(fh)
with open(serialization_objects_folder+'X_train_q2_w2v_vect.p','rb') as fh:
    X2 = pickle.load(fh)

In [0]:
X1_w[0].shape,X1[0].shape,X2_w[0].shape,X2[0].shape,

((4,), (4, 300), (6,), (6, 300))

In [0]:
# Run for train: compute and store every metric in `distances`, then the
# weighted means of both question sets.
data_type = 'train'
store_folder = serialization_objects_folder

for distance in distances:
    compute_and_save_for_all(X1, X2, X1_w, X2_w, distance, data_type, store_folder)

# NOTE(review): compute_and_save_mean_for_all is not defined in the visible
# part of the notebook — confirm it exists, otherwise these calls raise
# NameError on a fresh kernel.
compute_and_save_mean_for_all('weighted_mean1', X1,X1_w, data_type, store_folder)
compute_and_save_mean_for_all('weighted_mean2', X2,X2_w, data_type, store_folder)

chebyshev
braycurtis
cosine
correlation
canberra
hausdorff
cityblock
euclidean
l1
l2
manhattan
minkowski
sqeuclidean
sqeuclidean
sqeuclidean


In [0]:
del X1_w, X2_w, X1, X2

##### Test set

In [0]:
# Load the test-set TF-IDF weights (X1_w, X2_w) and word2vec vectors (X1, X2).
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(serialization_objects_folder+'X_test_q1_tfidf.p','rb') as fh:
    X1_w = pickle.load(fh)
with open(serialization_objects_folder+'X_test_q2_tfidf.p','rb') as fh:
    X2_w = pickle.load(fh)
with open(serialization_objects_folder+'X_test_q1_w2v_vect.p','rb') as fh:
    X1 = pickle.load(fh)
with open(serialization_objects_folder+'X_test_q2_w2v_vect.p','rb') as fh:
    X2 = pickle.load(fh)

In [0]:
X1_w[0].shape,X1[0].shape,X2_w[0].shape,X2[0].shape,

((4,), (4, 300), (5,), (5, 300))

In [0]:
# Run for test: compute and store every metric in `distances`.
# NOTE(review): unlike the train run, no weighted means are computed here —
# confirm whether that is intended.
data_type = 'test'
store_folder = serialization_objects_folder

for distance in distances:
    compute_and_save_for_all(X1, X2, X1_w, X2_w, distance, data_type, store_folder)

chebyshev
braycurtis
cosine
correlation
canberra
hausdorff
cityblock
euclidean
l1
l2
manhattan
minkowski
sqeuclidean
sqeuclidean
sqeuclidean


In [0]:
!ls "$serialization_objects_folder"

1_train.p              canberra_train_w.p     l1_train_w.p
X_test.p               chebyshev_test_w.p     l2_test_w.p
X_test_q1_tfidf.p      chebyshev_train_w.p    l2_train_w.p
X_test_q1_w2v_vect.p   cityblock_test_w.p     manhattan_test_w.p
X_test_q2_tfidf.p      cityblock_train_w.p    manhattan_train_w.p
X_test_q2_w2v_vect.p   correlation_test_w.p   minkowski_test_w.p
X_train.p              correlation_train_w.p  minkowski_train_w.p
X_train_q1_tfidf.p     cosine_test_w.p        readme
X_train_q1_w2v_vect.p  cosine_train_w.p       sqeuclidean_test_w.p
X_train_q2_tfidf.p     euclidean_test_w.p     sqeuclidean_train_w.p
X_train_q2_w2v_vect.p  euclidean_train_w.p    weighted_mean2_train.p
braycurtis_test_w.p    hausdorff_test_w.p     y_test.p
braycurtis_train_w.p   hausdorff_train_w.p    y_train.p
canberra_test_w.p      l1_test_w.p
