In [1]:
#Google Colab needs
import os
import sys
def isCollab():
    """Report whether the notebook is running on Google Colab.

    The decision is based solely on whether the ``COLAB_GPU`` environment
    variable is present; its value is not inspected.
    """
    return 'COLAB_GPU' in os.environ

if not isCollab():
    # Local run: make the parent directory importable. The storage path is
    # left unset here and resolved from the working directory further down.
    sys.path.append('..')
    path_to_storage = None
else:
    # Colab run: mount Google Drive, which hosts the project storage.
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_storage = '/content/gdrive/My Drive/UCU-2019-final-project-storage'

    # The custom modules are kept in the same Drive folder; expose it to imports.
    path_to_modules = '/content/gdrive/My Drive/UCU-2019-final-project-storage'
    sys.path.append(path_to_modules)

In [37]:
#in: pickle files with X_train, y_train, X_test, y_test
#out: one pickle file per feature for train and test (len_diff_train_w.p, partial_ratio_test_w.p, etc.)
import numpy as np
import pickle
import pandas as pd
!pip install fuzzywuzzy
!pip install python-Levenshtein
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
import gensim
import nltk
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from utils.func.functions import pickle_and_remove, build_x



In [3]:
# A local run leaves path_to_storage unset; fall back to the "storage"
# directory that sits next to the current working directory.
if not path_to_storage:
    local_storage_dir = os.path.join(os.getcwd(), '../storage')
    path_to_storage = os.path.abspath(local_storage_dir)

# Both folders keep their trailing slash because file names are appended to
# them by plain string concatenation.
data_folder = ''.join((path_to_storage, '/data/'))
serialization_objects_folder = ''.join((path_to_storage, '/serialization_objects/'))

In [4]:
def _load_pickle(file_name):
    """Load one pickled object from ``serialization_objects_folder``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read (the bare ``open(...)`` form leaked it).
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # project produced itself.
    with open(serialization_objects_folder + file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)


X_train = _load_pickle('X_train.p')
y_train = _load_pickle('y_train.p')
X_test = _load_pickle('X_test.p')
y_test = _load_pickle('y_test.p')

In [5]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# Kept as a set: in this notebook `stops` is only used for membership tests
# (`word in stops`) on every token of every question, which is a constant-time
# lookup on a set instead of a linear scan of the stop-word list.
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/denisporplenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/denisporplenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def add_feature(dataset, feature_name, feature):
  """Compute a row-wise feature and store it as a new column, in place.

  dataset: DataFrame that is modified (the column is added or overwritten).
  feature_name: name of the column to write.
  feature: callable applied to every row (``axis=1``); its return value
    becomes the cell value for that row.
  """
  column_values = dataset.apply(feature, axis=1)
  dataset[feature_name] = column_values
 

In [7]:
def len_diff(data):
  """Absolute difference between the character lengths of the two questions."""
  first = str(data['question1'])
  second = str(data['question2'])
  return abs(len(first) - len(second))

def token_sort_ratio(data):
  """fuzzywuzzy ``token_sort_ratio`` between the two questions of a row."""
  first, second = str(data['question1']), str(data['question2'])
  return fuzz.token_sort_ratio(first, second)


def token_set_ratio(data):
  """fuzzywuzzy ``token_set_ratio`` between the two questions of a row."""
  first, second = str(data['question1']), str(data['question2'])
  return fuzz.token_set_ratio(first, second)


def partial_ratio(data):
  """fuzzywuzzy ``partial_ratio`` between the two questions of a row."""
  first, second = str(data['question1']), str(data['question2'])
  return fuzz.partial_ratio(first, second)

def intersection_ratio(data):
  """Share of non-stop-word tokens that the two questions have in common.

  Both questions are lower-cased and split on whitespace; tokens found in the
  global ``stops`` collection are dropped. Every remaining occurrence of a
  word that also appears in the other question counts as shared, and the
  result is (shared occurrences in q1 + shared occurrences in q2) divided by
  the total number of remaining occurrences in both questions.

  Returns a float in [0, 1]. When neither question has any non-stop-word
  token the ratio is undefined, and 0.0 is returned instead of raising
  ZeroDivisionError.
  """
  from collections import Counter  # local import keeps this cell self-contained

  q1 = Counter(w for w in str(data['question1']).lower().split() if w not in stops)
  q2 = Counter(w for w in str(data['question2']).lower().split() if w not in stops)

  total = sum(q1.values()) + sum(q2.values())
  if total == 0:
    # Nothing but stop words (or empty text) on both sides: no overlap to measure.
    return 0.0

  q1_shared_count = sum(count for word, count in q1.items() if word in q2)
  q2_shared_count = sum(count for word, count in q2.items() if word in q1)
  return (q1_shared_count + q2_shared_count) / total



def token_ratio(data):
  """fuzzywuzzy ``ratio`` between the two questions of a row."""
  first, second = str(data['question1']), str(data['question2'])
  return fuzz.ratio(first, second)

def jaccard_distance(data):
  """nltk Jaccard distance between the two questions of a row.

  Each question is turned into the set of its characters (``set(str(...))``),
  so the distance is computed over characters rather than words.
  """
  first_chars = set(str(data['question1']))
  second_chars = set(str(data['question2']))
  return nltk.jaccard_distance(first_chars, second_chars)
def n_question_marks_diff(data):
  """Absolute difference in the number of '?' characters in the two questions."""
  first_count = str(data['question1']).count('?')
  second_count = str(data['question2']).count('?')
  return abs(first_count - second_count)


def n_capital_letters_diff(data):
  """Absolute difference in the number of upper-case characters in the two questions."""
  first_count = sum(1 for ch in str(data['question1']) if ch.isupper())
  second_count = sum(1 for ch in str(data['question2']) if ch.isupper())
  return abs(first_count - second_count)




In [8]:
#Download GoogleNews-vectors-negative300.bin
path_to_google_news_model = data_folder+'GoogleNews-vectors-negative300.bin'
if not os.path.isfile(path_to_google_news_model):
    !wget -P "$data_folder" -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 

In [9]:
# Load the GoogleNews word2vec vectors (binary format); sent2vec looks
# individual words up in this object.
model = gensim.models.KeyedVectors.load_word2vec_format(
    path_to_google_news_model,
    binary=True,
)

In [10]:
def sent2vec(s):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens
    that are in ``stops`` or are not purely alphabetic are dropped. Each
    remaining token is looked up in the global ``model`` and tokens the model
    does not know are skipped.

    Returns the summed vector divided by its Euclidean norm. If no token
    survives, the sum is the scalar 0.0 and the result is a scalar NaN; if the
    summed vector has zero norm the result is an all-NaN array. These NaN
    results are intentional (callers in this notebook zero them with
    ``np.nan_to_num``), so the 0/0 warning is suppressed.
    """
    tokens = nltk.word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stops and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so
            # that real failures (and KeyboardInterrupt) are not swallowed.
            continue

    v = np.array(vectors).sum(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        return v / np.sqrt((v ** 2).sum())

In [11]:
def createVectors(data, dim=300):
  """Build sentence-vector matrices for both question columns of ``data``.

  data: DataFrame with ``question1`` and ``question2`` columns.
  dim: width of each sentence vector; must match what ``sent2vec`` returns
    (300 by default, as before).

  Returns ``(q1_vecs, q2_vecs)``, two arrays of shape ``(len(data), dim)``
  whose i-th row is ``sent2vec`` of the i-th question. Rows can be NaN when
  ``sent2vec`` returns NaN.
  """
  def _embed(questions):
    # One row per question, filled in order.
    vecs = np.zeros((len(questions), dim))
    # tqdm wraps the array itself (not the enumerate generator) so it knows
    # the total and can show real progress.
    for i, q in enumerate(tqdm(questions)):
      vecs[i, :] = sent2vec(q)
    return vecs

  q1_vecs = _embed(data.question1.values)
  q2_vecs = _embed(data.question2.values)
  return q1_vecs, q2_vecs

In [12]:
# Sentence vectors for both question columns of the training split.
q1_vecs_train, q2_vecs_train = createVectors(X_train)

  
270872it [00:54, 4930.88it/s]
270872it [00:52, 5157.91it/s]


In [13]:
# Sentence vectors for both question columns of the test split.
q1_vecs_test, q2_vecs_test = createVectors(X_test)

  
133415it [00:25, 5155.24it/s]
133415it [00:26, 5053.35it/s]


In [14]:
def add_distances(data, q1v, q2v):
    """Add one distance column per metric between paired sentence vectors.

    data: DataFrame modified in place; row i receives the distances between
      ``q1v[i]`` and ``q2v[i]``.
    q1v, q2v: 2-D arrays of sentence vectors, one row per row of ``data``.
      NaN entries are replaced with 0 (``np.nan_to_num``) before any distance
      is computed.

    Columns written, in this order: cosine_distance, cityblock_distance,
    jaccard_distance, canberra_distance, euclidean_distance,
    minkowski_distance (p=3), braycurtis_distance.

    NOTE(review): later cells call
    ``add_feature(..., 'jaccard_distance', jaccard_distance)``, which
    overwrites the 'jaccard_distance' column written here.
    """
    # Clean each input once instead of once per metric.
    q1_clean = np.nan_to_num(q1v)
    q2_clean = np.nan_to_num(q2v)

    metrics = [
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis_distance', braycurtis),
    ]
    for column, metric in metrics:
        data[column] = [metric(x, y) for (x, y) in zip(q1_clean, q2_clean)]

# Add the vector-distance columns to the training frame (modifies X_train in place).
add_distances(X_train, q1_vecs_train, q2_vecs_train)



  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


In [16]:
# Add the same vector-distance columns to the test frame (modifies X_test in place).
add_distances(X_test, q1_vecs_test, q2_vecs_test)

In [17]:
# Drop the word2vec model and the sentence-vector arrays; the cells below do not use them.
del model, q1_vecs_train, q2_vecs_train, q1_vecs_test, q2_vecs_test

In [18]:
# Text-based features for the training frame, applied in a fixed order.
# Each entry is (column name, row-wise feature function).
for feature_name, feature in [
    ("len_diff", len_diff),
    ("token_sort_ratio", token_sort_ratio),
    ('token_ratio', token_ratio),
    ("intersection_ratio", intersection_ratio),
    ("token_set_ratio", token_set_ratio),
    ("partial_ratio", partial_ratio),
    ('jaccard_distance', jaccard_distance),
    ("n_capital_letters_diff", n_capital_letters_diff),
    ('n_question_marks_diff', n_question_marks_diff),
]:
    add_feature(X_train, feature_name, feature)

In [19]:
# Text-based features for the test frame, applied in a fixed order.
# Each entry is (column name, row-wise feature function).
for feature_name, feature in [
    ("len_diff", len_diff),
    ("token_sort_ratio", token_sort_ratio),
    ('token_ratio', token_ratio),
    ("intersection_ratio", intersection_ratio),
    ("token_set_ratio", token_set_ratio),
    ("partial_ratio", partial_ratio),
    ('jaccard_distance', jaccard_distance),
    ("n_capital_letters_diff", n_capital_letters_diff),
    ('n_question_marks_diff', n_question_marks_diff),
]:
    add_feature(X_test, feature_name, feature)

In [21]:
# Preview the first 10 training rows, including the feature columns added above.
X_train.head(10)

Unnamed: 0_level_0,qid1,qid2,question1,question2,cosine_distance,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,len_diff,token_sort_ratio,token_ratio,intersection_ratio,token_set_ratio,partial_ratio,n_capital_letters_diff,n_question_marks_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
243973,356509,356510,How does airbnb screen its host?,When as an Airbnb host can I review my guests?,0.512372,13.970236,0.416667,175.627457,1.012297,0.456996,0.585159,14,55,51,0.285714,58,50,2,0
82523,123111,45893,How can I learn hacking for free?,How can I learn hacking for security purposes?,0.291997,10.556866,0.227273,149.853963,0.764195,0.343893,0.40883,13,70,78,0.571429,92,88,0,0
373083,41716,2986,How can I speak fluent English with accuracy?,How do I speak English like celebrities?,0.345266,11.343897,0.2,156.346549,0.830983,0.376946,0.452207,5,55,61,0.5,66,60,0,0
145241,86221,51226,What are the best books for UPSC?,Which are the best books to prepare for IAS exam?,0.346353,11.238465,0.416667,150.825298,0.83229,0.379203,0.441895,16,65,68,0.5,81,73,1,0
227393,336229,302258,Why do smart people have to ask questions on Q...,Why do people ask questions on Quora?,0.088301,5.967858,0.090909,109.991955,0.42024,0.185252,0.224701,14,84,84,0.888889,100,78,0,0
25000,46635,46636,"Does the end justify the means, or does the me...",Does the end justify the means?,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,35,64,64,0.555556,100,97,0,0
8308,16202,16203,How do I set up my VPN?,How do you set up a VPN?,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,1,76,85,1.0,89,83,1,0
152489,239573,239574,What is viscous fluid?,Which is the least viscous fluid?,0.068753,5.087406,0.0625,91.663114,0.370818,0.16766,0.185664,11,72,73,0.8,86,77,0,0
283624,14110,36847,How does the ban on 500 and 1000 rupee notes h...,How will the India demonetization of 500 and 1...,0.310763,10.765844,0.071429,146.046414,0.78837,0.359624,0.415542,3,65,63,0.421053,70,63,1,0
308266,92119,432073,What should we lean for hacking?,What does lean do?,0.36531,11.902436,0.4,158.105356,0.854764,0.380901,0.470702,14,50,60,0.5,69,61,0,0


In [35]:
# Every column except the ids and the raw question text is a feature.
non_feature_columns = ['qid1', 'qid2', 'question1', 'question2']
features = [
    column
    for column in X_train.columns.tolist()
    if column not in non_feature_columns
]

In [38]:
# Hand each feature column of both splits to pickle_and_remove, one object per
# feature and split, named "<feature>_train_w" / "<feature>_test_w".
# NOTE(review): pickle_and_remove is a project helper; presumably it writes
# "<name>.p" into the given folder -- confirm against utils.func.functions.
for feature in features:
    print(feature)
    train_name = "%s_train_w"%(feature)
    test_name = "%s_test_w"%(feature)
    pickle_and_remove(X_train[feature], train_name, serialization_objects_folder)
    pickle_and_remove(X_test[feature], test_name, serialization_objects_folder)

cosine_distance
cityblock_distance
jaccard_distance
canberra_distance
euclidean_distance
minkowski_distance
braycurtis_distance
len_diff
token_sort_ratio
token_ratio
intersection_ratio
token_set_ratio
partial_ratio
n_capital_letters_diff
n_question_marks_diff


In [39]:
# List the serialization folder to check which pickle files are present.
ls "$serialization_objects_folder"

1_train.p                         euclidean_train_w.p
X_test.p                          hausdorff_test_w.p
X_test_q1_tfidf.p                 hausdorff_train_w.p
X_test_q1_w2v_vect.p              intersection_ratio_test_w.p
X_test_q2_tfidf.p                 intersection_ratio_train_w.p
X_test_q2_w2v_vect.p              jaccard_distance_test_w.p
X_train.p                         jaccard_distance_train_w.p
X_train_q1_tfidf.p                l1_test_w.p
X_train_q1_w2v_vect.p             l1_train_w.p
X_train_q2_tfidf.p                l2_test_w.p
X_train_q2_w2v_vect.p             l2_train_w.p
braycurtis_distance_test_w.p      len_diff_test_w.p
braycurtis_distance_train_w.p     len_diff_train_w.p
braycurtis_test_w.p               manhattan_test_w.p
braycurtis_train_w.p              manhattan_train_w.p
canberra_distance_test_w.p        minkowski_distance_test_w.p
canberra_distance_train_w.p       minkowski_distance_train_w.p
canberra_test_w.p                 minkowski_test_w.p

In [46]:
# Drop the train/test frames; their feature columns were passed to pickle_and_remove above.
del X_train, X_test