In [1]:
# Google Colab support: detect the runtime and set up storage and module paths
import os
import sys
def isCollab():
    """Return True when the ``COLAB_GPU`` environment variable is set.

    The notebook uses the presence of that variable (any value, including an
    empty string) as its signal that it is running on Google Colab.
    """
    # Identity test is the idiomatic way to compare against None.
    return os.environ.get('COLAB_GPU') is not None

if not isCollab():
    # Local run: custom modules are importable from the parent directory, and
    # the storage location is left unset so it can be resolved later.
    sys.path.append('..')
    path_to_storage = None
else:
    # Colab run: mount Google Drive, which holds both the data and the modules.
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_storage = '/content/gdrive/My Drive/UCU-2019-final-project-storage'

    # The custom modules are stored in the same Drive folder; make it importable.
    path_to_modules = path_to_storage
    sys.path.append(path_to_modules)

In [2]:
import numpy
import pandas as pd
import networkx as nx
import pickle
import hashlib
from itertools import combinations
from utils.func.functions import pickle_and_remove, build_x

In [3]:
# When no storage path was chosen above, fall back to ../storage resolved
# against the current working directory.
path_to_storage = path_to_storage or os.path.abspath(os.path.join(os.getcwd(), '../storage'))

# Sub-folders used by the rest of the notebook (both keep a trailing slash).
data_folder = path_to_storage + '/data/'
serialization_objects_folder = path_to_storage + '/serialization_objects/'

In [4]:
# Load the pickled train/test objects. Each file is opened in a `with` block so
# its handle is closed as soon as the object has been read (the previous
# `pickle.load(open(...))` form left all four handles open).
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open(serialization_objects_folder + 'X_train.p', 'rb') as pickled_file:
    X_train = pickle.load(pickled_file)
with open(serialization_objects_folder + 'y_train.p', 'rb') as pickled_file:
    y_train = pickle.load(pickled_file)
with open(serialization_objects_folder + 'X_test.p', 'rb') as pickled_file:
    X_test = pickle.load(pickled_file)
with open(serialization_objects_folder + 'y_test.p', 'rb') as pickled_file:
    y_test = pickle.load(pickled_file)

In [5]:
def add_clique_size(X, y):
    """Add a 'clique_size' column to ``X`` in place.

    An undirected graph is built with one edge per row of ``X``
    (``question1`` -- ``question2``). Every row then receives the size of the
    largest maximal clique that contains both of its questions, or -1 when no
    clique pairs the two (for example when ``question1 == question2``).

    Args:
        X: DataFrame with 'question1' and 'question2' columns; modified in place.
        y: Unused. The labels never influenced the result; the parameter is
            kept so existing ``add_clique_size(X, y)`` calls keep working.

    Returns:
        None. The result is written to ``X['clique_size']``.
    """
    question_pairs = list(zip(X['question1'], X['question2']))

    graph = nx.Graph()
    graph.add_edges_from(question_pairs)

    # Largest clique size seen for each *unordered* question pair. Using an
    # orientation-free key means that when the data contains both (a, b) and
    # (b, a) as rows, both rows get the size; the earlier if/elif lookup filled
    # only one orientation and could leave the mirrored row at -1.
    largest_clique = {}
    for clique in nx.find_cliques(graph):
        size = len(clique)
        for q1, q2 in combinations(clique, 2):
            pair = frozenset((q1, q2))
            # Keep the maximum directly instead of sorting all cliques by size.
            if size > largest_clique.get(pair, 0):
                largest_clique[pair] = size

    # Plain list lookup instead of a row-wise DataFrame.apply: same values,
    # much cheaper, and it also works when X has no rows.
    X['clique_size'] = [largest_clique.get(frozenset(pair), -1) for pair in question_pairs]


In [6]:
# Adds the 'clique_size' column to X_train in place (the function returns nothing).
add_clique_size(X_train, y_train)

In [7]:
# Same for the test split; its graph is built from the X_test rows only.
add_clique_size(X_test, y_test)

In [12]:
# Preview the first rows of X_test, including the new 'clique_size' column.
X_test.head(5)

Unnamed: 0_level_0,qid1,qid2,question1,question2,clique_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
92217,154416,154417,Thoughts on the new Pakistani blockbuster movi...,What do you think of the new Pakistani movie W...,2
191143,12400,168520,How do I come out of comfort zone?,How exactly can I get out of this comfort zone...,3
22640,42469,42470,What are the best ways to fake your own death?,What are the worst ways to fake one's own death?,2
348814,1178,52837,What is the meaning of life? Whats our purpose...,What's the purpose of a human life?,3
259163,374897,374898,How do I better communicate with my boyfriend?,How can I better communicate with my boyfriend...,2


In [9]:
# Hand the new 'clique_size' column of each split to pickle_and_remove together
# with a feature name and the serialization folder.
# NOTE(review): pickle_and_remove lives in utils.func.functions; presumably it
# writes <folder>/<name>.p -- confirm against the helper.
pickle_and_remove(X_train['clique_size'], 'clique_size_train_w', serialization_objects_folder)
pickle_and_remove(X_test['clique_size'], 'clique_size_test_w', serialization_objects_folder)

In [10]:
# List the serialization folder via the IPython `ls` alias; "$name" expands the Python variable.
ls "$serialization_objects_folder"

1_train.p                         euclidean_train_w.p
X_test.p                          hausdorff_test_w.p
X_test_q1_tfidf.p                 hausdorff_train_w.p
X_test_q1_w2v_vect.p              intersection_ratio_test_w.p
X_test_q2_tfidf.p                 intersection_ratio_train_w.p
X_test_q2_w2v_vect.p              jaccard_distance_test_w.p
X_train.p                         jaccard_distance_train_w.p
X_train_q1_tfidf.p                l1_test_w.p
X_train_q1_w2v_vect.p             l1_train_w.p
X_train_q2_tfidf.p                l2_test_w.p
X_train_q2_w2v_vect.p             l2_train_w.p
braycurtis_distance_test_w.p      len_diff_test_w.p
braycurtis_distance_train_w.p     len_diff_train_w.p
braycurtis_test_w.p               manhattan_test_w.p
braycurtis_train_w.p              manhattan_train_w.p
canberra_distance_test_w.p        minkowski_distance_test_w.p
canberra_distance_train_w.p       minkowski_distance_train_w.p
canberra_test_w.p                 minkowski_test_w.p

In [16]:
# Sanity check: build the train feature frame and show 'clique_size' for its first row.
# NOTE(review): the recorded output has two 'clique_size' columns -- presumably one
# already on X_train and one added by build_x; confirm against build_x.
build_x(X_train, 'train', serialization_objects_folder).head(1)['clique_size']

Unnamed: 0_level_0,clique_size,clique_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1
243973,2,2


In [17]:
# Sanity check: build the test feature frame and show 'clique_size' for its first row.
# NOTE(review): the recorded output has two 'clique_size' columns -- presumably one
# already on X_test and one added by build_x; confirm against build_x.
build_x(X_test, 'test', serialization_objects_folder).head(1)['clique_size']

Unnamed: 0_level_0,clique_size,clique_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1
92217,2,2


In [18]:
# Drop the notebook's references to the train and test objects.
del X_train, y_train
del X_test, y_test