In [1]:
# Google Colab support: detect the runtime and set up storage/module paths
import os
import sys
def isCollab():
    """Return True when the ``COLAB_GPU`` environment variable is set.

    Only the presence of the variable is checked; its value (even an empty
    string) is not inspected.

    Returns
    -------
    bool
        True if ``COLAB_GPU`` exists in the process environment, else False.
    """
    # Membership test instead of comparing the looked-up value with `!= None`.
    return 'COLAB_GPU' in os.environ

if not isCollab():
    # Local run: make the parent directory importable and leave the storage
    # location unset for now.
    sys.path.append('..')
    path_to_storage = None
else:
    # Colab run: mount Google Drive, which holds the project storage.
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_storage = '/content/gdrive/My Drive/UCU-2019-final-project-storage'

    # The custom modules are kept in the same Drive folder as the storage,
    # so that folder is also added to the import path.
    path_to_modules = path_to_storage
    sys.path.append(path_to_modules)

In [2]:
import numpy
import pandas as pd
import networkx as nx
import pickle
import hashlib
from itertools import combinations
from utils.func.functions import pickle_and_remove, build_x

In [3]:
# When no storage location has been set yet, fall back to the `storage`
# directory that sits next to the current working directory.
if not path_to_storage:
    path_to_storage = os.path.abspath(
        os.path.join(os.getcwd(), '../storage')
    )

# Both folder strings end with a slash.
data_folder = '{}/data/'.format(path_to_storage)
serialization_objects_folder = '{}/serialization_objects/'.format(path_to_storage)

In [4]:
def _load_pickle(file_name):
    """Load one pickled object from ``serialization_objects_folder``.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside the serialization folder.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that this project produced itself.
    # The context manager closes the file handle even if unpickling fails.
    with open(serialization_objects_folder + file_name, 'rb') as handle:
        return pickle.load(handle)


X_train = _load_pickle('X_train.p')
y_train = _load_pickle('y_train.p')
# Fixed: the test features were previously read from 'X_train.p'.
X_test = _load_pickle('X_test.p')
y_test = _load_pickle('y_test.p')

In [5]:
def add_clique_size(X, y):
    """Add a ``clique_size`` column to ``X`` in place.

    An undirected graph is built with one edge per ``(question1, question2)``
    row of ``X``. Every row is then labelled with the size of the largest
    maximal clique that contains both of its questions.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``question1`` and ``question2`` columns. Mutated in
        place: a ``clique_size`` column is added (or overwritten).
    y : object
        Not used by the computation. Kept so that existing two-argument
        calls continue to work.

    Returns
    -------
    None

    Notes
    -----
    A row whose pair is never produced by a clique combination (for example
    a question paired with itself) receives ``-1``.
    """
    G = nx.Graph()
    question_pairs = list(zip(X['question1'], X['question2']))
    G.add_edges_from(question_pairs)

    # Set of pairs exactly as they are oriented in X, for O(1) membership tests.
    known_pairs = set(question_pairs)

    map_clique_size = {}
    # Ascending by size: larger cliques are processed last, so their size
    # overwrites the value written by any smaller clique sharing the pair.
    for cli in sorted(nx.find_cliques(G), key=len):
        size = len(cli)
        for q1, q2 in combinations(cli, 2):
            # Two independent checks (not if/elif): the same pair can occur in
            # X in both orientations, and each orientation needs its own entry.
            if (q1, q2) in known_pairs:
                map_clique_size[q1, q2] = size
            if (q2, q1) in known_pairs:
                map_clique_size[q2, q1] = size

    # One lookup per row, in row order; also works when X has no rows.
    X['clique_size'] = [map_clique_size.get(pair, -1) for pair in question_pairs]


In [6]:
# Adds the clique_size column to X_train in place (the function returns nothing).
add_clique_size(X_train, y_train)

In [7]:
# Adds the clique_size column to X_test in place, using a graph built from X_test only.
add_clique_size(X_test, y_test)

In [8]:
# Preview the first five rows to check the newly added clique_size column.
X_train.head(5)

Unnamed: 0_level_0,qid1,qid2,question1,question2,clique_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
243973,356509,356510,How does airbnb screen its host?,When as an Airbnb host can I review my guests?,2
82523,123111,45893,How can I learn hacking for free?,How can I learn hacking for security purposes?,5
373083,41716,2986,How can I speak fluent English with accuracy?,How do I speak English like celebrities?,7
145241,86221,51226,What are the best books for UPSC?,Which are the best books to prepare for IAS exam?,2
227393,336229,302258,Why do smart people have to ask questions on Q...,Why do people ask questions on Quora?,2


In [9]:
# Hand the new feature column of each split to the project helper.
# NOTE(review): pickle_and_remove comes from utils.func.functions and is not shown
# here — presumably it pickles the object under the given name into the given
# folder; confirm its exact behaviour (including what it removes) in that module.
pickle_and_remove(X_train['clique_size'], 'clique_size_train_w', serialization_objects_folder)
pickle_and_remove(X_test['clique_size'], 'clique_size_test_w', serialization_objects_folder)

In [10]:
# List the serialization folder (IPython shell magic; `$name` expands the Python variable).
ls "$serialization_objects_folder"

1_train.p               chebyshev_test_w.p      hausdorff_train_w.p
X_test.p                chebyshev_train_w.p     l1_test_w.p
X_test_q1_tfidf.p       cityblock_test_w.p      l1_train_w.p
X_test_q1_w2v_vect.p    cityblock_train_w.p     l2_test_w.p
X_test_q2_tfidf.p       clique_size_test.p      l2_train_w.p
X_test_q2_w2v_vect.p    clique_size_test_w.p    manhattan_test_w.p
X_train.p               clique_size_train.p     manhattan_train_w.p
X_train_q1_tfidf.p      clique_size_train_w.p   minkowski_test_w.p
X_train_q1_w2v_vect.p   correlation_test_w.p    minkowski_train_w.p
X_train_q2_tfidf.p      correlation_train_w.p   readme
X_train_q2_w2v_vect.p   cosine_test_w.p         sqeuclidean_test_w.p
braycurtis_test_w.p     cosine_train_w.p        sqeuclidean_train_w.p
braycurtis_train_w.p    euclidean_test_w.p      weighted_mean2_train.p
canberra_test_w.p       euclidean_train_w.p     y_test.p
canberra_train_w.p      hausdorff_test_w.p      y_train.p


In [11]:
# Drop the notebook's references to the loaded datasets so their memory can be reclaimed.
del X_train, y_train,X_test, y_test