## Imports

In [1]:
import os

In [2]:
from gensim.models.wrappers.fasttext import FastText

In [3]:
from scipy.spatial.distance import cosine, euclidean, cityblock

## Config

In [4]:
# Project data directories. Every path is stored WITH a trailing separator,
# because the rest of the notebook builds file paths by plain string
# concatenation (`folder + 'file.ext'`). Joining with an empty final
# component is what appends that separator.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')
features_data_folder = os.path.join(data_folder, 'features', '')

## Read Data

In [5]:
# Read the raw train/test CSV files; every NaN cell is replaced by an empty string.
df_questions_train = pd.read_csv(os.path.join(data_folder, 'train.csv')).fillna(value='')
df_questions_test = pd.read_csv(os.path.join(data_folder, 'test.csv')).fillna(value='')

In [6]:
# Load the tokenized question pairs stored as JSON in the preproc folder.
# build_features() reads each entry's 'id', 'question1' and 'question2' keys.
question_tokens_train = load_json(
    os.path.join(preproc_data_folder, 'question_tokens_train.json')
)
question_tokens_test = load_json(
    os.path.join(preproc_data_folder, 'question_tokens_test.json')
)

In [7]:
# Load the word vectors (word2vec text format) from the aux data folder.
embedding_model = FastText.load_word2vec_format(
    os.path.join(aux_data_folder, 'quora_filtered.vec')
)

In [8]:
# Load the previously saved TF-IDF distance features; build_features() copies
# them into the last two columns of the feature matrix.
X_tfidf_train = load(
    os.path.join(features_data_folder, 'X_train_tfidf_distances.pickle')
)
X_tfidf_test = load(
    os.path.join(features_data_folder, 'X_test_tfidf_distances.pickle')
)

## Build Features

In [9]:
def build_features(df_questions_original, questions_tokenized, embedding_model, X_tfidf_distances):
    """
    Assemble the feature matrix for a collection of question pairs.

    Columns of the returned matrix:
        0, 1    length of question1 / question2 in characters
        2, 3    length of question1 / question2 in tokens
        4       absolute difference of the character lengths
        5       absolute difference of the token counts
        6       cosine distance between the mean word vectors
        7       Manhattan distance between the mean word vectors
        8       Euclidean distance between the mean word vectors
        9       Word Mover's Distance between the two token lists
        10, 11  values copied verbatim from `X_tfidf_distances`

    Args:
        df_questions_original: DataFrame with `question1` and `question2`
            columns. Rows are selected by index label, using each pair's 'id'.
        questions_tokenized: Sized iterable of mappings with the keys 'id',
            'question1' and 'question2'; the two question entries are
            sequences of tokens.
        embedding_model: Object that supports `embedding_model[token]` and
            `embedding_model.wmdistance(tokens_a, tokens_b)`.
        X_tfidf_distances: Array that can be assigned to a
            `(len(questions_tokenized), 2)` slice.

    Returns:
        A float NumPy array of shape `(len(questions_tokenized), 12)`.
    """
    num_pairs = len(questions_tokenized)
    num_features = 12

    X = np.zeros((num_pairs, num_features), dtype=float)

    for index, pair in progressbar(enumerate(questions_tokenized), size=num_pairs):
        # Label-based lookup of the original (untokenized) row. `.loc` replaces
        # the `.ix` indexer, which no longer exists in pandas >= 1.0; the row
        # is fetched once and reused for both questions.
        original_row = df_questions_original.loc[pair['id']]
        q1_original = original_row.question1
        q2_original = original_row.question2

        q1_tokens = pair['question1']
        q2_tokens = pair['question2']

        # NOTE(review): an empty token list makes np.mean return NaN, which the
        # distance functions below are unlikely to accept -- confirm that the
        # preprocessing step guarantees at least one token per question.
        q1_mean = np.mean([embedding_model[token] for token in q1_tokens], axis=0)
        q2_mean = np.mean([embedding_model[token] for token in q2_tokens], axis=0)

        # Length of questions (in characters)
        X[index, 0] = len(q1_original)
        X[index, 1] = len(q2_original)

        # Length of questions (in tokens)
        X[index, 2] = len(q1_tokens)
        X[index, 3] = len(q2_tokens)

        # Difference of question length (in characters)
        X[index, 4] = abs(len(q1_original) - len(q2_original))

        # Difference of question length (in tokens)
        X[index, 5] = abs(len(q1_tokens) - len(q2_tokens))

        # Cosine distance between average word vectors
        X[index, 6] = cosine(q1_mean, q2_mean)

        # Manhattan distance between average word vectors
        X[index, 7] = cityblock(q1_mean, q2_mean)

        # Euclidean distance between average word vectors
        X[index, 8] = euclidean(q1_mean, q2_mean)

        # Word Mover's Distance between the documents
        X[index, 9] = embedding_model.wmdistance(q1_tokens, q2_tokens)

    # Precomputed TF-IDF cosine and euclidean distances fill the last two columns
    X[:, 10:12] = X_tfidf_distances

    return X

In [10]:
# Compute the feature matrix for the training pairs.
X_train = build_features(
    df_questions_original=df_questions_train,
    questions_tokenized=question_tokens_train,
    embedding_model=embedding_model,
    X_tfidf_distances=X_tfidf_train,
)

In [11]:
# Persist the raw training feature matrix.
save(
    X_train,
    os.path.join(features_data_folder, 'X_train_all_features.pickle'),
)

In [12]:
# Wrap the training matrix in a DataFrame; the column order mirrors the
# column indices filled in by build_features().
df_X_train = pd.DataFrame(
    data=X_train,
    columns=[
        'q1_len',
        'q2_len',
        'q1_token_len',
        'q2_token_len',
        'len_diff',
        'token_len_diff',
        'mean_emb_cosine',
        'mean_emb_manhattan',
        'mean_emb_euclidean',
        'wmd',
        'tfidf_cosine',
        'tfidf_euclidean',
    ],
)

In [13]:
# Append the saved training labels as the 'is_duplicate' column.
df_X_train['is_duplicate'] = load(
    os.path.join(features_data_folder, 'y_train.pickle')
)

In [14]:
# Export the labelled training features; the row index is written as the 'id' column.
df_X_train.to_csv(
    path_or_buf=os.path.join(features_data_folder, 'X_train_all_features.csv'),
    header=True,
    index=True,
    index_label='id',
)

In [15]:
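# Disabled: attach the original question text to the feature table for manual inspection.
# df_X_train['question1'] = df_questions_train.question1
# df_X_train['question2'] = df_questions_train.question2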

In [16]:
# Notebook display cell: evaluating the bare name renders the training feature table.
df_X_train

Unnamed: 0,q1_len,q2_len,q1_token_len,q2_token_len,len_diff,token_len_diff,mean_emb_cosine,mean_emb_manhattan,mean_emb_euclidean,wmd,tfidf_cosine,tfidf_euclidean,is_duplicate
0,66.0,57.0,9.0,8.0,9.0,1.0,2.505759e-02,6.281017,0.459552,0.557206,2.134459e-02,0.206614,0
1,51.0,88.0,7.0,11.0,37.0,4.0,7.731999e-02,13.733343,0.975774,2.110534,2.297947e-01,0.677930,0
2,73.0,59.0,9.0,8.0,14.0,1.0,9.676523e-02,13.260343,0.964824,2.095801,7.420523e-01,1.218238,0
3,50.0,65.0,7.0,15.0,15.0,8.0,4.607860e-01,28.246296,2.034251,3.840113,1.000000e+00,1.414214,0
4,76.0,39.0,12.0,6.0,37.0,6.0,2.231927e-01,21.542313,1.556873,3.242191,7.365554e-01,1.213718,0
5,86.0,90.0,11.0,10.0,4.0,1.0,1.163057e-01,14.961195,1.053995,1.888861,4.890423e-01,0.988982,1
6,19.0,62.0,3.0,9.0,43.0,6.0,4.965189e-01,32.204063,2.338865,4.333144,1.000000e+00,1.414214,0
7,30.0,41.0,4.0,4.0,11.0,0.0,1.218676e-01,16.147606,1.144285,1.558236,1.856960e-01,0.609419,1
8,31.0,37.0,6.0,4.0,6.0,2.0,2.894151e-01,25.266159,1.823681,2.196834,0.000000e+00,0.000000,0
9,60.0,49.0,11.0,11.0,11.0,0.0,8.217730e-02,11.594727,0.854075,1.360483,3.255035e-01,0.806850,0


In [17]:
# Compute the feature matrix for the test pairs.
X_test = build_features(
    df_questions_original=df_questions_test,
    questions_tokenized=question_tokens_test,
    embedding_model=embedding_model,
    X_tfidf_distances=X_tfidf_test,
)

In [18]:
# Persist the raw test feature matrix.
save(
    X_test,
    os.path.join(features_data_folder, 'X_test_all_features.pickle'),
)

In [19]:
# Wrap the test matrix in a DataFrame; the column order mirrors the
# column indices filled in by build_features().
df_X_test = pd.DataFrame(
    data=X_test,
    columns=[
        'q1_len',
        'q2_len',
        'q1_token_len',
        'q2_token_len',
        'len_diff',
        'token_len_diff',
        'mean_emb_cosine',
        'mean_emb_manhattan',
        'mean_emb_euclidean',
        'wmd',
        'tfidf_cosine',
        'tfidf_euclidean',
    ],
)

In [20]:
# Export the test features; the row index is written as the 'id' column.
test_csv_options = dict(header=True, index=True, index_label='id')
df_X_test.to_csv(
    os.path.join(features_data_folder, 'X_test_all_features.csv'),
    **test_csv_options
)
del test_csv_options  # keep the notebook namespace unchanged