In [1]:
import os
import re
import tqdm
import string
import pandas as pd
import numpy as np
import util
from sklearn.decomposition import TruncatedSVD
from data_preprocessing import *
from word_embedding_load import *

In [2]:
# Pull the combined data set plus the train/test split pieces from the
# project's util module in a single call.
(
    all_data,
    train_size,
    test_size,
    train_x,
    train_y,
    test_x,
) = util.loadData()

  ID    Gene             Variation  \
0  0  FAM58A  Truncating Mutations   
1  1     CBL                 W802*   
2  2     CBL                 Q249E   
3  3     CBL                 N454D   
4  4     CBL                 L399V   

                                                Text  
0  Cyclin-dependent kinases (CDKs) regulate a var...  
1   Abstract Background  Non-small cell lung canc...  
2   Abstract Background  Non-small cell lung canc...  
3  Recent evidence has demonstrated that acquired...  
4  Oncogenic mutations in the monomeric Casitas B...  


In [3]:
# Run the project's text preprocessing over the combined data; it yields the
# cleaned text series and the sentence collection used for the text model.
allText, sentences = data_preprocess(all_data)

In [4]:
# Peek at the first five cleaned documents.
print(allText.head(5))

0    cyclindependent kinases cdks regulate variety ...
1    abstract background nonsmall cell lung cancer ...
2    abstract background nonsmall cell lung cancer ...
3    recent evidence demonstrated acquired uniparen...
4    oncogenic mutations monomeric casitas blineage...
Name: Text, dtype: object


In [5]:
# Number of preprocessed documents (the train and test rows together).
allText.shape

(4675,)

In [6]:
# Width of the text-embedding feature group. The canonical spelling is
# TEXT_INPUT_DIM (the name the rest of the notebook uses); the mixed-case
# name is kept only as a backward-compatible alias.
TEXT_INPUT_DIM = 200
Text_INPUT_DIM = TEXT_INPUT_DIM

# Doc2Vec hyper-parameters, passed positionally to the project's Doc2VecParam.
# NOTE(review): the third argument (200) presumably is the embedding size and
# should stay equal to TEXT_INPUT_DIM -- confirm against Doc2VecParam.
param = Doc2VecParam(1, 5, 200, 1e-4, 5, 4, 30, 1)

# Location of the Doc2Vec model file that is handed to getTextModel.
filename = '../model/doc2vec/docEmbeddings_30_load_all.d2v'


In [7]:
# Widths of the feature groups that make up one model input row.
GENE_INPUT_DIM = 25    # components kept by the truncated SVD
TEXT_INPUT_DIM = 200   # width of the text embedding

# Solver settings, kept separate from the feature widths.
SVD_N_ITER = 25        # iteration count for the SVD solver
SVD_RANDOM_STATE = 12  # fixed seed so the reduction is reproducible

# n_components follows GENE_INPUT_DIM so the reduced width tracks the
# constant; the iteration count is an independent knob (same value as before).
svd = TruncatedSVD(
    n_components=GENE_INPUT_DIM,
    n_iter=SVD_N_ITER,
    random_state=SVD_RANDOM_STATE,
)

In [12]:
# Obtain the text model; in the recorded run the log shows it being loaded
# from `filename` rather than trained.
text_model = getTextModel(sentences, param, filename)
# Reduced gene and variation encodings over the combined (train + test) data.
# NOTE(review): the same `svd` instance is passed to both helpers -- presumably
# each call fits it afresh; confirm the two calls cannot interfere.
truncated_one_hot_gene = getGeneVec(all_data, svd)
truncated_one_hot_variation = getVariationVec(all_data, svd)
# Per-document text vectors, already split into train and test arrays.
text_train_arrays, text_test_arrays = getTextVec(text_model, train_size, test_size, TEXT_INPUT_DIM)

Loading model...
successfully loaded the textmodel from ../model/doc2vec/docEmbeddings_30_load_all.d2v


In [13]:
# Shapes of the train and test text matrices, one per line.
print(text_train_arrays.shape, text_test_arrays.shape, sep="\n")
# Show the embedding of the first training document.
text_train_arrays[0]

(3689, 200)
(986, 200)


array([-4.32804871, -1.51728463, -1.11980414, -1.48884988, -1.94485927,
        2.92375898,  0.50411427, -1.12256765,  3.63756609,  0.98047012,
       -3.59505153, -1.69239998,  1.08154798,  0.04358532, -1.45718324,
       -1.40607941, -0.81513387, -1.83771491,  0.84762865,  1.21229076,
        2.47014523, -2.76897311,  1.76767576,  0.63634545, -1.18918824,
        0.41207117,  0.46613771, -0.66748267, -0.08578082,  0.74831831,
       -0.27361712, -1.59590268, -0.68817914,  1.80353093, -2.18088555,
       -0.95929134,  3.10893011,  1.76786911, -0.6726135 ,  1.6783632 ,
        0.86585337,  2.32940316,  3.42221165, -1.29582059,  1.10946131,
       -1.03413177,  1.08570337, -1.0144273 ,  0.31510681,  0.28210998,
        2.33720469,  0.54392654, -2.83733749, -1.03502595, -0.16337521,
       -3.04194999, -0.25847396, -2.89910054,  1.95583844,  2.05173278,
       -0.68942761,  2.17256355, -2.50274634,  2.23825169, -2.29581237,
        4.42594433,  2.62937164,  2.88882828, -0.28914687, -2.05

In [14]:
# Each row is laid out as [gene | variation | text]; the first `train_size`
# rows of the combined encodings belong to the training split, the rest to
# the test split.
train_set = np.hstack([
    truncated_one_hot_gene[:train_size],
    truncated_one_hot_variation[:train_size],
    text_train_arrays,
])
test_set = np.hstack([
    truncated_one_hot_gene[train_size:],
    truncated_one_hot_variation[train_size:],
    text_test_arrays,
])
# One-hot encode the training labels and keep them as a plain array.
encoded_y = np.array(pd.get_dummies(train_y))

In [15]:
# Shapes of the assembled train and test matrices, one per line.
print(train_set.shape, test_set.shape, sep="\n")

(3689, 250)
(986, 250)


In [18]:
# Show the first training row one feature group at a time, in the order the
# groups were stacked: gene, variation, text. The gene and variation groups
# both come from the same SVD, so each is GENE_INPUT_DIM wide; the bounds
# are derived from that constant instead of being hard-coded.
print(train_set[0, :GENE_INPUT_DIM])
print(train_set[0, GENE_INPUT_DIM:2 * GENE_INPUT_DIM])
print(train_set[0, 2 * GENE_INPUT_DIM:])

[  2.48455806e-23   5.95345506e-19   2.55148417e-20  -3.12596635e-22
   4.07695291e-22   5.14309572e-25   2.12813627e-24  -2.97221364e-27
   2.40533195e-28  -2.34699855e-30  -1.02435863e-29  -2.70616608e-28
   3.11053711e-28   1.52301261e-27  -6.31031488e-27  -4.29210763e-27
  -2.58870185e-26   1.64361290e-26  -6.65840234e-26   1.85506791e-26
  -3.44148337e-25  -1.13345002e-25   1.15081601e-24  -1.65793325e-23
   4.35367578e-23]
[  1.00000000e+00   7.43920261e-19   1.86967768e-19   2.36987218e-29
   3.78288394e-25   7.97142704e-23  -1.06895319e-20   6.87926690e-21
   4.36854413e-21  -4.04470778e-21   8.91881464e-21   6.59040569e-18
   5.67979623e-19  -3.15512997e-18  -9.40491899e-18  -6.36407932e-18
  -3.20292165e-18   9.53349364e-18   7.28293876e-18   2.84124514e-17
  -1.85442723e-17   1.28733104e-17   1.05102497e-17   4.67448189e-17
  -8.77125342e-21]
[-4.32804871 -1.51728463 -1.11980414 -1.48884988 -1.94485927  2.92375898
  0.50411427 -1.12256765  3.63756609  0.98047012 -3.59505153 

In [29]:
# Shape of the one-hot label matrix, followed by two sample rows.
print(encoded_y.shape, encoded_y[300], encoded_y[1000], sep="\n")

(3689, 9)
[0 0 0 0 0 0 1 0 0]
[0 0 0 1 0 0 0 0 0]


In [30]:
# Take the first two training rows as sample vectors for the similarity checks.
v1, v2 = train_set[0], train_set[1]

In [31]:
# Raw inner product of the two sample rows.
v1.dot(v2)

372.08543091707043

In [32]:
from baseline_classification import *

In [33]:
# Similarity between the two sample rows. The list gives the widths of the
# gene, variation and text groups, taken from the notebook constants rather
# than repeated as literals.
# NOTE(review): presumably computeSim scores each segment separately and
# combines them -- confirm in baseline_classification.
computeSim(v2, v1, [GENE_INPUT_DIM, GENE_INPUT_DIM, TEXT_INPUT_DIM])

0.13847298392729604

In [34]:
# Sanity check: compare the gene segment of a row with itself (the recorded
# result is 1.0). The slice bound comes from GENE_INPUT_DIM, not a literal.
getCos(v2[:GENE_INPUT_DIM], v2[:GENE_INPUT_DIM])

1.0