In [1]:
import os
import re
import tqdm
import string
import pandas as pd
import numpy as np
import util
from sklearn.decomposition import TruncatedSVD
from data_preprocessing import *
from word_embedding_load import *

In [2]:
# Load everything through the project helper; it hands back six values that
# are unpacked positionally in this order.
all_data, train_size, test_size, train_x, train_y, test_x = util.loadData()

  ID    Gene             Variation  \
0  0  FAM58A  Truncating Mutations   
1  1     CBL                 W802*   
2  2     CBL                 Q249E   
3  3     CBL                 N454D   
4  4     CBL                 L399V   

                                                Text  
0  Cyclin-dependent kinases (CDKs) regulate a var...  
1   Abstract Background  Non-small cell lung canc...  
2   Abstract Background  Non-small cell lung canc...  
3  Recent evidence has demonstrated that acquired...  
4  Oncogenic mutations in the monomeric Casitas B...  


In [3]:
# Run the project's preprocessing over the combined data. It returns two
# values: the cleaned text and a `sentences` result.
# NOTE(review): the structure of `sentences` is defined by data_preprocess, not shown here.
allText, sentences = data_preprocess(all_data)

In [4]:
# Show the first five preprocessed documents as a quick sanity check.
print(allText.head(n=5))

0    cyclindependent kinases cdks regulate variety ...
1    abstract background nonsmall cell lung cancer ...
2    abstract background nonsmall cell lung cancer ...
3    recent evidence demonstrated acquired uniparen...
4    oncogenic mutations monomeric casitas blineage...
Name: Text, dtype: object


In [5]:
# Width of the text embedding vectors. The original spelling `Text_INPUT_DIM`
# did not match the `TEXT_INPUT_DIM` name that later cells actually read, so
# the canonical constant is defined here and the old spelling is kept as an
# alias for any code that still refers to it.
TEXT_INPUT_DIM = 200
Text_INPUT_DIM = TEXT_INPUT_DIM

# Parameters for the text model, passed positionally.
# NOTE(review): the meaning of each positional value is defined by
# Doc2VecParam, which is not visible in this notebook — confirm against it.
param = Doc2VecParam(1, 5, 600, 1e-4, 5, 4, 30, 1)

# Path of the saved text model, relative to the notebook's working directory.
filename = '../model/doc2vec/docEmbeddings_30_load_all.d2v'


In [6]:
# Widths of the feature blocks that are concatenated later in the notebook.
GENE_INPUT_DIM = 25
TEXT_INPUT_DIM = 200

# Iteration count for the SVD solver (same value the original call used).
SVD_N_ITER = 25

# The original call hard-coded n_components=25 and passed GENE_INPUT_DIM as
# n_iter, so changing the dimension constant would have altered the solver's
# iteration count instead of the output width. Wire each value to the
# argument it actually describes; with the current constants the resulting
# estimator is configured identically.
svd = TruncatedSVD(
    n_components=GENE_INPUT_DIM,
    n_iter=SVD_N_ITER,
    random_state=12,  # fixed seed so the decomposition is reproducible
)

In [7]:
# Obtain the text model for `param`, using the saved file at `filename`.
text_model = getTextModel(param, filename)

# Build the gene and variation feature matrices; both helpers receive the
# same `svd` object, so the call order is kept as-is.
truncated_one_hot_gene = getGeneVec(all_data, svd)
truncated_one_hot_variation = getVariationVec(all_data, svd)

# Text vectors for the train and test portions, TEXT_INPUT_DIM columns each.
text_train_arrays, text_test_arrays = getTextVec(
    text_model,
    train_size,
    test_size,
    TEXT_INPUT_DIM,
)

Loading model...
successfully loaded the textmodel from ../model/doc2vec/docEmbeddings_30_load_all.d2v


In [8]:
# Report the (rows, columns) of the train and test text matrices, one per line.
print(text_train_arrays.shape, text_test_arrays.shape, sep="\n")

(3689, 200)
(986, 200)


In [9]:
# Concatenate the three feature blocks column-wise in the order
# gene | variation | text. The gene and variation matrices cover every row,
# so the first `train_size` rows go to the training set and the rest to the
# test set.
train_set = np.hstack((
    truncated_one_hot_gene[:train_size],
    truncated_one_hot_variation[:train_size],
    text_train_arrays,
))
test_set = np.hstack((
    truncated_one_hot_gene[train_size:],
    truncated_one_hot_variation[train_size:],
    text_test_arrays,
))

# One-hot encode the training labels and keep the result as a plain ndarray.
encoded_y = np.array(pd.get_dummies(train_y))

In [10]:
# Report the (rows, columns) of the assembled train and test matrices, one per line.
print(train_set.shape, test_set.shape, sep="\n")

(3689, 250)
(986, 250)


In [11]:
# Inspect columns 25..49 of the first training row. Given the hstack order
# (gene, then variation, then text), this is presumably the variation block —
# TODO confirm the gene block is exactly 25 columns wide.
train_set[0][25:50]

array([  1.00000000e+00,   7.43920261e-19,   1.86967768e-19,
         2.36987218e-29,   3.78288394e-25,   7.97142704e-23,
        -1.06895319e-20,   6.87926690e-21,   4.36854413e-21,
        -4.04470778e-21,   8.91881464e-21,   6.59040569e-18,
         5.67979623e-19,  -3.15512997e-18,  -9.40491899e-18,
        -6.36407932e-18,  -3.20292165e-18,   9.53349364e-18,
         7.28293876e-18,   2.84124514e-17,  -1.85442723e-17,
         1.28733104e-17,   1.05102497e-17,   4.67448189e-17,
        -8.77125342e-21])

In [12]:
# Dimensions of the one-hot label matrix: (samples, classes).
np.shape(encoded_y)

(3689, 9)

In [13]:
# Take the first two training rows as example feature vectors.
v1, v2 = train_set[0], train_set[1]

In [14]:
# Raw (unnormalised) inner product of the two example vectors.
v1.dot(v2)

372.08543091707043

In [16]:
from baseline_classification import *

In [17]:
# Similarity between the two example vectors. The third argument lists three
# segment widths that sum to the 250 feature columns.
# NOTE(review): presumably gene / variation / text block sizes — confirm
# against computeSim's definition.
computeSim(
    v2,
    v1,
    [25, 25, 200],
)

0.13847298392729604

In [18]:
# Sanity check: compare the first 25 components of v2 with themselves.
getCos(v2[:25], v2[:25])

1.0