In [1]:
import os
import re
import tqdm
import string
import pandas as pd
import numpy as np
import util
from sklearn.decomposition import TruncatedSVD
from data_preprocessing import *
from word_embedding_load import *

In [2]:
# Unpack the six values returned by util.loadData() into notebook-level names.
all_data, train_size, test_size, train_x, train_y, test_x = util.loadData()

  ID    Gene             Variation  \
0  0  FAM58A  Truncating Mutations   
1  1     CBL                 W802*   
2  2     CBL                 Q249E   
3  3     CBL                 N454D   
4  4     CBL                 L399V   

                                                Text  
0  Cyclin-dependent kinases (CDKs) regulate a var...  
1   Abstract Background  Non-small cell lung canc...  
2   Abstract Background  Non-small cell lung canc...  
3  Recent evidence has demonstrated that acquired...  
4  Oncogenic mutations in the monomeric Casitas B...  


In [3]:
# data_preprocess returns two values: the cleaned text and the sentences.
allText, sentences = data_preprocess(all_data)

In [4]:
# Preview the first rows of the preprocessed text returned by data_preprocess.
print(allText.head())

0    cyclindependent kinases cdks regulate variety ...
1    abstract background nonsmall cell lung cancer ...
2    abstract background nonsmall cell lung cancer ...
3    recent evidence demonstrated acquired uniparen...
4    oncogenic mutations monomeric casitas blineage...
Name: Text, dtype: object


In [5]:
# NOTE(review): the trailing `...` is valid Python and passes Ellipsis as the
# fourth argument; `param` is reassigned with a full argument list in the next
# cell before it is used — TODO confirm this cell can be dropped.
param = Doc2VecParam(3, 6, 201,...)

In [6]:
# NOTE(review): this name differs in case from the TEXT_INPUT_DIM constant
# defined further down in the notebook — confirm which spelling is intended.
Text_INPUT_DIM=200
# Parameters handed to getTextModel; the meaning of each positional argument is
# defined by Doc2VecParam, which is not visible in this notebook.
param = Doc2VecParam(1, 5, 600, 1e-4, 5, 4, 30, 1)
# Path passed to getTextModel together with `param`.
filename='../model/doc2vec/docEmbeddings_30_load_all.d2v'


In [7]:
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'os' is not defined" even though this notebook imports os;
# presumably the import is missing in the module that defines getTextModel —
# TODO confirm and fix there.
temp = getTextModel(param, filename)

NameError: name 'os' is not defined

In [None]:
def getTextVec(text_model, train_size, test_size, TEXT_INPUT_DIM = 200):
    """Collect per-document vectors into a train matrix and a test matrix.

    Vectors are looked up in ``text_model.docvecs`` under the tags
    ``'Text_0'``, ``'Text_1'``, ...; the first ``train_size`` tags fill the
    train matrix and the following ``test_size`` tags fill the test matrix.

    @param:
        text_model, object exposing a ``docvecs`` mapping from tag to vector
        train_size, number of training documents
        test_size, number of test documents
        TEXT_INPUT_DIM, length of each document vector
    @return: (text_train_arrays, text_test_arrays), arrays of shape
        (train_size, TEXT_INPUT_DIM) and (test_size, TEXT_INPUT_DIM)
    """
    def _gather(first_doc, n_docs):
        # Rows are zero-initialised, then overwritten one document at a time.
        rows = np.zeros((n_docs, TEXT_INPUT_DIM))
        for row, doc_id in enumerate(range(first_doc, first_doc + n_docs)):
            rows[row] = text_model.docvecs['Text_' + str(doc_id)]
        return rows

    # Test documents are tagged immediately after the training documents.
    return _gather(0, train_size), _gather(train_size, test_size)

In [None]:
# getTextVec returns (train, test); bind each to its own name so the training
# matrix is not immediately overwritten by the test matrix.
text_train_arrays, text_test_arrays = getTextVec(temp, train_size, test_size)

In [None]:
# Sanity check: dimensions of the test-set text matrix.
print(text_test_arrays.shape)

In [None]:
def getGeneVec(all_data, svd):
    """Return an SVD-compressed one-hot encoding of the 'Gene' column.

    The 'Gene' column is one-hot encoded with pandas and the resulting matrix
    is passed through ``svd.fit_transform``; the output width is whatever the
    supplied ``svd`` is configured to produce.

    @param:
        all_data, table with a 'Gene' column (one row per sample)
        svd, TruncatedSVD model from sklearn; it is (re)fitted by this call
    @return: truncated_one_hot_gene, gene vector representation with one row
        per row of all_data
    """
    one_hot_gene = pd.get_dummies(all_data['Gene'])
    truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
    return truncated_one_hot_gene

def getVariationVec(all_data, svd):
    """Return an SVD-compressed one-hot encoding of the 'Variation' column.

    The 'Variation' column is one-hot encoded with pandas and the resulting
    matrix is passed through ``svd.fit_transform``; the output width is
    whatever the supplied ``svd`` is configured to produce.

    @param:
        all_data, table with a 'Variation' column (one row per sample)
        svd, TruncatedSVD model from sklearn; it is (re)fitted by this call
    @return: truncated_one_hot_variation, variation vector representation
        with one row per row of all_data
    """
    one_hot_variation = pd.get_dummies(all_data['Variation'])
    truncated_one_hot_variation = svd.fit_transform(one_hot_variation.values)
    return truncated_one_hot_variation

GENE_INPUT_DIM = 25   # width of the SVD-compressed gene / variation vectors
TEXT_INPUT_DIM = 200  # length of each document vector read by getTextVec
SVD_N_ITER = 25       # solver iterations for TruncatedSVD (same value as before)
# The output width must follow GENE_INPUT_DIM. Previously that constant was
# passed as n_iter while n_components was a hard-coded 25, so changing the
# constant would have altered the iteration count instead of the vector width.
svd = TruncatedSVD(n_components=GENE_INPUT_DIM, n_iter=SVD_N_ITER, random_state=12)


In [None]:
# Build the three feature groups that are later stacked column-wise.
# NOTE(review): getTextModel is called with the same arguments as in the
# earlier `temp = getTextModel(param, filename)` cell — confirm whether the
# model could be reused instead of being obtained twice.
text_model = getTextModel(param, filename)
# Both encodings share one `svd` object; each helper calls svd.fit_transform,
# so the transformer is refitted on every call.
truncated_one_hot_gene = getGeneVec(all_data, svd)
truncated_one_hot_variation = getVariationVec(all_data, svd)
text_train_arrays, text_test_arrays = getTextVec(text_model, train_size, test_size, TEXT_INPUT_DIM)

In [None]:
# Sanity check: dimensions of the training-set text matrix.
print(text_train_arrays.shape)

In [None]:
# Column layout of every row: [gene SVD | variation SVD | text vector].
# The first `train_size` rows of each encoding belong to the training split,
# the remaining rows to the test split.
train_set = np.hstack([
    truncated_one_hot_gene[:train_size],
    truncated_one_hot_variation[:train_size],
    text_train_arrays,
])
test_set = np.hstack([
    truncated_one_hot_gene[train_size:],
    truncated_one_hot_variation[train_size:],
    text_test_arrays,
])
# One-hot encode the training labels and keep them as a plain ndarray.
encoded_y = np.array(pd.get_dummies(train_y))

In [None]:
# Sanity check: the three training feature groups must have the same number of
# rows, otherwise np.hstack cannot concatenate them column-wise.
print(truncated_one_hot_gene[:train_size].shape)
print(truncated_one_hot_variation[:train_size].shape)
print(text_train_arrays.shape)