# Generate our corpus to train the Base Model for Transfer Learning

In [1]:
import collections
import csv
import pickle
import random
import re
import sys

import numpy as np
import pandas as pd
import sklearn as sk
#pip install -U gensim
from gensim.models import word2vec
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Execute the shared helper notebook so that its definitions are available here.
# The helpers called below (sample_positive_negative, split_train_test_val, dataRead,
# get_wordList_and_distances_Corpus, findSentLengths, word_mapping, readWordEmb,
# mapWordToId, mapLabelToId, paddData) are not defined in this notebook, so they
# presumably come from utils.ipynb — TODO confirm.
%run ../utils.ipynb

In [3]:
# --- Reproducibility -------------------------------------------------------
random_seed = 1331

# --- Split proportions (train / test / validation) -------------------------
train_frac = 0.5
test_frac = 0.2
val_frac = 0.3

# --- Input files -----------------------------------------------------------
pos_file = "data/dataset_disgent_ctd_6719.csv"
neg_file = "data/negatives_5000.csv"

# --- Output files for the generated splits ---------------------------------
ftrain = "../data/our_corpus/train_samples_50of100.csv"
ftest = "../data/our_corpus/test_samples_20of100.csv"
fval = "../data/our_corpus/val_samples_30of100.csv"

# --- Word embeddings -------------------------------------------------------
# Dimensionality passed to readWordEmb together with the embedding file.
embSize = 200
# Replace with the path of the word embedding file on your machine
# (this is an absolute, machine-specific location).
wefile = "/mnt/admin/GDA_backup/Dataset/embeddings/PubMed-and-PMC-w2v.bin"

In [4]:
# Load the negative-sample CSV (read as latin-1) into `neg_samples`.
# NOTE(review): `neg_samples` is not referenced anywhere else in this notebook — the
# sampling cell passes the path `neg_file` instead — so this read looks like a leftover.
neg_samples= pd.read_csv(neg_file, encoding='latin-1')


# Sample positive and negative examples, then combine & shuffle

In [5]:
# Change random_seed (config cell) to draw a different sample on each run.
positive_samples, negative_samples = sample_positive_negative(
    pos_file,
    neg_file,
    neg_ratio=1,
    num_pos_samples=4000,
    random_seed=random_seed,
)

# Split each class separately, with the same fractions and the same seed,
# into train / test / validation parts.
pos_train_samples, pos_test_samples, pos_val_samples = split_train_test_val(
    positive_samples, train_frac, test_frac, val_frac, random_seed
)
neg_train_samples, neg_test_samples, neg_val_samples = split_train_test_val(
    negative_samples, train_frac, test_frac, val_frac, random_seed
)

In [6]:
def _combine_and_shuffle(pos_samples, neg_samples, seed):
    """Stack positive and negative samples and shuffle the rows reproducibly.

    Uses ``pd.concat`` instead of ``DataFrame.append``, which was deprecated in
    pandas 1.4 and removed in pandas 2.0. The original row index is kept
    (no ``ignore_index``), exactly as ``append`` did.

    Parameters
    ----------
    pos_samples, neg_samples : pandas.DataFrame
        The two frames to stack (positives first).
    seed : int
        Passed as ``random_state`` so the shuffle is repeatable.

    Returns
    -------
    pandas.DataFrame
        All rows of both inputs in shuffled order.
    """
    combined = pd.concat([pos_samples, neg_samples], sort=False)
    # frac=1 returns every row once, i.e. a full permutation of the frame.
    return combined.sample(frac=1, random_state=seed)


train_samples = _combine_and_shuffle(pos_train_samples, neg_train_samples, random_seed)
test_samples = _combine_and_shuffle(pos_test_samples, neg_test_samples, random_seed)
val_samples = _combine_and_shuffle(pos_val_samples, neg_val_samples, random_seed)

# Persist the three splits without the (shuffled) index column.
train_samples.to_csv(ftrain, index=False)
test_samples.to_csv(ftest, index=False)
val_samples.to_csv(fval, index=False)

# Read Data and Create Features

In [7]:
# Each split is read back from the CSV written above (max_length=100) and then
# turned into word lists plus two distance lists (d1 / d2) relative to the entities.

# --- Train ---
(
    Tr_sent_contents,
    Tr_entity1_list,
    Tr_entity2_list,
    Tr_sent_lables,
    Tr_gene_id_list,
    Tr_disease_id_list,
) = dataRead(ftrain, max_length=100)
Tr_word_list, Tr_d1_list, Tr_d2_list = get_wordList_and_distances_Corpus(
    Tr_sent_contents, Tr_entity1_list, Tr_entity2_list
)

# --- Validation ---
(
    V_sent_contents,
    V_entity1_list,
    V_entity2_list,
    V_sent_lables,
    V_gene_id_list,
    V_disease_id_list,
) = dataRead(fval, max_length=100)
V_word_list, V_d1_list, V_d2_list = get_wordList_and_distances_Corpus(
    V_sent_contents, V_entity1_list, V_entity2_list
)

# --- Test ---
(
    Te_sent_contents,
    Te_entity1_list,
    Te_entity2_list,
    Te_sent_lables,
    Te_gene_id_list,
    Te_disease_id_list,
) = dataRead(ftest, max_length=100)
Te_word_list, Te_d1_list, Te_d2_list = get_wordList_and_distances_Corpus(
    Te_sent_contents, Te_entity1_list, Te_entity2_list
)

Input File Reading
Input File Reading
Input File Reading


In [8]:
# Report how many sentences each split contains.
for _split_label, _split_sentences in (
    ("train_size", Tr_word_list),
    ("val_size", V_word_list),
    ("test_size", Te_word_list),
):
    print(_split_label, len(_split_sentences))

train_size 3997
val_size 2398
test_size 1597


In [9]:
# Sentence lengths per split, in the same order as the input list.
_split_lengths = findSentLengths([Tr_word_list, V_word_list, Te_word_list])
train_sent_lengths, val_sent_lengths, test_sent_lengths = _split_lengths

# Longest sentence over all three splits.
# Assumes `+` concatenates here (i.e. findSentLengths returns lists) — TODO confirm.
# NOTE(review): sentMax is reassigned from sentMax_snp before padding further down.
_all_lengths = train_sent_lengths + val_sent_lengths + test_sent_lengths
sentMax = max(_all_lengths)


In [10]:
# Class name -> column index of the one-hot label matrices built below.
label_dict = dict(Negative=0, Positive=1)

### Load SNP dataset to synchronize the dictionary IDs with our generated Corpus

In [11]:
# The SNP pickle holds 16 objects written back-to-back; they are read in that
# exact order and bound to the names below.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('../data/pickles/train_and_test_data_sentences_snp_2class.pickle', 'rb') as handle:
    (
        # *_train_snp objects
        W_train_snp,
        d1_train_snp,
        d2_train_snp,
        Y_train_snp,
        Tr_word_list_snp,
        # *_test_snp objects
        W_test_snp,
        d1_test_snp,
        d2_test_snp,
        Y_test_snp,
        Te_word_list_snp,
        # vectors, dictionaries and maximum sentence length
        word_vectors_snp,
        word_dict_snp,
        d1_dict_snp,
        d2_dict_snp,
        label_dict_snp,
        sentMax_snp,
    ) = [pickle.load(handle) for _ in range(16)]

In [12]:
# Flatten the five corpora (our train/val/test plus the SNP train/test word lists)
# into one list of sentences so that a single vocabulary covers both datasets.
# A comprehension is linear in the total number of sentences, whereas
# sum(list_of_lists, []) re-copies the growing accumulator on every addition.
sent_list = [
    sentence
    for corpus in (Tr_word_list, V_word_list, Te_word_list, Tr_word_list_snp, Te_word_list_snp)
    for sentence in corpus
]

# Build the vocabulary and the word <-> id lookups from the combined sentences.
word_dict, word_to_id, id_to_word = word_mapping(sent_list)

Found 17340 unique words (243071 in total)


# Generate Word Embedding Vectors

In [14]:
print("word dictonary length", len(word_dict))

# Word Embedding: look up the pretrained vectors in `wefile` for the vocabulary.
wv = readWordEmb(
    word_dict,
    id_to_word,
    word_to_id,
    wefile,
    embSize,
)


word dictonary length 17340
Reading word vectors
Loaded 4087446 pretrained embeddings.
number of unknown word in word embedding 2875


In [16]:
# Override the sentMax_snp value loaded from the SNP pickle with a fixed length of 100.
# NOTE(review): presumably chosen to match max_length=100 passed to dataRead — confirm.
sentMax_snp=100

In [17]:
# Write the shared vocabulary artefacts back-to-back into one pickle file.
# Readers must call pickle.load in this same order:
# id_to_word, word_to_id, sentMax_snp, wv, word_dict.
with open('data/integrated_entities.pickle', 'wb') as handle:
    for _artifact in (id_to_word, word_to_id, sentMax_snp, wv, word_dict):
        pickle.dump(_artifact, handle)
    

In [18]:
# Mapping Train
# Convert the training word lists to ids using the shared word_to_id lookup
# (the exact output format is defined by mapWordToId, which is not visible here).
W_train =   mapWordToId(Tr_word_list, word_to_id)

## Prepare Label Matrix

In [19]:
# One-hot encode the training labels: row i gets a 1.0 in the column given by Y_t[i].
Y_t = mapLabelToId(Tr_sent_lables, label_dict)
Y_train = np.zeros((len(Y_t), len(label_dict)))
# Vectorised form of "for each row i: Y_train[i][Y_t[i]] = 1.0".
Y_train[np.arange(len(Y_t)), Y_t] = 1.0

In [20]:
# Mapping Validation: words -> ids, labels -> one-hot rows.
W_val = mapWordToId(V_word_list, word_to_id)

Y_t = mapLabelToId(V_sent_lables, label_dict)
Y_val = np.zeros((len(Y_t), len(label_dict)))
# Vectorised form of "for each row i: Y_val[i][Y_t[i]] = 1.0".
Y_val[np.arange(len(Y_t)), Y_t] = 1.0

In [21]:
# Mapping Test: words -> ids, labels -> one-hot rows.
W_test = mapWordToId(Te_word_list, word_to_id)

Y_t = mapLabelToId(Te_sent_lables, label_dict)
Y_test = np.zeros((len(Y_t), len(label_dict)))
# Vectorised form of "for each row i: Y_test[i][Y_t[i]] = 1.0".
Y_test[np.arange(len(Y_t)), Y_t] = 1.0

### Pad Embedding Vectors

In [22]:
# Use the SNP sentence length (set to 100 above) as the padding length. This replaces
# the sentMax computed earlier from this corpus' own sentence lengths.
sentMax=sentMax_snp

In [23]:
# Padding: all three id matrices are passed through paddData with the common sentMax.
# NOTE(review): W_train / W_val / W_test are rebound in place, so re-running this cell
# feeds already-processed data back into paddData.
W_train, W_val, W_test = paddData(
    [W_train, W_val, W_test],
    sentMax,
)

for _split_label, _split_matrix in (("train", W_train), ("test", W_test)):
    print(_split_label, len(_split_matrix))

train 3997
test 1597


In [88]:
# Sanity check of the padded training matrix shape.
# NOTE(review): this cell carries an out-of-order execution count (In [88]) and its
# recorded output (3997, 91) does not match sentMax = 100 set above — the output looks
# stale; re-run after Restart & Run All to confirm.
W_train.shape

(3997, 91)

# Save Integrated data as a Pickle file to be used for training the Base Model 

In [24]:
# Objects are written back-to-back into one pickle file; a reader must call
# pickle.load once per object in exactly this order.
_corpus_artifacts = (
    # train
    W_train, Y_train, Tr_word_list,
    # validation
    W_val, Y_val, V_word_list,
    # test
    W_test, Y_test, Te_word_list,
    # embeddings and vocabulary
    wv, word_dict,
    # label mapping and padding length
    label_dict, sentMax,
)

with open('data/my_corpus_integrated.pickle', 'wb') as handle:
    for _artifact in _corpus_artifacts:
        pickle.dump(_artifact, handle)
    
    
    