In [12]:
import numpy as np
import re
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

In [13]:
train_file = open('/Users/zhang/MscProject_tweak2vec/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt','r')
dev_file = open('/Users/zhang/MscProject_tweak2vec/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt','r')
test_file = open('/Users/zhang/MscProject_tweak2vec/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt','r')

In [14]:
def file_to_docs(filename):
    file = filename
    lines = file.readlines()
    start = 0
    end = 0
    docs = []
    for i in range(len(lines)):
        if lines[i] == '\n':
            end = i
            docs.append(lines[start:end])
            start = i+1
    return docs

def docs_to_xy(docs, add_unk = False, vocab_list = []):
    stop_words = set(stopwords.words('english'))
    punctuation = '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
    docs_x = []
    docs_y = []
    for doc in docs:
        doc_x = []
        doc_y = []
        for sentence in doc:
            sentence = re.sub(r"\n", " ", sentence)
            sentence = re.sub(r"\t", " ", sentence)
            #sentence = re.sub(r"@", " @ ", sentence)
            sentence = sentence.split()
            sentence = [c for c in sentence if c not in punctuation]
            sentence = [c for c in sentence if c not in stop_words]
            sentence = [c.lower() for c in sentence]
            if sentence[0][:3]!='###':
                if add_unk:
                    sentence = ['UNK' if c not in vocab_list else c for c in sentence]
                if len(sentence)>1:
                    doc_y.append(sentence[0])
                    doc_x.append(' '.join(sentence[1:]))
        docs_x.append(doc_x)
        docs_y.append(doc_y)
    return docs_x, docs_y 

In [15]:
train_docs = file_to_docs(train_file)
dev_docs = file_to_docs(dev_file)
test_docs = file_to_docs(test_file)

train_x, train_y = docs_to_xy(train_docs)
dev_x, dev_y = docs_to_xy(dev_docs)
test_x, test_y = docs_to_xy(test_docs)

In [8]:
corpus = np.concatenate([train_x,dev_x],axis=0).tolist()
b = str(corpus)
b = b.replace('[','')
b = b.replace(']','')
corpus = list(eval(b))

### create vocab

In [16]:
vocab = Counter()
for i in corpus:
    tokens = i.split()
    vocab.update(tokens)

In [17]:
vocab_list = [k for k,c in vocab.most_common() if c >= 5]
vocab_freq_list = [[k,c] for k,c in vocab.most_common() if c >= 5]
print(len(vocab_list))
vocab_list.append('UNK')
word2int = {word: ii for ii, word in enumerate(vocab_list)}
lable2int = {'background':0,
             'objective':1,
             'methods':2,
             'results':3,
             'conclusions':4}
print(len(vocab_list))
np.save('pubmed_vocab5.npy',np.array(vocab_list))
np.save('pubmed_vocab5_frq.npy',np.array(vocab_freq_list))

27187
27188


### corpus with unk

In [60]:
train_x_unk, train_y_unk = docs_to_xy(train_docs,True,vocab_list)
print('train')
dev_x_unk, dev_y_unk = docs_to_xy(dev_docs,True,vocab_list)
print('dev')
test_x_unk, test_y_unk = docs_to_xy(test_docs,True,vocab_list)
print('test')

train
dev
test


In [44]:
corpus_unk = np.concatenate([train_x_unk,dev_x_unk],axis=0).tolist()
b = str(corpus_unk)
b = b.replace('[','')
b = b.replace(']','')
corpus_unk = list(eval(b))

### corpus of int

In [45]:
corpus_int = []
for line in corpus_unk:
    corpus_int.append([word2int[w] for w in line.split()])

In [46]:
np.save('pubmed_corpus_int5.npy',np.array(corpus_int))

In [47]:
def x_to_int(docs, word2int):
    ints_x = []
    for doc in docs:
        int_x = []
        for sentence in doc:
            int_x.append([word2int[word] for word in sentence.split()])
        ints_x.append(int_x)   
    return ints_x

def y_to_int(docs, word2int):
    ints_y = []
    for doc in docs:
        int_y = []
        for lable in doc:
            int_y.append(word2int[lable])
        ints_y.append(int_y)   
    return ints_y

In [48]:
train_x_int = x_to_int(train_x_unk, word2int)
train_y_int = y_to_int(train_y_unk, lable2int)

In [49]:
dev_x_int = x_to_int(dev_x_unk, word2int)
dev_y_int = y_to_int(dev_y_unk, lable2int)

In [50]:
test_x_int = x_to_int(test_x_unk, word2int)
test_y_int = y_to_int(test_y_unk, lable2int)

In [51]:
np.save('pubmed_train_x',np.array(train_x_int))
np.save('pubmed_train_y',np.array(train_y_int))
np.save('pubmed_dev_x',np.array(dev_x_int))
np.save('pubmed_dev_y',np.array(dev_y_int))
np.save('pubmed_test_x',np.array(test_x_int))
np.save('pubmed_test_y',np.array(test_y_int))

### get google word2vec

In [18]:
import gensim.models
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

In [19]:
google_model_full = gensim.models.KeyedVectors.load_word2vec_format(
    '/Users/zhang/MscProject_tweak2vec/GoogleNews-vectors-negative300.bin',binary=True)

In [20]:
google_wordVec = []
for word in vocab_list:
    if word in google_model_full:
        google_wordVec.append(google_model_full[word])
    if word not in google_model_full:
        google_wordVec.append(google_model_full['UNK'])

In [21]:
pca = PCA(n_components=50)
google_wordVec_50 = pca.fit_transform(google_wordVec)
google_wordVec_50.shape

(27188, 50)

In [22]:
np.save('w2v_google_50d.npy',np.array(google_wordVec_50))

### pivots selection

In [77]:
google_model_40k = gensim.models.KeyedVectors.load_word2vec_format(
    '/Users/zhang/MscProject_tweak2vec/GoogleNews-vectors-negative300.bin',binary=True, limit=40000)

In [83]:
pivots = []
for word in word2int.keys():
    if word in google_model_40k:
        pivots.append(word)
len(pivots)

8327

In [84]:
remove_at_pivots = pivots[1:]

In [99]:
pivots_vec = {}
pivots_size = 5000
for i in range(pivots_size):
    pivots_vec[ word2int[remove_at_pivots[i]] ] = google_wordVec_50[word2int[remove_at_pivots[i]]].tolist()

In [101]:
f = open('pubmed_pivots_google_500.txt','w')  
f.write(str(pivots_vec))
f.close()