In [None]:
from gensim.models import word2vec
from pycorenlp import StanfordCoreNLP
import tensorflow as tf
import numpy as np
import gensim
import os
import pprint

# Shared pretty-printer used to inspect nested CoreNLP responses.
pp = pprint.PrettyPrinter(indent=4)

# ---- parameters -------------------------------------------------------
# CoreNLP server endpoint
nlpserver = "http://localhost:9000"

# word2vec settings: embedding width, training corpus and model file
vector_size = 100
corpus_file = "enwiki-20150112-400-r100-10576.txt"
word2vec_model_file = "word2vec.mod"

# when debug is true, you need to use a vector_size=5 word2vec model
debug = True

# optimiser settings: learning rate and L2 regularisation weight
lr = 0.01
lambda_2 = 0.2
# TODO: batch_size is not configured yet

In [None]:
# Client for the CoreNLP server; every annotate() call in this notebook
# requests the "parse" annotator and JSON output.
properties = {'annotators':'parse','outputFormat':'json'}
nlp = StanfordCoreNLP(nlpserver)

# Smoke test: requires the CoreNLP server to be reachable at `nlpserver`.
print(nlp.annotate("This is a pen which I bought", properties))

In [None]:
#annotate a two-sentence sample and pretty-print the raw CoreNLP response
sentence="This is a pen. Sky is bleu"
pp.pprint(nlp.annotate(sentence,properties))

In [None]:
#load(or create) word2vec model
def load_word2vec_model(model_file):
    """Return a word2vec model, loading it from disk when possible.

    If `model_file` exists it is read as a text-format (binary=False)
    word2vec file, ignoring undecodable bytes.  Otherwise a new model is
    trained on the notebook-level `corpus_file` with `vector_size`
    dimensions.  A freshly trained model is NOT written to disk.

    model_file -- path of the word2vec-format file to load
    returns    -- the loaded or newly trained model
    """
    if os.path.exists(model_file):
        # The original referenced an undefined name `wv` here, which raised
        # NameError as soon as a model file was present.
        model=word2vec.Word2Vec.load_word2vec_format(model_file,binary=False,unicode_errors='ignore')
    else:
        data=word2vec.Text8Corpus(corpus_file)
        model=word2vec.Word2Vec(data,size=vector_size)
        # NOTE(review): saving is disabled.  If it is re-enabled, the file
        # must be written in word2vec text format to match the loader
        # above -- confirm which save call produces that format.
        #model.save(model_file)

    return model

#test word2vec
model=load_word2vec_model(word2vec_model_file)
out=model.most_similar(positive=["apple"])
# The loop variables are deliberately NOT called `x`: at cell level a loop
# variable is a notebook global, and `x` is the name of the TensorFlow input
# placeholder.  Re-running this cell after the graph cell would otherwise
# silently replace the placeholder with a (word, score) tuple.
for similar_word,similarity in out:
    print("%s %s" % (similar_word,similarity))


In [None]:
"""
CoreNLP "parse" annotator JSON format
...
u'basic-dependencies':
    {  u'dep': u'ROOT', 
    u'dependent': 4, #term ID
    u'dependentGloss': u'pen', #term text
    u'governor': 0, #term ID of the parent term (0 for the ROOT entry)
    u'governorGloss': u'ROOT'},
...

"""
class TermNode:
    """One term of a dependency parse together with its dependent terms.

    A node is built from a single "basic-dependencies" entry.  `term` is the
    entry's "dependentGloss", `param` keeps the whole entry, `childs` holds
    the nodes attached through add_child(), and `rnn_result` is the feature
    vector stored by get_training_data() (None until that method has run).
    """
    def __init__(self,param):
        self.term=param["dependentGloss"]
        self.param=param
        self.childs=[]
        self.rnn_result=None

    def add_child(self,child):
        """Attach `child` as a dependent of this node."""
        self.childs.append(child)

    def find_id(self,node_id):
        """Return the node in this subtree whose "dependent" id equals
        `node_id` (depth-first search), or None when there is none."""
        if self.param["dependent"]==node_id:
            return self
        for child in self.childs:
            found=child.find_id(node_id)
            if found is not None:
                return found
        return None

    def _initial_vector(self,word2vec_model):
        """Return the word vector of this term, or an all-zero vector of the
        same width when the term is not in the model's vocabulary."""
        try:
            return word2vec_model[self.term]
        except KeyError:
            # Prefer the width the model reports; only fall back to probing
            # a known word when the model has no `vector_size` attribute.
            # Probing alone fails with KeyError if "apple" is not in the
            # vocabulary.
            size=getattr(word2vec_model,"vector_size",None)
            if size is None:
                size=len(word2vec_model["apple"])
            return [0 for _ in range(size)]

    def get_training_data(self,session,rnn_op,word2vec_model,input_placeholder=None):
        """Collect auto-encoder input rows for this subtree, bottom-up.

        Children are processed first.  This node then starts from its own
        word vector and, for each child in order, builds the row
        [current vector | child.rnn_result], records it, and replaces the
        current vector with the first output row of running `rnn_op` on it.
        The final vector is stored in `self.rnn_result`.

        session           -- object whose run(op, feed_dict=...) evaluates rnn_op
        rnn_op            -- operation that maps a row to the next vector
        word2vec_model    -- mapping from term to vector (KeyError if unknown)
        input_placeholder -- feed_dict key for the row; when omitted the
                             notebook-level placeholder `x` is used
        returns           -- list of rows (children's rows first); empty for
                             a leaf node
        """
        training_data=[]
        #process child node (a leaf contributes an empty list)
        for child in self.childs:
            training_data.extend(
                child.get_training_data(session,rnn_op,word2vec_model,input_placeholder))

        # Only nodes with children run the operation, so the global
        # placeholder is looked up lazily.
        if self.childs and input_placeholder is None:
            input_placeholder=x

        rnn_result=self._initial_vector(word2vec_model)
        for child in self.childs:
            #concatenate the former iteration result and the child's vector
            concatenated=list(rnn_result)+list(child.rnn_result)
            training_data.append(concatenated)

            #calculate next RNN output and use it as the next input
            output=session.run(rnn_op,feed_dict={input_placeholder:np.array([concatenated])})
            rnn_result=list(output[0])

        #memorize final rnn output as feature for this node
        self.rnn_result=rnn_result

        return training_data

def check_dependency_format(basic_dependency):
    """Tell whether a dependency entry has every field the tree builder reads.

    Returns True when "dependent", "governor" and "dependentGloss" are all
    keys of `basic_dependency`, False otherwise.
    """
    required_fields=("dependent","governor","dependentGloss")
    return all(field in basic_dependency.keys() for field in required_fields)
    
#arg : basic-dependencies result of CoreNLP for a sentence
#return : term tree structure
def get_term_tree(basic_dependencies):
    """Build a TermNode tree from one sentence's dependency entries.

    The root is the first entry whose "governor" is not the "dependent" id
    of any entry (the ROOT entry of the sample output has governor 0); if
    there is no such entry the first entry is used.  Every other entry is
    attached to the node whose id equals its "governor", in input order.

    basic_dependencies -- list of dependency dicts
    returns            -- the root TermNode, or None when the list is empty,
                          an entry lacks a required field, or a non-root
                          entry refers to a governor that does not exist
    """
    # An empty parse has no root; previously this raised IndexError.
    if not basic_dependencies:
        return None

    #before processing checking result format
    for basic_dependency in basic_dependencies:
        if not check_dependency_format(basic_dependency):
            return None

    #construct all node
    nodes=[TermNode(basic_dependency) for basic_dependency in basic_dependencies]
    node_dict={}
    for basic_dependency,node in zip(basic_dependencies,nodes):
        node_dict[basic_dependency["dependent"]]=node

    # Locate the root instead of assuming it is listed first.
    root_index=0
    for index,basic_dependency in enumerate(basic_dependencies):
        if basic_dependency["governor"] not in node_dict:
            root_index=index
            break

    #make node into tree
    for index,basic_dependency in enumerate(basic_dependencies):
        if index==root_index:
            continue
        parent_node=node_dict.get(basic_dependency["governor"])
        if parent_node is None:
            # dangling governor: treat the parse as malformed
            return None
        parent_node.add_child(node_dict[basic_dependency["dependent"]])

    return nodes[root_index]

In [None]:
#def build_auto_encoder(vector_size,lr,lambda_2):
# Single-hidden-layer auto-encoder over rows of width vector_size*2.
# It compresses a row to `hidden_size` values (hidden_y) and reconstructs it (y).
x=tf.placeholder(tf.float32,[None,vector_size*2])
hidden_size=vector_size

#input layer
# W_in : (vector_size*2) x hidden_size
# Weights start from small random values.  With all-zero matrices every
# hidden unit receives the same gradient, so the units would stay identical.
W_in=tf.Variable(tf.truncated_normal([vector_size*2,hidden_size],stddev=0.1))
b_in=tf.Variable(tf.zeros([hidden_size]))

hidden_y=tf.nn.sigmoid(tf.matmul(x,W_in)+b_in)

#output layer
# W_out : hidden_size x (vector_size*2)
W_out=tf.Variable(tf.truncated_normal([hidden_size,vector_size*2],stddev=0.1))
b_out=tf.Variable(tf.zeros([vector_size*2]))

#predictions
# NOTE(review): the sigmoid limits y to (0,1); confirm the training rows
# fall in that range, otherwise they cannot be reconstructed exactly.
y=tf.nn.sigmoid(tf.matmul(hidden_y,W_out)+b_out)

#true label
y_=tf.placeholder(tf.float32,[None,vector_size*2])

#learning process
# Squared error summed per row, then averaged over the batch.  Summing over
# every axis first made the outer mean a no-op and let the cost grow with
# the number of rows in the batch.
cost = tf.reduce_mean(tf.reduce_sum(tf.pow(y-y_,2),reduction_indices=[1]))
L2_sqr=tf.nn.l2_loss(W_in)+tf.nn.l2_loss(W_out)
loss=cost+lambda_2*L2_sqr
train_step = tf.train.GradientDescentOptimizer(lr).minimize(loss)

#return train_step,loss,hidden_y



In [None]:

def get_mini_batch(session,hidden_y,word2vec_model,corpus_itr,batch_sentence=2):
    """Turn the next corpus lines into a batch of auto-encoder input rows.

    Reads up to `batch_sentence` lines from `corpus_itr`, joins them (each
    right-stripped and followed by a space), parses the text with the
    notebook-level CoreNLP client, builds a term tree per parsed sentence
    and gathers the rows produced by TermNode.get_training_data().

    session        -- session passed through to get_training_data
    hidden_y       -- operation passed through as the RNN op
    word2vec_model -- term-to-vector model passed through
    corpus_itr     -- iterator yielding corpus lines
    batch_sentence -- number of corpus lines per batch
    returns        -- numpy array of rows, or None when the corpus is
                      exhausted, CoreNLP fails (UnicodeDecodeError or a
                      non-dict response), or any sentence yields no tree
    """
    lines=[]
    for _ in range(batch_sentence):
        try:
            # next() works on Python 2 and 3 iterators alike
            lines.append(next(corpus_itr).rstrip()+" ")
        except StopIteration:
            # corpus ran out: use whatever was read so far
            break
    if not lines:
        return None
    text="".join(lines)

    try:
        annotated=nlp.annotate(text,properties)

    #error handling of core nlp
    except UnicodeDecodeError:
        return None
    if not isinstance(annotated,dict):
        return None

    x_input=[]
    for parsed_sentence in annotated["sentences"]:
        root_node=get_term_tree(parsed_sentence["basic-dependencies"])
        if root_node is None:
            return None
        x_input.extend(root_node.get_training_data(session,hidden_y,word2vec_model))
    return np.array(x_input)


#train_step,loss,hiddeny=build_auto_encoder(vector_size,lr,lambda_2)
init = tf.initialize_all_variables()
# The session is left open on purpose: later cells reuse `sess`.
sess=tf.Session()
sess.run(init)

#train auto encoder
#here, input x and output y_ are same
max_iterations=10000  # upper bound on mini-batches drawn from the corpus
report_every=100      # print progress every this many iterations
skipped=0             # batches dropped because no training rows were produced
with open(corpus_file,"r") as corpus_itr:
    for i in range(max_iterations):
        try:
            x_input=get_mini_batch(sess,hidden_y,model,corpus_itr)
        except StopIteration:
            # corpus exhausted before max_iterations: stop instead of crashing
            break
        if x_input is None or len(x_input)==0:
            skipped+=1
            continue

        result=sess.run([train_step,cost,loss],feed_dict={x:x_input,y_:x_input})
        if i%report_every==0:
            print("%s %s" % (result,skipped))
    


In [None]:
#test
# Build a single mini-batch for inspection when debugging is enabled.
# The original tested an undefined name (`debut`) and called get_mini_batch
# without its word2vec model and corpus iterator arguments.
if debug:
    with open(corpus_file,"r") as debug_corpus:
        x_input=get_mini_batch(sess,hidden_y,model,debug_corpus)

In [None]:
import os
# sanity check: is the corpus file present in the working directory?
print(os.path.exists(corpus_file))

In [None]:
# sanity check: number of lines in the corpus file
with open(corpus_file, "r") as corpus:
    print(len(corpus.readlines()))

In [None]:
# scratch check: an empty dict literal is an instance of dict
dic = {}
print(isinstance(dic, dict))