In [None]:
from nltk.corpus import wordnet as wn
from gensim.models import word2vec
from pycorenlp import StanfordCoreNLP
import tensorflow as tf
import os

# --- notebook configuration -------------------------------------------------
# Plain-text corpus used to train word2vec when no saved model exists.
corpus_file = "data/enwiki-20150112-400-r10-105752.txt"
# Where the trained word2vec model is saved to / loaded from.
word2vec_model_file = "word2vec_v2.mod"
# Dimensionality of the word vectors (passed to Word2Vec as `size`).
vector_size = 200
# URL handed to the StanfordCoreNLP client.
nlpserver = "http://localhost:9000"

In [None]:
# Smoke test for the WordNet corpus: list the synsets of "apple" and show the
# definition of the first synset of "word".
# (Parenthesised single-argument print: same output under Python 2.)
print(wn.synsets("apple"))
print(wn.synsets("word")[0].definition())

In [None]:
#word2vec learning
def load_word2vec_model(model_file):
    """Return a word2vec model, training and saving one if none is stored yet.

    model_file : path of the saved model. When the path exists the model is
                 loaded from it; otherwise a model is trained on the
                 module-level `corpus_file` with `size=vector_size`, saved to
                 `model_file`, and returned.
    """
    if not os.path.exists(model_file):
        # no saved model yet: train from the corpus and persist the result
        sentences=word2vec.Text8Corpus(corpus_file)
        trained=word2vec.Word2Vec(sentences,size=vector_size)
        trained.save(model_file)
        return trained

    return word2vec.Word2Vec.load(model_file)

#test: load (or train) the model and list the nearest neighbours of "apple"
model=load_word2vec_model(word2vec_model_file)
out=model.most_similar(positive=["apple"])
for word,score in out:
    # one "<word> <similarity>" pair per line
    print("%s %s" % (word,score))


In [None]:
# Client for the CoreNLP server configured in `nlpserver`.
nlp = StanfordCoreNLP(nlpserver)
# Request the "parse" annotator and JSON output.
properties = {
    'annotators': 'parse',
    'outputFormat': 'json',
}

#test CoreNLP server
print(nlp.annotate(
    "fruit with red or yellow or green skin and sweet to tart crisp whitish flesh",
    properties))

In [None]:
class TermNode:
    """One term of a dependency parse plus the terms that depend on it.

    A node is built from a single entry of the CoreNLP "parse" annotator
    JSON output, which has this format:

    ...
    u'basic-dependencies':
        {  u'dep': u'ROOT',
        u'dependent': 4, #term ID
        u'dependentGloss': u'pen', #term text
        u'governor': 0, #term ID of the parent (governor)
        u'governorGloss': u'ROOT'},
    ...

    Attributes:
        term       -- the term text (``param["dependentGloss"]``)
        param      -- the raw dependency entry this node was built from
        childs     -- list of child TermNode objects
        rnn_result -- set by get_training_data(): output for this subtree
                      after dropout
        no_droped  -- set by get_training_data(): the same output before
                      the final dropout
    """
    def __init__(self,param):
        self.term=param["dependentGloss"]
        self.param=param
        self.childs=[]

    def add_child(self,child):
        """Append `child` (a TermNode) to this node's children."""
        self.childs.append(child)

    def find_id(self,node_id):
        """Depth-first search for the node whose term ID equals `node_id`.

        Returns the matching TermNode, or None when this subtree has none.
        """
        if self.param["dependent"]==node_id:
            return self
        for child in self.childs:
            result=child.find_id(node_id)
            if result is not None:
                return result
        return None

    def _lookup_vector(self,word2vec_model):
        """Return the embedding of this term, or a float zero vector if unknown."""
        try:
            return word2vec_model[self.term]
        except KeyError:
            # Prefer the model's declared dimensionality; fall back to probing
            # a known word only when the model does not expose it.
            size=getattr(word2vec_model,"vector_size",None)
            if size is None:
                size=len(word2vec_model["apple"])
            # floats, not ints: the constant must share a dtype with real
            # embeddings so that it can be concatenated with them
            return [0.0]*size

    def get_training_data(self,weights,biases,keep_prob,word2vec_model):
        """Build the network output for this subtree, bottom-up.

        Children are processed first. This node's word vector is then folded
        together with each child's output through process_NN(), one child at
        a time. Nothing is returned; the results are stored on the node as
        `self.rnn_result` (after dropout) and `self.no_droped` (before the
        last dropout). A node without children stores its own word vector
        in both attributes.
        """
        #process child node
        for child in self.childs:
            child.get_training_data(weights,biases,keep_prob,word2vec_model)

        #calculate RNN output on this node
        #calculate RNN output and use it for next input data
        rnn_result=tf.constant(self._lookup_vector(word2vec_model),dtype=tf.float32)
        no_droped=rnn_result
        for child in self.childs:
            #concatenate the child's output with the running result and use it as the next input
            #NOTE(review): (dim, values) is the pre-1.0 TensorFlow argument order for concat
            x=tf.concat(0,[child.rnn_result,rnn_result])
            rnn_result,no_droped=process_NN(weights,biases,keep_prob,x)

        #memorize final rnn output as feature for this node
        self.rnn_result=rnn_result
        self.no_droped=no_droped
        

def check_dependency_format(basic_dependency):
    """Tell whether a dependency entry carries every field the tree builder reads.

    Returns True when "dependent", "governor" and "dependentGloss" are all
    keys of `basic_dependency`, otherwise False.
    """
    present=basic_dependency.keys()
    return all(field in present for field in ("dependent","governor","dependentGloss"))
    
#arg : basic-dependencies result of CoreNLP for a sentence
#return : term tree structure
def get_term_tree(basic_dependencies):
    """Turn a sentence's basic-dependencies list into a tree of TermNode.

    Returns the root TermNode, or None when the input cannot be turned into
    a tree: the list is empty/None, an entry lacks a required field (see
    check_dependency_format), or an entry names a governor that is not in
    the list.

    The root is the first entry whose governor is 0; if no entry has
    governor 0, the first entry of the list is used as the root.
    """
    if not basic_dependencies:
        return None

    #before processing checking result format
    for basic_dependency in basic_dependencies:
        if not check_dependency_format(basic_dependency):
            return None

    #construct all node, keyed by term ID
    node_dict={}
    for basic_dependency in basic_dependencies:
        node_dict[basic_dependency["dependent"]]=TermNode(basic_dependency)

    #locate the root entry without relying on its position in the list
    root_dependency=basic_dependencies[0]
    for basic_dependency in basic_dependencies:
        if basic_dependency["governor"]==0:
            root_dependency=basic_dependency
            break
    root_node=node_dict[root_dependency["dependent"]]

    #make node into tree
    for basic_dependency in basic_dependencies:
        if basic_dependency is root_dependency:
            continue
        parent_node=node_dict.get(basic_dependency["governor"])
        if parent_node is None:
            #governor refers to a term that is not in this sentence
            return None
        parent_node.add_child(node_dict[basic_dependency["dependent"]])

    return root_node

# x: vector size*2
#output : vector size
def process_NN(weights,biases,keep_prob,x):
    """Feed `x` through the sigmoid layers described by `weights`/`biases`.

    weights, biases : parallel lists, one entry per layer; each weight is
                      applied as ``input . w`` so it is expected to be
                      shaped [input_size, output_size], which is how
                      define_NN() shapes them
    keep_prob       : dropout keep probability applied after every layer
    x               : input tensor; it is flattened to a single row

    Returns (y_drop, y) as 1-D tensors: the last layer's output after and
    before dropout. With no layers both are the flattened input.
    """
    # tf.matmul needs rank-2 operands, so treat the input as one row
    y_drop=tf.reshape(x,[1,-1])
    y=y_drop
    for w,b in zip(weights,biases):
        y=tf.nn.sigmoid(tf.matmul(y_drop,w)+b)
        y_drop=tf.nn.dropout(y,keep_prob)

    # back to 1-D so the caller can concatenate the result with word vectors
    return tf.reshape(y_drop,[-1]),tf.reshape(y,[-1])
    
    

In [None]:
# Leftover stub of a wrapper signature: build_auto_encoder(vector_size,lr,lambda_2)

# Dropout keep probability; a placeholder, so the value is supplied through
# feed_dict when the graph is run.
keep_prob = tf.placeholder("float")

def define_NN(input_size,layer_defs):
    """Create the variables of a fully connected network.

    input_size : width of the network input
    layer_defs : list with the width of each successive layer

    Returns (weights, biases, L2_sqr): the per-layer weight variables
    (shaped [previous width, layer width]), the per-layer bias variables,
    and the sum of tf.nn.l2_loss over all weights (the int 0 when
    `layer_defs` is empty).
    """
    weights=[]
    biases=[]
    L2_sqr=0
    prev_layer_size=input_size
    for layer_size in layer_defs:
        # Small random weights: with an all-zero start every unit of a layer
        # computes the same value, so the units cannot learn different features.
        w_h=tf.Variable(tf.truncated_normal([prev_layer_size,layer_size],stddev=0.1))
        b_h=tf.Variable(tf.zeros([layer_size]))
        L2_sqr=L2_sqr+tf.nn.l2_loss(w_h)

        weights.append(w_h)
        biases.append(b_h)
        prev_layer_size=layer_size

    return weights,biases,L2_sqr

# Input is two concatenated word vectors. The output must again be one word
# vector wide, because it is concatenated with the next vector and finally
# compared with the target word vector, so the layer widths follow vector_size.
weights,biases,L2_sqr=define_NN(vector_size*2,[vector_size,vector_size])

In [None]:
#hyper-parameters used by the loss and the optimizer below
#NOTE(review): untuned starting values -- adjust as needed
lr=0.01          #gradient descent learning rate
lambda_2=0.0001  #weight of the L2 regularization term

init = tf.initialize_all_variables()
sess=tf.Session()
sess.run(init)

terms=["apple"]

for term in terms:
    #skip terms without a word vector: there is no target to learn towards
    try:
        termVec=model[term]
    except KeyError:
        continue

    synsets=wn.synsets(term)
    if len(synsets)==0:
        continue

    #use first synset definition
    definition=str(synsets[0].definition())
    print(definition)
    try:
        annotated=nlp.annotate(definition,properties)

    #error handling of core nlp
    except UnicodeDecodeError:
        continue
    if not isinstance(annotated,dict):
        continue

    #use only first sentence
    sentences=annotated.get("sentences")
    if not sentences:
        continue
    basic_dependencies=sentences[0].get("basic-dependencies")
    if not basic_dependencies:
        continue
    root_node=get_term_tree(basic_dependencies)
    if root_node is None:
        continue

    #making feed back phase for RNN
    root_node.get_training_data(weights,biases,keep_prob,model)
    #compare the output before the last dropout (TermNode.no_droped) with the term's own vector
    rnn_result=root_node.no_droped
    true_label=termVec
    cost=tf.reduce_mean(tf.reduce_sum(tf.pow(rnn_result-true_label,2)))
    loss=cost+lambda_2*L2_sqr
    train_step = tf.train.GradientDescentOptimizer(lr).minimize(loss)

    #training
    sess.run([train_step,],feed_dict={keep_prob:0.5})