In [1]:
from nltk.corpus import wordnet as wn
from gensim.models import word2vec
from pycorenlp import StanfordCoreNLP
import tensorflow as tf
import os
import random
import gc

# ---- Notebook configuration -------------------------------------------------
# Text corpus read through word2vec.Text8Corpus when no saved word2vec model exists.
corpus_file="data/enwiki-20150112-400-r10-105752.txt"
# File the training terms are read from, one entry per line.
term_file="data/4000-most-common-english-words-csv.csv"
# Path the word2vec model is loaded from, or saved to after it has been trained.
word2vec_model_file="word2vec_v2.mod"
# Word-vector dimensionality; also sizes the composition layer's weight and bias.
vector_size=200
# URL handed to the StanfordCoreNLP client.
nlpserver="http://localhost:9000"
# Prefix of the TensorFlow checkpoint path built by get_model_path().
model_directory="model/"
# Number of iterations of the training loop; each iteration draws one random term.
epoch=200
# Learning rate passed to GradientDescentOptimizer.
lr=0.1
# Coefficient applied to the L2 term (L2_sqr) in the training loss.
lambda_2=0.01

In [2]:
# Smoke test for the NLTK WordNet corpus: list the synsets of "apple" and show
# the definition of the first synset of "word".
# Written as single-argument print(...) calls: these produce the same output
# as the Python 2 print statement and are also valid Python 3.
print(wn.synsets("apple"))
print(wn.synsets("word")[0].definition())

[Synset('apple.n.01'), Synset('apple.n.02')]
a unit of language that native speakers can identify


In [3]:
#word2vec learning
def load_word2vec_model(model_file):
    """Return a word2vec model, training and caching it on first use.

    model_file: path of the saved model.

    If model_file exists it is loaded with Word2Vec.load. Otherwise a model
    is trained on the module-level corpus_file (read through Text8Corpus)
    with size=vector_size, saved to model_file, and returned.
    """
    if not os.path.exists(model_file):
        sentences=word2vec.Text8Corpus(corpus_file)
        trained=word2vec.Word2Vec(sentences,size=vector_size)
        trained.save(model_file)
        return trained

    return word2vec.Word2Vec.load(model_file)

#test
# Load (or train) the word2vec model and list the nearest neighbours of
# "apple". `model` is reused by the training loop further down.
model=load_word2vec_model(word2vec_model_file)
out=model.most_similar(positive=["apple"])
for x in out:
    # x is indexed as (word, similarity). A single pre-formatted argument keeps
    # the "word score" output of the Python 2 print statement while also being
    # valid Python 3.
    print("%s %s" % (x[0],x[1]))


apple, 0.877130806446
coconut 0.801503241062
edible 0.800157427788
almond 0.799016475677
olive 0.797267079353
bean 0.793999910355
butter 0.790080070496
onion 0.785118877888
mango 0.783805608749
fruit, 0.781246781349


In [4]:
# Client for the Stanford CoreNLP server at `nlpserver`. `properties` requests
# the 'parse' annotator with JSON output; both names are reused by the
# training loop.
nlp=StanfordCoreNLP(nlpserver)
properties={'annotators':'parse','outputFormat':'json'}

#test CoreNLP server
# Single-argument print(...) behaves like the Python 2 print statement here and
# is also valid Python 3.
print(nlp.annotate("fruit with red or yellow or green skin and sweet to tart crisp whitish flesh",properties))

{u'sentences': [{u'tokens': [{u'index': 1, u'word': u'fruit', u'after': u' ', u'pos': u'VBN', u'characterOffsetEnd': 5, u'characterOffsetBegin': 0, u'originalText': u'fruit', u'before': u''}, {u'index': 2, u'word': u'with', u'after': u' ', u'pos': u'IN', u'characterOffsetEnd': 10, u'characterOffsetBegin': 6, u'originalText': u'with', u'before': u' '}, {u'index': 3, u'word': u'red', u'after': u' ', u'pos': u'JJ', u'characterOffsetEnd': 14, u'characterOffsetBegin': 11, u'originalText': u'red', u'before': u' '}, {u'index': 4, u'word': u'or', u'after': u' ', u'pos': u'CC', u'characterOffsetEnd': 17, u'characterOffsetBegin': 15, u'originalText': u'or', u'before': u' '}, {u'index': 5, u'word': u'yellow', u'after': u' ', u'pos': u'JJ', u'characterOffsetEnd': 24, u'characterOffsetBegin': 18, u'originalText': u'yellow', u'before': u' '}, {u'index': 6, u'word': u'or', u'after': u' ', u'pos': u'CC', u'characterOffsetEnd': 27, u'characterOffsetBegin': 25, u'originalText': u'or', u'before': u' '}, 

In [5]:
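"""
CoreNLP "parse" annotator JSON format.
Each sentence carries a u'basic-dependencies' list; one entry looks like this:
...
u'basic-dependencies':
    {  u'dep': u'ROOT', 
    u'dependent': 4, #ID of this term
    u'dependentGloss': u'pen', #text of this term
    u'governor': 0, #ID of the parent term (0 for the sentence ROOT)
    u'governorGloss': u'ROOT'}, #text of the parent term
...

"""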
class TermNode:
    """One term of a dependency parse, linked to its dependents.

    Attributes:
        term   -- the term text (param["dependentGloss"])
        param  -- the raw dependency entry this node was built from
        childs -- child TermNode objects, in insertion order
    After get_training_data() has run, the node also carries:
        rnn_result -- composition output for this subtree, after dropout
        no_droped  -- the same output before dropout
    """
    def __init__(self,param):
        self.term=param["dependentGloss"]
        self.param=param
        self.childs=[]

    def add_child(self,child):
        """Append child to this node's list of children."""
        self.childs.append(child)

    def find_id(self,node_id):
        """Depth-first (pre-order) search of this subtree.

        Returns the first node whose param["dependent"] equals node_id,
        or None when there is no such node.
        """
        if self.param["dependent"]==node_id:
            return self

        for child in self.childs:
            match=child.find_id(node_id)
            if match is not None:
                return match
        return None

    def get_training_data(self,word2vec_model,keep_prob):
        """Build the TensorFlow ops that compose this subtree into one vector.

        word2vec_model -- mapping from term text to its word vector
        keep_prob      -- dropout keep probability forwarded to process_NN

        Children are processed first. This node then starts from its own word
        vector and folds in each child's rnn_result, one process_NN call per
        child. The results are stored on self.rnn_result and self.no_droped;
        nothing is returned.
        """
        #children first, so that child.rnn_result exists when it is consumed below
        for child in self.childs:
            child.get_training_data(word2vec_model,keep_prob)

        #initial state: this term's word vector, or zeros for an unknown term
        try:
            embedding=word2vec_model[self.term]
        except KeyError:
            #the vector of "apple" is used only to learn the vector length
            width=len(word2vec_model["apple"])
            embedding=[0.0]*width

        #wrap the vector in an outer list so the tensor has one row
        state=tf.convert_to_tensor([list(embedding)])
        undropped=state
        for child in self.childs:
            #input = child's result concatenated (axis 1) with the running state
            joined=tf.concat(1,[child.rnn_result,state])
            state,undropped=process_NN(joined,keep_prob)

        #keep the final outputs as this node's features
        self.rnn_result=state
        self.no_droped=undropped
        

def check_dependency_format(basic_dependency):
    """Return True when one dependency entry has every field the tree builder reads.

    The required keys are "dependent", "governor" and "dependentGloss";
    False is returned as soon as one of them is missing.
    """
    present=basic_dependency.keys()
    return all(field in present for field in ("dependent","governor","dependentGloss"))
    
#arg : basic-dependencies result of CoreNLP for a sentence
#return : term tree structure  
def get_term_tree(basic_dependencies):
    """Turn one sentence's basic-dependencies list into a tree of TermNode.

    basic_dependencies -- list of dependency entries (dicts with at least
        "dependent", "governor" and "dependentGloss").

    Returns the root TermNode, or None when the input cannot be turned into
    a tree: the list is empty, an entry lacks a required key, or a non-root
    entry names a governor that is not the "dependent" id of any entry.

    The root is the first entry whose governor is not the id of any entry
    (for CoreNLP output, the ROOT entry with governor 0). It no longer has to
    be the first element. If every governor is a known id, the first entry is
    used as the root, as before.
    """
    #an empty parse has no root to return
    if not basic_dependencies:
        return None

    #before processing checking result format
    for basic_dependency in basic_dependencies:
        if not check_dependency_format(basic_dependency):
            return None

    #locate the root: the first entry without a parent inside this sentence
    known_ids=set(entry["dependent"] for entry in basic_dependencies)
    root_index=0
    for index,entry in enumerate(basic_dependencies):
        if entry["governor"] not in known_ids:
            root_index=index
            break

    #construct all node (a later entry with the same id replaces an earlier one)
    nodes=[TermNode(entry) for entry in basic_dependencies]
    node_dict={}
    for entry,node in zip(basic_dependencies,nodes):
        node_dict[entry["dependent"]]=node

    #make node into tree
    for index,entry in enumerate(basic_dependencies):
        if index==root_index:
            continue
        parent_node=node_dict.get(entry["governor"])
        if parent_node is None:
            #dangling governor (e.g. a second root): report a malformed parse
            #to the caller instead of raising KeyError
            return None
        parent_node.add_child(node_dict[entry["dependent"]])

    return nodes[root_index]

# x: vector size*2
#output : vector size
def process_NN(x,keep_prob):
    """Apply the shared composition layer to x.

    x         -- input tensor (a child result concatenated with the running state)
    keep_prob -- keep probability passed to tf.nn.dropout

    The "weight" and "biase" variables are fetched from the existing
    "composition" variable scope (reuse=True), so they must have been created
    beforehand. Returns (output after dropout, output before dropout) of
    sigmoid(x.w + b).
    """
    with tf.variable_scope("composition",reuse=True):
        weight=tf.get_variable("weight")
        bias=tf.get_variable("biase")

    pre_activation=tf.matmul(x,weight)+bias
    activation=tf.nn.sigmoid(pre_activation)

    return tf.nn.dropout(activation,keep_prob),activation

def printt(x):
    """Debug helper: print the static shape of x as "<dims> <ndims>".

    x -- any object with get_shape() returning a shape that exposes
         .dims and .ndims (e.g. a TensorFlow tensor).
    """
    shape=x.get_shape()
    #one pre-formatted argument: same output as the Python 2 statement
    #`print dims,ndims`, and also valid Python 3
    print("%s %s" % (shape.dims,shape.ndims))
    

In [6]:
# Commented-out stub (graph construction currently happens at module level):
#def build_auto_encoder(vector_size,lr,lambda_2):

# Dropout keep probability, supplied through feed_dict when the graph is run
# (the training loop feeds 0.5).
keep_prob=tf.placeholder("float")

def define_NN(input_size,layer_defs):
    """Create the composition layer's variables and return its L2 term.

    input_size, layer_defs -- accepted but not used; the shapes come from the
        module-level vector_size instead.

    Creates, in variable scope "composition", a "weight" variable of shape
    [vector_size*2, vector_size] and a "biase" variable of shape vector_size,
    both with a random-normal initializer. Only the weight enters the L2 term;
    the term is also registered as the scalar summary "L2_loss".
    """
    with tf.variable_scope("composition"):
        w_h=tf.get_variable("weight",shape=[2*vector_size,vector_size],
                            initializer=tf.random_normal_initializer())
        #the bias is created here so process_NN can fetch it; it is not regularized
        tf.get_variable("biase",shape=vector_size,
                        initializer=tf.random_normal_initializer())

    #accumulate from 0 over the regularized variables (currently just the weight)
    penalty=0
    for regularized in (w_h,):
        penalty=penalty+tf.nn.l2_loss(regularized)
    tf.scalar_summary("L2_loss",penalty)

    return penalty

def variable_summarizer(var,name):
    """Register summary ops describing the tensor `var`.

    var  -- tensor to summarize
    name -- label appended to the 'mean/', 'stddev/', 'max/' and 'min/' summary
            tags and used directly as the histogram tag

    Adds scalar summaries for mean, standard deviation, max and min, plus a
    histogram of the values. Returns nothing.
    """
    # All ops are created under the 'summaries' name scope.
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.scalar_summary('mean/' + name, mean)
        # Standard deviation computed as sqrt(mean((var - mean)^2)); only the
        # stddev computation lives in the nested 'stddev' scope.
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.scalar_summary('stddev/' + name, stddev)
        tf.scalar_summary('max/' + name, tf.reduce_max(var))
        tf.scalar_summary('min/' + name, tf.reduce_min(var))
        tf.histogram_summary(name, var)
    

# Create the "composition" weight/bias variables once and keep the L2 term for
# the training loss. define_NN currently ignores both arguments and sizes the
# layer from vector_size.
L2_sqr=define_NN(vector_size*2,[200])

In [7]:
# Read term_file line by line, dropping trailing whitespace from each line,
# then keep only the entries at list positions 500..3499 as training terms.
with open(term_file,"r") as f:
    stripped_lines=[]
    for raw_line in f:
        stripped_lines.append(raw_line.rstrip())
terms=stripped_lines[500:3500]

In [8]:
def get_model_path():
    """Return the checkpoint path for the current hyper-parameters.

    The path is model_directory followed by a file name that encodes lr,
    lambda_2 and vector_size, so each setting gets its own checkpoint.
    """
    file_name="lr_%f-lambda2_%f-vector_size_%d" %(lr,lambda_2,vector_size)
    return model_directory+file_name

# Set new_model=True to start from freshly initialised weights; with False the
# weights are restored from the checkpoint at get_model_path().
new_model=False


sess=tf.Session()
try:
    # One Saver serves both the optional restore and the final save.
    saver=tf.train.Saver()
    if new_model:
        init = tf.initialize_all_variables()
        sess.run(init)
    else:
        saver.restore(sess,get_model_path())

    # Loop-invariant, so the optimizer object is created once; only
    # minimize() has to be called per sample because each sample has its own
    # loss tensor.
    optimizer=tf.train.GradientDescentOptimizer(lr)

    skipped=0

    # NOTE(review): every iteration adds new ops to the default graph (the
    # per-sentence network, its loss and its update op) and nothing removes
    # them, so the graph presumably keeps growing over a long run -- confirm
    # and consider rebuilding the graph periodically.
    for i in range(epoch):
        term=terms[random.randrange(len(terms))]
        try:
            termVec=model[term]
        except KeyError:
            #term unknown to word2vec: there is no target vector to learn
            skipped+=1
            continue

        synsets=wn.synsets(term)
        if not synsets:
            skipped+=1
            continue

        #use first synset definition
        try:
            #str() is inside the try and UnicodeError (base class of both the
            #encode and the decode error) is caught, so a definition that
            #cannot be converted skips the sample instead of aborting training
            definition=str(synsets[0].definition())
            annotated=nlp.annotate(definition,properties)
        #error handling of core nlp
        except UnicodeError:
            skipped+=1
            continue
        if not isinstance(annotated,dict):
            skipped+=1
            continue

        #use only first sentence; skip when the parse has no sentence or no
        #dependency entries at all
        sentences=annotated.get("sentences")
        if not sentences:
            skipped+=1
            continue
        sentence=sentences[0]
        dependencies=sentence.get("basic-dependencies")
        if not dependencies:
            skipped+=1
            continue

        root_node=get_term_tree(dependencies)
        if root_node is None:
            skipped+=1
            continue

        #build the composition network for this sentence's dependency tree
        root_node.get_training_data(model,keep_prob)

        #the loss compares the root's pre-dropout output with the term's own vector
        rnn_result=root_node.no_droped
        true_label=termVec

        #squared error summed over all elements (reduce_mean of that scalar
        #leaves the value unchanged) plus the weighted L2 term
        cost=tf.reduce_mean(tf.reduce_sum(tf.pow(rnn_result-true_label,2)))
        loss=cost+lambda_2*L2_sqr
        train_step = optimizer.minimize(loss)


        #training
        if i%10==0:
            result=sess.run([train_step,loss,cost],feed_dict={keep_prob:0.5})
            #columns: step, total loss, L2 penalty (loss - cost), data cost
            print("%s %s %s %s" % (i,result[1],result[1]-result[2],result[2]))
        else:
            sess.run([train_step],feed_dict={keep_prob:0.5})

        del root_node
        gc.collect()

    print("skipped %d of %d steps" % (skipped,epoch))

    #make sure the checkpoint directory exists so the trained weights are not
    #lost at the very end of the run
    checkpoint_dir=os.path.dirname(get_model_path())
    if checkpoint_dir and not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver.save(sess,get_model_path())
finally:
    #release the session even when training is interrupted or fails
    sess.close()


0 epoch session
0 503.911 202.86 301.05
10 478.868 199.375 279.493
20 585.906 195.747 390.16
30 566.417 192.252 374.165
40 345.545 188.978 156.567
50 515.113 185.95 329.164
60 438.011 182.886 255.126
70 522.504 179.763 342.741
80 486.12 176.887 309.233
90 409.369 174.732 234.637
100 514.892 172.149 342.742
110 641.646 169.314 472.332
120 598.35 166.735 431.615
130 457.959 163.69 294.269
140 311.122 160.894 150.228
150 486.195 158.158 328.037
160 544.34 155.516 388.825
170 412.405 152.781 259.624
180 459.535 149.981 309.554
190 457.325 147.352 309.972
200 epoch session
0 362.439 145.014 217.426


KeyboardInterrupt: 