In [8]:
import os
import re
import pandas as pd
import tensorflow as tf

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Load every review file found in ``directory`` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``. The rating
    captured from the name is stored, as a string, in the ``sentiment``
    column and the file content in the ``sentence`` column. Directory entries
    whose names do not follow that pattern are skipped instead of crashing
    with ``AttributeError`` on a failed regex match.

    Args:
        directory: Path of the directory that holds the review files.

    Returns:
        pandas.DataFrame with the columns ``sentence`` and ``sentiment``.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_name in os.listdir(directory):
        matched = name_pattern.match(file_name)
        if matched is None:
            # Not a review file (stray README, hidden file, ...): ignore it.
            continue
        with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(matched.group(1))
    return pd.DataFrame.from_dict(data)


# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Combine the ``pos`` and ``neg`` sub-directories into one shuffled frame.

    Args:
        directory: Directory that contains the ``pos`` and ``neg`` folders.

    Returns:
        pandas.DataFrame holding the rows of both folders plus a ``polarity``
        column (1 for ``pos``, 0 for ``neg``), in random order and with a
        fresh 0..n-1 index.
    """
    labelled_frames = [
        load_directory_data(os.path.join(directory, folder)).assign(polarity=flag)
        for folder, flag in (("pos", 1), ("neg", 0))
    ]
    merged = pd.concat(labelled_frames)
    # sample(frac=1) draws every row once, i.e. it shuffles the frame.
    shuffled = merged.sample(frac=1)
    return shuffled.reset_index(drop=True)


# Download and process the dataset files.
def download_and_load_datasets(force_download=False,
                               cache_subdir=r'F:\testDemo\AI\estimator\data\text'):
    """Download the IMDB review archive (if needed) and load both splits.

    Args:
        force_download: When True, a previously cached archive is removed
            first so that ``get_file`` fetches it again. Previously this flag
            was accepted but silently ignored.
        cache_subdir: Passed unchanged to ``tf.keras.utils.get_file``. The
            default keeps the location this notebook has always used; pass
            another directory to run the notebook on a different machine.

    Returns:
        Tuple ``(train_df, test_df)`` as produced by ``load_dataset`` for the
        ``aclImdb/train`` and ``aclImdb/test`` folders next to the archive.
    """
    archive_name = "aclImdb.tar.gz"
    if force_download:
        # NOTE(review): get_file is expected to resolve cache_subdir against
        # ~/.keras (an absolute cache_subdir wins the join) -- confirm against
        # the installed TensorFlow version.
        cached_archive = os.path.join(
            os.path.expanduser(os.path.join("~", ".keras")),
            cache_subdir, archive_name)
        if os.path.exists(cached_archive):
            os.remove(cached_archive)

    dataset = tf.keras.utils.get_file(
        fname=archive_name,
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        cache_subdir=cache_subdir,
        extract=True)
    dataset_root = os.path.dirname(dataset)
    print(dataset)
    print(dataset_root)

    train_df = load_dataset(os.path.join(dataset_root, "aclImdb", "train"))
    test_df = load_dataset(os.path.join(dataset_root, "aclImdb", "test"))

    return train_df, test_df

In [9]:
# Download (or reuse the cached) archive and load the train / test DataFrames.
train, test = download_and_load_datasets()

F:\testDemo\AI\estimator\data\text\aclImdb.tar.gz
F:\testDemo\AI\estimator\data\text


In [10]:
# Summarise the columns, dtypes and row count of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
sentence     25000 non-null object
sentiment    25000 non-null object
polarity     25000 non-null int64
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [11]:
# Summarise the columns, dtypes and row count of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
sentence     25000 non-null object
sentiment    25000 non-null object
polarity     25000 non-null int64
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [12]:
# The training frame is used in full; the former `train=train` self-assignment
# was a no-op and has been dropped.
# Evaluate on a 5000-row subset of the test frame. The fixed seed makes the
# subset reproducible on a freshly loaded test frame.
test=test.sample(5000,random_state=42)

In [13]:
# The previous cell left `train` untouched, so this repeats the earlier summary.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
sentence     25000 non-null object
sentiment    25000 non-null object
polarity     25000 non-null int64
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [14]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,sentence,sentiment,polarity
0,This is really a very bad movie. Why? First of...,3,0
1,"I can hardly believe that this inert, turgid a...",4,0
2,I first saw this movie when it originally came...,10,1
3,"Superb cast, more please!<br /><br />If you ca...",10,1
4,"You can debate Prince's acting talent, or even...",8,1


In [15]:
# Hyper-parameters shared by the model, the input pipeline and the serving function.
# Number of tokens every review is padded or truncated to.
MAX_SEQ_LENGTH=128 # adjust to your own data
# Width of the embedding vectors; create_model also uses it as the LSTM / dense unit count.
embedding_dim=50
# Dropout rate handed to the LSTM and Dropout layers.
dropout=0.5
# Number of rows of the embedding matrix.
# NOTE(review): presumably has to cover every id produced by the vocabulary lookup -- confirm against the vocabulary file.
vocab_size=89528 # set according to your own vocabulary
BATCH_SIZE=128
EPOCH=5

# What follows again mirrors my earlier estimator template

In [16]:
def create_model(features,vocab_size,embedding_dim,drop_out,output_cls,max_seq_length,
                 vocab_file='data/text/aclImdb/imdb.vocab'):
    """Build the embedding + bidirectional-LSTM classifier graph and return its logits.

    Args:
        features: String tensor of tokens, or a dict holding it under ``'text'``.
            # assumes shape [batch_size, seq_length] -- TODO confirm against input_fn
        vocab_size: Number of rows of the embedding matrix.
        embedding_dim: Embedding width; also used as the number of units of
            the LSTM and of the hidden dense layer.
        drop_out: Dropout rate for the LSTM inputs and for the dropout layer.
        output_cls: Number of output units of the final dense layer.
        max_seq_length: Not used by this function; kept so existing callers
            keep working.
        vocab_file: Vocabulary file (one token per line) used to map tokens to
            integer ids. Defaults to the path this notebook has always used,
            which is relative to the working directory.

    Returns:
        Logits tensor produced by the final dense layer (no activation).
    """
    if isinstance(features,dict):
        features=features['text']

    # Map string tokens to integer ids; tokens missing from the file get id 0.
    # NOTE(review): id 0 is presumably also the id of the first vocabulary
    # line, so unknown tokens would share its embedding -- confirm.
    table=tf.contrib.lookup.index_table_from_file(vocabulary_file=vocab_file,default_value=0)
    tokens=table.lookup(features)

    # Embedding layer. For a pre-trained matrix see
    # https://github.com/AlbertBJ/ChineseNER-Based-DL/blob/master/Model.py
    embedding=tf.get_variable('embedding',[vocab_size,embedding_dim],dtype=tf.float32)
    # lstm_input adds a trailing embedding_dim axis to `tokens`.
    lstm_input=tf.nn.embedding_lookup(embedding,tokens,name='embedding_layer')

    # Bidirectional LSTM. This is a classifier, so return_sequences=False keeps
    # only the output of the last step of each direction.
    lstm=tf.keras.layers.LSTM(units=embedding_dim,dropout=drop_out,return_sequences=False)
    bi_lstm=tf.keras.layers.Bidirectional(lstm,merge_mode='concat',name='bi_lstm')
    # merge_mode='concat' joins both directions: last axis is 2*embedding_dim.
    dense_input=bi_lstm(lstm_input)

    # The layers are called without an explicit `training` argument, so the
    # dropout behaviour is governed by the Keras learning phase of the graph.
    dense_1=tf.keras.layers.Dense(units=embedding_dim,activation='relu',name='dense_1')(dense_input)
    dropout_1=tf.keras.layers.Dropout(drop_out,name='drop_out_1')(dense_1)
    logits=tf.keras.layers.Dense(units=output_cls,name='out_put')(dropout_1)
    return logits

In [39]:
def model_fn_builder(lr,threshold):
    """Create the ``model_fn`` closure handed to ``tf.estimator.Estimator``.

    Args:
        lr: Learning rate of the Adam optimizer used in TRAIN mode.
        threshold: Probability cut-off. A sigmoid output below it is
            classified as 0, otherwise as 1.

    Returns:
        A ``model_fn(features, labels, mode, params, config)`` callable.
    """
    def model_fn(features, labels, mode, params,config): # signature fixed by the Estimator API
        '''
        features: first element returned by input_fn (mind the order)
        labels: second element returned by input_fn (mind the order)
        mode: one of the tf.estimator.ModeKeys values
        params: dict passed when the estimator was constructed; must hold
            'vocab_size', 'embedding_dim', 'drop_out', 'output_cls' and
            'max_seq_length'
        config: the RunConfig the estimator was constructed with

        Returns a tf.estimator.EstimatorSpec for the requested mode.
        '''
        is_predict=mode==tf.estimator.ModeKeys.PREDICT

        # create_model calls its Keras layers without a `training` argument, so
        # they follow the Keras learning phase, which defaults to inference.
        # Without this line dropout is never active, not even while training.
        tf.keras.backend.set_learning_phase(int(mode==tf.estimator.ModeKeys.TRAIN))

        if not is_predict:
            # Give the labels the same [batch, 1] layout as the logits.
            labels=tf.reshape(labels,[-1,1])
            tf.logging.info('labels shape:'+ str(labels.shape))
        logits=create_model(features,params['vocab_size'],params['embedding_dim'],params['drop_out'],params['output_cls'],params['max_seq_length'])

        tf.logging.info('logits shape:'+ str(logits.shape))

        # Binary classification: the sigmoid output is the probability of y=1.
        pre_prob=tf.nn.sigmoid(logits)
        tf.logging.info('pre_prob shape:'+ str(pre_prob.shape))
        # Apply the decision threshold (tune it to your needs).
        pre_cls=tf.where(pre_prob<threshold,tf.zeros_like(pre_prob),tf.ones_like(pre_prob))

        if not is_predict:
            # TRAIN and EVAL share the loss and the metric definitions.
            loss=tf.losses.sigmoid_cross_entropy(labels,logits=logits)

            def metric_fn(labels,predictions):
                '''
                Build the streaming accuracy / recall / precision metric ops.
                '''
                accuracy,accuracy_update=tf.metrics.accuracy(labels=labels,predictions=predictions,name='text_accuracy')
                recall,recall_update=tf.metrics.recall(labels=labels,predictions=predictions,name='text_recall')
                precision,precision_update=tf.metrics.precision(labels=labels,predictions=predictions,name='text_precision')

                return {
                    'accuracy':(accuracy,accuracy_update),
                    'recall':(recall,recall_update),
                    'precision':(precision,precision_update)
                }

            if mode==tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(mode=mode,loss=loss,eval_metric_ops=metric_fn(labels,pre_cls))

            # TRAIN: minimise the loss with Adam and advance the global step.
            train_op=tf.train.AdamOptimizer(learning_rate=lr).minimize(loss=loss,global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode,loss=loss,train_op=train_op,eval_metric_ops=metric_fn(labels,pre_cls))

        else:
            # Binary case only: report the probability of the predicted class,
            # i.e. 1-p when class 0 was chosen and p when class 1 was chosen.
            neg=tf.identity(1-pre_prob)
            pro=tf.where(pre_prob<threshold,neg,pre_prob)

            predictions={'predict_cls':pre_cls,'predict_pro':pro}
            return tf.estimator.EstimatorSpec(mode=mode,predictions=predictions)
    return model_fn

In [40]:
def input_fn_builder(x,y,batch_size,epochs,max_seq_length,is_train=True):
    '''
    Create the input_fn closure consumed by the estimator.

    x: raw review texts, sliced one review per dataset element
    y: labels aligned with x
    batch_size: number of examples per batch
    epochs: number of passes over the data (only applied when is_train is True)
    max_seq_length: number of tokens each review is padded or truncated to
    is_train: when True the data is shuffled and repeated

    Returns a zero-argument function that builds the tf.data.Dataset of
    ({'text': tokens}, label) batches.
    '''
    def pad_or_trunc(t):
        # Force the 1-D token tensor to exactly max_seq_length entries.
        length = tf.size(t)

        def _keep():
            return t

        def _truncate():
            return tf.slice(t, [0], [max_seq_length])

        def _pad():
            filler = tf.fill([max_seq_length-length],'UNK')
            return tf.concat([t, filler], 0)

        def _resize():
            return tf.cond(tf.greater(length, max_seq_length), _truncate, _pad)

        return tf.cond(tf.equal(length, max_seq_length), _keep, _resize)

    def token(d,y):
        # Whitespace-split one review and bring it to the fixed length.
        sparse_words=tf.string_split([d['text']])
        dense_words=tf.sparse_tensor_to_dense(sparse_words,default_value='UNK')
        tf.logging.info('token 0 is type:'+str(type(dense_words)))

        fixed=pad_or_trunc(tf.reshape(dense_words,[-1]))

        tf.logging.info('pad shape is :'+str(type(fixed)))
        return {'text':fixed},y

    def input_fn():
        tf.logging.info('call input_fn')
        examples=tf.data.Dataset.from_tensor_slices(({'text':x},y))
        if is_train:
            examples=examples.shuffle(1000).repeat(epochs)
        # Elements are (features, label) tuples, the order model_fn expects.
        return examples.map(token).batch(batch_size)

    return input_fn

In [41]:
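# For Chinese text, a py_func preprocessing step such as the following sketch
# (it joins the characters of the decoded string with '*') could be used: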
# def preprocess_func(x):
#     ret= "*".join(x.decode('utf-8'))
#     return ret

# str_t = tf.py_func(
#         preprocess_func,
#         [tf.constant(strs)],
#         tf.string)

In [42]:
def serving_input_receiver_fn():
    """Build the serving input: raw texts in, fixed-length token rows out.

    The request supplies a 1-D batch of strings under the key ``'texts'``.
    Each string is whitespace-split and padded with ``'UNK'`` or truncated to
    ``MAX_SEQ_LENGTH`` tokens; the result is passed to the model under the
    feature key ``'text'``.

    The former ``key_input`` / ``key_output`` placeholders were removed: they
    were never connected to the features or receiver tensors.

    Returns:
        tf.estimator.export.ServingInputReceiver
    """
    def pad_or_trunc(t):
        # Force the 1-D token tensor to exactly MAX_SEQ_LENGTH entries.
        dim = tf.size(t)
        return tf.cond(tf.equal(dim, MAX_SEQ_LENGTH), lambda: t, lambda: tf.cond(tf.greater(dim, MAX_SEQ_LENGTH), lambda: tf.slice(t, [0], [MAX_SEQ_LENGTH]), lambda: tf.concat([t, tf.fill([MAX_SEQ_LENGTH-dim],'UNK')], 0)))

    def process(text):
        """Split one text on whitespace, then pad or truncate it."""
        words=tf.string_split([text])
        tokens=tf.sparse_tensor_to_dense(words,default_value='UNK')
        padded=pad_or_trunc(tf.reshape(tokens,[-1]))
        return padded

    input_ph = tf.placeholder(tf.string, shape=[None], name='texts')
    # Tokenise every text of the batch independently.
    text_tensor = tf.map_fn(
      process, input_ph, back_prop=False, dtype=tf.string)
    receiver_tensors = {'texts': input_ph}

    features = {
       'text': text_tensor
    }

    return tf.estimator.export.ServingInputReceiver(features,receiver_tensors)

In [43]:
# Directory in which the estimator stores its checkpoints.
model_dir=r'F:\testDemo\AI\estimator\model\text'

# A single output unit: binary classification with a sigmoid on the logit.
output_cls=1

# Hyper-parameters that model_fn reads from `params`.
params={
    'output_cls':output_cls,
    'vocab_size':vocab_size,
    'embedding_dim':embedding_dim,
    'drop_out':dropout,
    'max_seq_length':MAX_SEQ_LENGTH,
}

In [44]:
# Write a checkpoint every 100 training steps.
config=tf.estimator.RunConfig(save_checkpoints_steps=100)

# Learning rate 0.001, decision threshold 0.5.
estimator=tf.estimator.Estimator(
    model_fn=model_fn_builder(0.001,0.5),
    model_dir=model_dir,
    params=params,
    config=config,
)

INFO:tensorflow:Using config: {'_model_dir': 'F:\\testDemo\\AI\\estimator\\model\\text', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001776DC0ADA0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


如果需要加入 early stopping，可以使用 `tf.estimator.train_and_evaluate`，并在 `TrainSpec` / `EvalSpec` 的 `hooks` 参数中指定相应的 hook，具体用法请参考 TensorFlow 官方文档。

In [57]:
# Train on the full training frame for at most 10000 steps.
train_result=estimator.train(
    input_fn=input_fn_builder(
        x=train['sentence'],
        y=train['polarity'],
        batch_size=BATCH_SIZE,
        epochs=EPOCH,
        max_seq_length=MAX_SEQ_LENGTH,
        is_train=True,
    ),
    steps=10000,
)

INFO:tensorflow:call input_fn
INFO:tensorflow:token 0 is type:<class 'tensorflow.python.framework.ops.Tensor'>
INFO:tensorflow:pad shape is :<class 'tensorflow.python.framework.ops.Tensor'>
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:labels shape:(?, 1)
INFO:tensorflow:call create_model
INFO:tensorflow:logits shape:(?, 1)
INFO:tensorflow:pre_prob shape:(?, 1)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from F:\testDemo\AI\estimator\model\text\model.ckpt-1368
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1368 into F:\testDemo\AI\estimator\model\text\model.ckpt.
INFO:tensorflow:loss = 1.5149491, step = 1369
INFO:tensorflow:Saving checkpoints for 1468 into F:\testDemo\AI\estimator\model\text\model.ckpt.
INFO:tensorflow:global_step/sec: 3.3262
INFO:tensorflow:loss = 0.40239045, step = 1469 (30.067 

In [58]:
# Evaluate on the sampled test frame; is_train=False disables shuffle and repeat.
eval_result=estimator.evaluate(
    input_fn=input_fn_builder(
        x=test['sentence'],
        y=test['polarity'],
        batch_size=BATCH_SIZE,
        epochs=EPOCH,
        max_seq_length=MAX_SEQ_LENGTH,
        is_train=False,
    ),
    steps=10000,
)

INFO:tensorflow:call input_fn
INFO:tensorflow:token 0 is type:<class 'tensorflow.python.framework.ops.Tensor'>
INFO:tensorflow:pad shape is :<class 'tensorflow.python.framework.ops.Tensor'>
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:labels shape:(?, 1)
INFO:tensorflow:call create_model
INFO:tensorflow:logits shape:(?, 1)
INFO:tensorflow:pre_prob shape:(?, 1)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-19T06:43:58Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from F:\testDemo\AI\estimator\model\text\model.ckpt-2345
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-19-06:44:01
INFO:tensorflow:Saving dict for global step 2345: accuracy = 0.7706, global_step = 2345, loss = 0.9025137, precision = 0.7454058, recall = 0.80156446
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2345: F:\testDemo\AI\estimator\model\text\mod

In [45]:
# Export a SavedModel whose serving signature accepts raw text strings.
estimator.export_savedmodel(
    'export_base/text',
    serving_input_receiver_fn=serving_input_receiver_fn,
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:call create_model
INFO:tensorflow:logits shape:(?, 1)
INFO:tensorflow:pre_prob shape:(?, 1)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from F:\testDemo\AI\estimator\model\text\model.ckpt-2345
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:Assets written to: export_base/text\temp-b'1561009656'\assets
INFO:tensorflow:SavedModel written to: export_base/text\temp-b'1561009656'\saved_model.pb


b'export_base/text\\1561009656'

## post body:<br>
{"instances" : [{"texts":"I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."},
{"texts":"How many movies are there that you can think of when you see a movie like this? I can't count them but it sure seemed like the movie makers were trying to give me a hint. I was reminded so often of other movies, it became a big distraction. One of the borrowed memorable lines came from a movie from 2003 - Day After Tomorrow. One line by itself, is not so bad but this movie borrows so much from so many movies it becomes a bad risk.BUT...See The Movie! Despite its downfalls there is enough to make it interesting and maybe make it appear clever. While borrowing so much from other movies it never goes overboard. In fact, you'll probably find yourself battening down the hatches and riding the storm out. Why? ...Costner and Kutcher played their characters very well. I have never been a fan of Kutcher's and I nearly gave up on him in The Guardian, but he surfaced in good fashion. Costner carries the movie swimmingly with the best of Costner's ability. I don't think Mrs. Robinson had anything to do with his success.The supporting cast all around played their parts well. I had no problem with any of them in the end. But some of these characters were used too much From here on out I can only nit-pick so I will save you the wear and tear. Enjoy the movie, the parts that work, work well enough to keep your head above water. Just don't expect a smooth ride.7 of 10 but almost a 6."},
{"texts":"I attended an advance screening of this film not sure of what to expect from Kevin Costner and Ashton Kutcher; both have delivered less than memorable performances & films. While the underlying general storyline is somewhat familiar, this film was excellent. Both Costner and Kutcher delivered powerful performances playing extremely well off each other. The human frailties and strengths of their respective characters were incredibly played by both; the scene when Costner confronts Kutcher with the personal reasons why Kutcher joined the Coast Guard rescue elite was the film's most unforgettable emotional moment. The specific storyline was an education in itself depicting the personal sacrifice and demanding physical training the elite Coast Guard rescuers must go through in preparation of their only job & responsibility...to save lives at sea. The special effects of the rescue scenes were extremely realistic and wowing...I haven't seen such angry seas since The Perfect Storm. Co-star Clancy Brown (HBO's Carnivale - great to see him again) played the captain of the Coast Guard's Kodiak, Alaska base in a strong, convincing role as a leader with the prerequisite and necessary ice water in his veins. The film wonderfully, and finally, gives long overdue exposure and respect to the Coast Guard; it had the audience applauding at the end."}
]}<br>
## 格式：<br>
{"instances":[{"texts":"sentence"},{"texts":"sentence"},...,{"texts":"sentence"}]}

## return format:

{
    "predictions": [
        {
            "predict_pro": [
                0.815035
            ],
            "predict_cls": [
                0
            ]
        },
        {
            "predict_pro": [
                0.591119
            ],
            "predict_cls": [
                0
            ]
        },
        {
            "predict_pro": [
                0.994651
            ],
            "predict_cls": [
                1
            ]
        }
    ]
}<br>

format:<br>
{"predictions":[{"predict_pro":[pro_value],"predict_cls":[cls_value]},...,{"predict_pro":[pro_value],"predict_cls":[cls_value]}]}
