Before running this notebook, you need to pull:

https://github.com/google-research/bert

Then add the bert_preprocessing.py and multilabel_bert.py scripts from the repository below to the pulled directory (put them in the same directory as modeling.py, optimization.py and tokenization.py):

https://github.com/Andoree/tweets_classification/tree/master/multilabel_classification_scripts

In [1]:
import codecs
from datetime import datetime
import os

import pandas as pd
import tensorflow as tf
import numpy as np
import modeling
import optimization
import tokenization
from bert_preprocessing import create_examples, file_based_convert_examples_to_features, \
    convert_examples_to_features
from multilabel_bert import file_based_input_fn_builder, create_model, model_fn_builder, \
input_fn_builder, create_output, predict, get_estimator, train_and_evaluate

In [4]:
# Setting CUDA device
# Sets the CUDA_VISIBLE_DEVICES environment variable for this kernel to "2".
# Change the index to a GPU that is available on your machine.
# NOTE(review): presumably this has to run before TensorFlow first touches the GPU — confirm.
%env CUDA_VISIBLE_DEVICES = 2

env: CUDA_VISIBLE_DEVICES=2


### Parameters

In [3]:
# --- Input data ---
# Directory that holds the train.csv / dev.csv / test.csv files read by this notebook.
corpus_dir  = r"otzovik_csvs/fold_0/"

# --- Pretrained BERT model ---
# Single directory holding the vocabulary, checkpoint and config of the pretrained model.
# Change it if you want to use a multilanguage Bert model that is finetuned on another dataset.
bert_model_dir = r"/home/tlenusik/DATA/pretrained_models/multilingual_russian_reviews_finetuned"
bert_vocab_path = os.path.join(bert_model_dir, "vocab.txt")
bert_init_chkpnt_path = os.path.join(bert_model_dir, "bert_model.ckpt")
bert_config_path = os.path.join(bert_model_dir, "bert_config.json")

# --- Training hyperparameters ---
batch_size = 32
num_train_epochs = 5
# Fraction of the total training steps that is used as warm-up steps.
warmup_proportion = 0.1
max_seq_length = 128
learning_rate = 2e-5
# Passed to the estimator's RunConfig as save_summary_steps.
save_summary_steps = 500

# --- Output ---
# Model directory of the estimator; the prediction csv is written here as well.
output_dir = r"results/"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)
# Name of the csv file (inside output_dir) that receives the predictions.
predicted_proba_filename = "predicted_labels.csv"

# Number of classes
NUM_LABELS = 5
# The column with this name must exist in test data
text_column_name = 'sentences'

### Training and evaluation
Validation loss and accuracy for all classes are saved in "output_dir/eval_results.txt" (path parameters are initialized in the "Parameters" section).

The first column of each csv file must contain the document's text. The next NUM_LABELS columns are binary columns of class correspondence. test_df should have the same structure.

In [4]:
# Change paths if needed
train_df = pd.read_csv(os.path.join(corpus_dir, "train.csv"), encoding="utf-8")
dev_df = pd.read_csv(os.path.join(corpus_dir, "dev.csv"), encoding="utf-8")

# Convert the data frames into the example objects consumed by the training helpers.
train_examples = create_examples(train_df)
eval_examples = create_examples(dev_df)

# Total number of training steps over all epochs; int() truncates a fractional result.
num_train_steps = int(len(train_examples) / batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
# Model is saved and evaluated every epoch. It might be too frequent, change it.
# Derived from num_train_steps rather than repeating its formula. Clamped to at least 1 so that
# a training set smaller than one batch per epoch does not produce a checkpoint interval of 0.
num_steps_in_epoch = max(1, num_train_steps // num_train_epochs)
save_checkpoints_steps = num_steps_in_epoch

In [5]:
# Tokenizer built from the pretrained model's vocabulary file, with lower-casing enabled
tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_path, do_lower_case=True)

# Estimator run configuration: model directory, summary interval and checkpoint policy
run_config_kwargs = dict(
    model_dir=output_dir,
    save_summary_steps=save_summary_steps,
    keep_checkpoint_max=1,
    save_checkpoints_steps=save_checkpoints_steps,
)
run_config = tf.estimator.RunConfig(**run_config_kwargs)

# Config of the pretrained Bert model, read from its json file
bert_config = modeling.BertConfig.from_json_file(bert_config_path)

# Arguments for the model function: pretrained weights plus the optimisation schedule
model_fn_kwargs = dict(
    bert_config=bert_config,
    num_labels=NUM_LABELS,
    init_checkpoint=bert_init_chkpnt_path,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False,
)
model_fn = model_fn_builder(**model_fn_kwargs)

estimator = get_estimator(model_fn=model_fn, run_config=run_config, batch_size=batch_size)

INFO:tensorflow:Using config: {'_model_dir': 'results/', '_tf_random_seed': None, '_save_summary_steps': 500, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f17db38cb70>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [6]:
# Evaluation step count handed to train_and_evaluate; left as None here
eval_steps = None

# Show INFO-level TensorFlow log messages while training and evaluating
tf.logging.set_verbosity(tf.logging.INFO)

# Arguments stay positional: the signature is defined in multilabel_bert.py
train_and_evaluate(
    train_examples,
    eval_examples,
    max_seq_length,
    estimator,
    tokenizer,
    batch_size,
    eval_steps,
    num_train_steps,
    output_dir,
    num_labels=NUM_LABELS,
)

INFO:tensorflow:***** Running training *****
INFO:tensorflow:  Num examples = 1627
INFO:tensorflow:  Batch size = 32
INFO:tensorflow:  Num steps = 254
Beginning Training!
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps 50 or save_checkpoints_secs None.


  import pandas.util.testing as tm


Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:num_labels:5;logits:Tensor("loss/BiasAdd:0", shape=(32, 5), dtype=float32);labels:Tensor("loss/Cast:0", shape=(32, 5), dtype=float32)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into results/model.ckpt.
INFO:tensorflow:loss = 0.6960664, step = 0
INFO:tensorflow:accuracy = 0.43125, loss = 0.6960664
INFO:tensorflow:accuracy = 0.584375, loss = 0.6169146 (12.620 sec)
INFO:tensorflow:accuracy = 0.6625, loss = 0.44438583 (6.415 sec)
INFO:tensorflow:accuracy = 0.69375, loss = 0.45706087 (6.414 sec)
INFO:tensorflow:accuracy = 0.71375, loss = 0.41767502 (6.412 sec)
INFO:tensorflow:Saving checkpoints for 50 into results/model.ckpt.


INFO:tensorflow:accuracy = 0.8908654, loss = 0.1188524 (53.453 sec)
INFO:tensorflow:Saving checkpoints for 254 into results/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:num_labels:5;logits:Tensor("loss/BiasAdd:0", shape=(?, 5), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 5), dtype=float32)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-05-19-21:12:17
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model.ckpt-254
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:accuracy = 0.89375, loss = 0.25370747
INFO:tensorflow:accuracy = 0.890625, loss = 0.30256602 (0.806 sec)
INFO:tensorflow:accuracy = 0.90416664, loss = 0.20476274 (0.357 sec)
INFO:tensorflow:Finished evaluation at 2020-05-19-21:12:28
INFO:tensorflow:Saving dict for global step 254: 0 = 0.90753424, 1 = 0.96616244, 2 = 0.9616012, 3 = 0.93

### Predicting class probabilities
The resulting file with the predicted class probabilities is saved at "output_dir/predicted_proba_filename" (path parameters are initialized in the "Parameters" section).

#### Defining documents to predict labels for manually

In [7]:
# Documents to classify, typed in by hand; replace them with your own texts
strings = [
    'This is some string',
    'This is another string',
]
# One-column frame whose column is named after text_column_name
test_df = pd.DataFrame({text_column_name: strings})

#### Loading test set from csv file

In [8]:
# Training-related values are reset to None before rebuilding the estimator used for prediction
train_examples = None
num_train_steps = None
num_warmup_steps = None
save_checkpoints_steps = 1000

# Tokenizer built from the pretrained model's vocabulary file, with lower-casing enabled
tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_path, do_lower_case=True)

# Estimator run configuration: model directory, summary interval and checkpoint policy
run_config_kwargs = dict(
    model_dir=output_dir,
    save_summary_steps=save_summary_steps,
    keep_checkpoint_max=1,
    save_checkpoints_steps=save_checkpoints_steps,
)
run_config = tf.estimator.RunConfig(**run_config_kwargs)

# Config of the pretrained Bert model, read from its json file
bert_config = modeling.BertConfig.from_json_file(bert_config_path)

# Arguments for the model function; the step counts are None in this cell
model_fn_kwargs = dict(
    bert_config=bert_config,
    num_labels=NUM_LABELS,
    init_checkpoint=bert_init_chkpnt_path,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False,
)
model_fn = model_fn_builder(**model_fn_kwargs)

estimator = get_estimator(model_fn=model_fn, run_config=run_config, batch_size=batch_size)

INFO:tensorflow:Using config: {'_model_dir': 'results/', '_tf_random_seed': None, '_save_summary_steps': 500, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f163c3ed860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
# Change path if needed
test_csv_path = os.path.join(corpus_dir, "test.csv")
# Test set read as UTF-8; it overwrites any test_df defined earlier in the notebook
test_df = pd.read_csv(test_csv_path, encoding="utf-8")

In [10]:
# Run the estimator over every row of test_df; the result is a data frame of per-class outputs
output_df = predict(test_df, estimator, tokenizer, max_seq_length, num_labels=NUM_LABELS)

# Put the predictions next to the original test columns (pd.concat with axis=1 aligns rows by index).
# The earlier throw-away assignment `resulting_df = test_df[text_column_name]` was dead code and is removed.
resulting_df = pd.concat([test_df, output_df], axis=1)
# Saved without the index column to output_dir/predicted_proba_filename
resulting_df.to_csv(os.path.join(output_dir, predicted_proba_filename), index=False)

resulting_df.head()

Beginning Predictions!
Prediction took time  0:00:00.000154
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:num_labels:5;logits:Tensor("loss/BiasAdd:0", shape=(?, 5), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 5), dtype=float32)
INFO:tensorflow:**** Trainable Variables ****
mode: infer probabilities: Tensor("loss/Sigmoid:0", shape=(?, 5), dtype=float32)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model.ckpt-254
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,sentences,EF,INF,ADR,DI,Finding,annotation,review_id,sentence_id,p_label_1,p_label_2,p_label_3,p_label_4,p_label_5
0,"Стала нервной, капризной, чуть что -сразу визг...",0,0,0,1,1,DI[3]|Finding[1],252298,4,0.138464,0.04736,0.374648,0.949777,0.238476
1,После недельного приема дочурка легче стала ос...,1,0,0,0,0,EF[2],252298,7,0.966925,0.040048,0.058967,0.198397,0.087818
2,"Очень радует то, что таблетки не горькие, я ра...",0,0,0,0,0,NEUTRAL,252298,8,0.07015,0.017458,0.028494,0.022673,0.06075
3,И так с появление ребенка в нашей семье и част...,0,0,0,1,0,DI[1],2457636,2,0.03664,0.02394,0.015324,0.941991,0.047814
4,"Болезнь немного отступала, но потом с новой си...",0,1,0,1,0,INF[3]|DI[2],2457636,5,0.059428,0.910076,0.046749,0.824391,0.087311
