Before running this notebook, you need to clone the BERT repository:

https://github.com/google-research/bert

Then add the bert_preprocessing.py and multilabel_bert.py scripts from the repository below to the BERT directory (put them in the same directory as modeling.py, optimization.py and tokenization.py):

https://github.com/Andoree/tweets_classification/tree/master/multilabel_classification_scripts

In [None]:
import codecs
from datetime import datetime
import os

import pandas as pd
import tensorflow as tf
import numpy as np
import modeling
import optimization
import tokenization
from bert_preprocessing import create_examples, file_based_convert_examples_to_features, \
    convert_examples_to_features
from multilabel_bert import file_based_input_fn_builder, create_model, model_fn_builder, \
input_fn_builder, create_output, predict, get_estimator, train_and_evaluate

In [None]:
# Setting CUDA device.
# A `! export ...` line runs in a temporary subshell, so the variable never
# reaches this Python process (and the spaces around `=` are not valid shell
# syntax anyway). Setting it through os.environ changes the kernel's own
# environment. Run this cell before any TensorFlow session/estimator is created.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

### Parameters

In [None]:
# --- Data and pretrained model locations ---
corpus_dir = r"otzovik_csvs/fold_0/"
bert_vocab_path = r"/media/data/datasets/biomed/EMBEDDINGS/BERT/multilingual_russian_reviews_finetuned/vocab.txt"
# Change checkpoint if you want to use a multilingual BERT model that is fine-tuned on another dataset.
bert_init_chkpnt_path = r"/media/data/datasets/biomed/EMBEDDINGS/BERT/multilingual_russian_reviews_finetuned/bert_model.ckpt"
bert_config_path = r"/media/data/datasets/biomed/EMBEDDINGS/BERT/multilingual_russian_reviews_finetuned/bert_config.json"

# --- Training hyperparameters ---
batch_size = 2
num_train_epochs = 1
warmup_proportion = 0.1
max_seq_length = 12
learning_rate = 2e-5
save_summary_steps = 500

# --- Output locations ---
output_dir = r"results/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
predicted_proba_filename = "predicted_labels.csv"

# Number of classes
NUM_LABELS = 5
# The column with this name must exist in test data
text_column_name = 'text'

# --- Step counts derived from the training-set size ---
# The size is computed here, from the same file the training cell reads, so that
# this cell does not depend on a variable that is only defined further down the
# notebook (which would fail on "Restart Kernel -> Run All").
num_train_examples = len(create_examples(
    pd.read_csv(os.path.join(corpus_dir, "train.csv"), encoding="utf-8")))

num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
num_steps_in_epoch = num_train_steps // num_train_epochs
# Model is saved and evaluated every epoch. It might be too frequent, change it.
# Clamped to at least 1 so that a very small training set cannot produce a
# checkpoint interval of zero steps.
save_checkpoints_steps = max(1, num_steps_in_epoch)

### Loading model
This part is common for both training and prediction of test labels. 

In [None]:
# Tokenizer built from the BERT vocabulary file, with lower-casing enabled
tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_path, do_lower_case=True)

# Estimator run configuration: where checkpoints/summaries go and how often
# they are written; only the most recent checkpoint is kept
run_config_kwargs = dict(
    model_dir=output_dir,
    save_summary_steps=save_summary_steps,
    keep_checkpoint_max=1,
    save_checkpoints_steps=save_checkpoints_steps,
)
run_config = tf.estimator.RunConfig(**run_config_kwargs)

# Configuration of the pretrained BERT model, read from its JSON file
bert_config = modeling.BertConfig.from_json_file(bert_config_path)

# Model function initialised from the pretrained checkpoint (no TPU)
model_fn_kwargs = dict(
    bert_config=bert_config,
    num_labels=NUM_LABELS,
    init_checkpoint=bert_init_chkpnt_path,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False,
)
model_fn = model_fn_builder(**model_fn_kwargs)

estimator = get_estimator(
    model_fn=model_fn,
    run_config=run_config,
    batch_size=batch_size,
)

### Training and evaluation
Validation loss and accuracy for all classes are saved in "output_dir/eval_results.txt" (path parameters are initialized in the "Parameters" section). 

The first column of each CSV file must contain the document's text. The next NUM_LABELS columns are binary columns indicating whether the document belongs to each class. test_df should have the same structure.

In [None]:
# Show INFO-level TensorFlow log messages during training and evaluation
tf.logging.set_verbosity(tf.logging.INFO)

# Change paths if needed
train_csv_path = os.path.join(corpus_dir, "train.csv")
dev_csv_path = os.path.join(corpus_dir, "dev.csv")
train_df = pd.read_csv(train_csv_path, encoding="utf-8")
dev_df = pd.read_csv(dev_csv_path, encoding="utf-8")

# Convert the data frames into example objects
train_examples = create_examples(train_df)
eval_examples = create_examples(dev_df)

# Passed through unchanged to train_and_evaluate
# (presumably None means "evaluate on the whole dev set" - confirm in multilabel_bert.py)
eval_steps = None

train_and_evaluate(
    train_examples,
    eval_examples,
    max_seq_length,
    estimator,
    tokenizer,
    batch_size,
    eval_steps,
    num_train_steps,
    output_dir,
    num_labels=NUM_LABELS)

### Predicting class probabilities
The resulting file with test labels is saved at "output_dir/predicted_proba_filename" (path parameters are initialized at "Parameters" section). 

#### Manually defining the documents to predict labels for

In [None]:
# Documents to classify, written inline instead of being read from a file
strings = [
    'This is some string',
    'This is another string',
]
# Single-column frame whose column name matches the configured text column
test_df = pd.DataFrame({text_column_name: strings})

#### Loading test set from csv file

In [None]:
# Change path if needed
# Read the test set from the corpus directory
test_csv_path = os.path.join(corpus_dir, "test.csv")
test_df = pd.read_csv(test_csv_path, encoding="utf-8")

In [None]:
# Fail fast if the configured text column is missing. The original cell only
# detected this after prediction had already run, via a throwaway column lookup
# whose result was immediately overwritten.
if text_column_name not in test_df.columns:
    raise KeyError(text_column_name)

# Predicted outputs for every row of test_df, one column per class
# (NOTE(review): exact column layout is defined by predict() in multilabel_bert.py)
output_df = predict(test_df, estimator, tokenizer, max_seq_length, num_labels=NUM_LABELS)

# Place the predictions next to the original test columns and save them
resulting_df = pd.concat([test_df, output_df], axis=1)
resulting_df.to_csv(os.path.join(output_dir, predicted_proba_filename), index=False)

resulting_df.head()