<a href="https://colab.research.google.com/github/EHDEV/xitext_model_trainer/blob/main/trainer_nb-onnx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Install requirements

In [25]:
!pip install transformers install onnxruntime onnxruntime-tools
 



#### Download repo and set working directory
- Get the latest trainer code from GitHub
- Set the working directory, then execute the code

In [54]:
import os

# Location of the xitext_model_trainer checkout. The value is exported as the
# TRAINER_HOME environment variable (read by the shell cell that runs
# `git pull`) and kept in `trainer_home` for use from Python/IPython cells.
trainer_home = '/content/drive/MyDrive/xitext/xitext_model_trainer'
os.environ['TRAINER_HOME'] = trainer_home

In [55]:
%%shell

# TRAINER_HOME is exported through os.environ by an earlier Python cell.
# `&&` makes sure `git pull` only runs when the directory change succeeded;
# with `;` a failed cd would pull in whatever directory happens to be current.
# The variable is quoted so a path containing spaces is not word-split.
cd "$TRAINER_HOME" && git pull


Already up to date.




In [56]:
# Switch the notebook's working directory to the trainer checkout so that the
# relative 'config.ini' path and the local module imports below resolve.
cd $trainer_home

/content/drive/MyDrive/xitext/xitext_model_trainer


#### Configurations

In [80]:
import configparser
import os
from pathlib import Path

# ExtendedInterpolation allows ${section:option} references between values in
# config.ini. ConfigParser.read returns the list of files it managed to parse;
# leaving the call as the last expression displays that list in the cell output.
config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation(),
)
config.read('config.ini')


['config.ini']

In [81]:
# Snapshot each section of config.ini as a plain dict (configparser values are
# always strings, so numeric settings must be converted by the consumer).
default_config, data_config, model_config, onnx_config = (
    dict(config[section_name])
    for section_name in ('DEFAULT', 'DATA', 'MODEL', 'ONNX')
)

#### Import classes and required functions

In [31]:
from file_config import FileConfig
from models import SequenceClassifierModel
from convert_optimize_onnx import TorchToONNX
from data_preprocess import TextClassifierData, _encode_text_into_tokens
from pathlib import Path
import torch

#### Configure the data file path and related file settings

In [72]:
# Inspect the parsed [DATA] settings before they are used to build FileConfig
# (bare last expression, so the dict is shown as the cell output).
data_config

{'classes_file_path': '/content/drive/MyDrive/xitext/restaurant-sentiment/models/classes.json',
 'company': 'xitext',
 'company_home': '/content/drive/MyDrive/xitext',
 'epochs': '1',
 'eval_metric': 'accuracy',
 'model_filename': 'distilbert-topic-seq-classifier.bin',
 'model_group': 'distilbert',
 'model_output_dir': '/content/drive/MyDrive/xitext/restaurant-sentiment/models',
 'optimizer': 'adam',
 'project_home': '/content/drive/MyDrive/xitext/restaurant-sentiment',
 'project_name': 'restaurant-sentiment',
 'scheduler': 'linear',
 'transformers_model_id': 'distilbert-base-uncased'}

In [66]:
# Describe the input data for the preprocessing step: where the files live,
# which columns hold the label and the text, and how the files are delimited.
# 'header' is optional in the config, hence .get() (None when absent).
fconfig_kwargs = {
    'path_to_directory': Path(data_config['source_dir']),
    'target_column': data_config['target_col'],
    'sequence_column': data_config['text_col'],
    'delimiter': data_config['delimiter'],
    'header_column': data_config.get('header'),
}
fconfig = FileConfig(**fconfig_kwargs)


In [67]:
# Build the dataset wrapper from the file description, then run its
# preprocessing, which returns the training and validation data as a pair.
# `text_clas_data` is reused later for its classes, num_labels and tokenizer.
text_clas_data = TextClassifierData(fconfig)
train_data, val_data = text_clas_data.preprocess()

DEBUG:data-preprocessing.log:load data started
  ) for fp in files
DEBUG:data-preprocessing.log:dataframe with shape (1000, 2) has been created
DEBUG:data-preprocessing.log:sentence column cleaned
DEBUG:data-preprocessing.log:Clean_label_column complete
DEBUG:data-preprocessing.log:Underrepresented classes have been removed and data condensed
DEBUG:data-preprocessing.log:preparing data for training: train/val split and convert to tensor
DEBUG:data-preprocessing.log:train test split completed
DEBUG:data-preprocessing.log:Tokenizing train and valid data
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG:data-preprocessing.log:En

#### Write the classes and their indices as JSON in the model directory

In [73]:
import json

# Create the model output directory if needed. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
model_output_dir = model_config['model_output_dir']
os.makedirs(model_output_dir, exist_ok=True)

# The classes file is configured separately from the model directory, so make
# sure its own parent directory exists as well before opening it for writing.
classes_file_path = Path(model_config['classes_file_path'])
classes_file_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the index -> class-name mapping so predictions can be decoded later.
# JSON object keys are strings, so the indices are stored as "0", "1", ...
with open(classes_file_path, 'w') as cfile:
    json.dump(dict(enumerate(text_clas_data.classes)), cfile)

In [74]:
# Directory that receives the serialized train/validation data.
pickle_path = Path(data_config['data_pickle_output_path'])
# parents/exist_ok make this idempotent and free of the isdir-then-makedirs race.
pickle_path.mkdir(parents=True, exist_ok=True)

# Cache the preprocessed data so training can be re-run without preprocessing.
torch.save(train_data, pickle_path / 'train_pickle.pth')
torch.save(val_data, pickle_path / 'val_pickle.pth')

In [75]:
# Reload the cached train/validation data written by the previous cell.
# NOTE: torch.load deserializes with pickle, so only load files that this
# notebook (or another trusted source) produced.
train_data = torch.load(pickle_path/'train_pickle.pth')
val_data = torch.load(pickle_path/'val_pickle.pth')

#### Train

In [76]:
# Inspect the parsed [MODEL] settings that drive training
# (bare last expression, so the dict is shown as the cell output).
model_config

{'classes_file_path': '/content/drive/MyDrive/xitext/restaurant-sentiment/models/classes.json',
 'company': 'xitext',
 'company_home': '/content/drive/MyDrive/xitext',
 'epochs': '1',
 'eval_metric': 'accuracy',
 'model_filename': 'distilbert-topic-seq-classifier.bin',
 'model_group': 'distilbert',
 'model_output_dir': '/content/drive/MyDrive/xitext/restaurant-sentiment/models',
 'optimizer': 'adam',
 'project_home': '/content/drive/MyDrive/xitext/restaurant-sentiment',
 'project_name': 'restaurant-sentiment',
 'scheduler': 'linear',
 'transformers_model_id': 'distilbert-base-uncased'}

In [82]:
# Collect the trainer settings: everything comes from the [MODEL] config
# section except the label count and the datasets, which come from the
# preprocessing step. 'epochs' is stored as a string in the config, hence int().
trainer_kwargs = dict(
    project_home=model_config['project_home'],
    tr_model_id=model_config['transformers_model_id'],
    model_group=model_config['model_group'],
    optimizer=model_config['optimizer'],
    scheduler=model_config['scheduler'],
    eval_metric=model_config['eval_metric'],
    num_labels=text_clas_data.num_labels,
    epochs=int(model_config['epochs']),
    train_data=train_data,
    val_data=val_data,
    model_output_dir=model_config['model_output_dir'],
    model_output_filename=model_config['model_filename'],
)
seq_model = SequenceClassifierModel(**trainer_kwargs)

# NOTE(review): the output recorded with this notebook reports validation
# "Accuracy" values above 1.0 (e.g. 1.279), which suggests the validation
# metric inside SequenceClassifierModel is not normalized correctly - verify.
seq_model.train(save_model=True)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you ar

Training...


DEBUG:model-training.log:Running Validation...


 Train Accuracy: 0.65
Average training loss: 0.6830058296521505
Training epoch took: 0:00:04

 Accuracy: 1.279233870967742
 Validation took: 0:00:00
Training...


DEBUG:model-training.log:Running Validation...


 Train Accuracy: 0.86
Average training loss: 0.6078944106896719
Training epoch took: 0:00:04

 Accuracy: 1.3341733870967742
 Validation took: 0:00:00
Training...


DEBUG:model-training.log:Running Validation...


 Train Accuracy: 0.99
Average training loss: 0.4510484461983045
Training epoch took: 0:00:04

 Accuracy: 1.3664314516129032
 Validation took: 0:00:00
Training...


DEBUG:model-training.log:Running Validation...


 Train Accuracy: 1.00
Average training loss: 0.35320454090833664
Training epoch took: 0:00:04



DEBUG:model-training.log:Training Complete
DEBUG:model-training.log:/content/drive/MyDrive/xitext/restaurant-sentiment/models


 Accuracy: 1.3654233870967742
 Validation took: 0:00:00



DEBUG:model-training.log:model saved to /content/drive/MyDrive/xitext/restaurant-sentiment/models/distilbert-topic-seq-classifier.bin


#### ONNX

In [83]:
# Inspect the parsed [ONNX] settings used for the export below
# (bare last expression, so the dict is shown as the cell output).
onnx_config

{'company': 'xitext',
 'company_home': '/content/drive/MyDrive/xitext',
 'model_type': 'bert',
 'onnx_model_output_dir': '/content/drive/MyDrive/xitext/restaurant-sentiment/models/onnx',
 'project_home': '/content/drive/MyDrive/xitext/restaurant-sentiment',
 'project_name': 'restaurant-sentiment',
 'runtimeprovider': 'CPUExecutionProvider',
 'torch_model_path': '/content/drive/MyDrive/xitext/restaurant-sentiment/models/distilbert-topic-seq-classifier.bin'}

In [84]:
from pathlib import Path

# Take both locations from the [ONNX] config section. Previously
# `torch_model_path` was assigned but never used, and `onnx_model_dir` was
# hard-coded to an unrelated project's directory and never passed on, so the
# variables did not reflect what the converter actually received.
torch_model_path = Path(onnx_config['torch_model_path'])
onnx_model_dir = Path(onnx_config['onnx_model_output_dir'])

# Converter from the saved PyTorch checkpoint to ONNX; it reuses the tokenizer
# created during preprocessing.
tt2 = TorchToONNX(
    torch_model_path=torch_model_path,
    onnx_model_dir=onnx_model_dir,
    tokenizer=text_clas_data.tokenizer
)
# Use the configured model type; fall back to the previously hard-coded 'bert'
# when the option is missing from config.ini.
tt2.model_type = onnx_config.get('model_type', 'bert')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer.json HTTP/1.1" 200 0


In [85]:
# Run the PyTorch -> ONNX conversion with the converter configured above.
# The call prints the full model architecture, so the cell output is long.
tt2.convert_torch_to_onnx()

ONNX opset version set to: 11
Loading pipeline (model: DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
     

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    self.io_loop.start()
  File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 548, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstrea

In [86]:
import numpy as np
from scipy.special import softmax


def make_predictions(model, encoded_sentence, attention_mask, token_type_id=None):
    """Return class probabilities for the last sequence in the batch.

    Args:
        model: Callable torch module. It is invoked as
            ``model(encoded_sentence, attention_mask=attention_mask)`` and the
            first element of its output is treated as the logits.
        encoded_sentence: Token-id tensor passed positionally to the model.
        attention_mask: Attention-mask tensor passed to the model by keyword.
        token_type_id: Accepted for backward compatibility; currently unused.

    Returns:
        A NumPy array of softmax probabilities (computed over dimension 1 of
        the logits) for the LAST row of the batch, rounded to 4 decimals.
        With a single input sentence this is that sentence's distribution.

    Side effects:
        Switches ``model`` to eval mode and sets NumPy's global print option
        ``suppress=True`` (no scientific notation when printing arrays).
    """
    model.eval()
    # Inference only: no labels are passed and no gradients are tracked.
    with torch.no_grad():
        outputs = model(encoded_sentence, attention_mask=attention_mask)

        # The logits are the raw scores before any activation is applied.
        logits = outputs[0]
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        # Move the result to the CPU and convert it to NumPy for the caller.
        probabilities = probabilities.detach().cpu().numpy()

        np.set_printoptions(suppress=True)

    return probabilities[-1].round(4)


In [94]:
import numpy as np
from collections import OrderedDict

# Single free-text example to classify (kept exactly as typed, including the
# surrounding whitespace and newlines).
sentence = [
    '''
  
  I need to eat there again. tastey meal

  '''
]

encoded_tensor = _encode_text_into_tokens(sentence, text_clas_data.tokenizer)

# Score the example with the trained PyTorch model on the CPU.
seq_model.model.eval()
seq_model.model.to('cpu')
prediction_probabilities = make_predictions(
    seq_model.model,
    encoded_tensor['input_ids'],
    encoded_tensor['attention_mask'],
)

# Rank the classes from most to least probable and keep at most ten of them.
classes = text_clas_data.classes
ranked_indices = prediction_probabilities.argsort()[::-1][:10]
top_topics = OrderedDict(
    (classes[idx], prediction_probabilities[idx]) for idx in ranked_indices
)

top_topics

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG:data-preprocessing.log:Encoding input sentences completed


OrderedDict([('positive', 0.6473), ('negative', 0.3527)])

In [91]:
from collections import OrderedDict

# Score the same example with the exported ONNX model.
# NOTE(review): in the outputs recorded with this notebook the ONNX result
# (negative 0.66) disagrees with the PyTorch result (positive 0.65) for this
# sentence; the cells were executed out of order, so confirm on a fresh run
# whether the exported model or its class ordering really differs.
onnx_res = tt2.run_inference(sentence, text_clas_data.tokenizer)

# Map every class name to its score (first row of the result), rounded to 4 dp.
probs = {
    text_clas_data.classes[idx]: round(score, 4)
    for idx, score in enumerate(onnx_res[0])
}

# Highest-scoring classes first, at most ten.
sorted(probs.items(), key=lambda item: item[1], reverse=True)[:10]

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer.json HTTP/1.1" 200 0


[('negative', 0.6602), ('positive', 0.3398)]

#### End

In [79]:
# Sanity check: shape of the dataframe held by the preprocessing object.
text_clas_data.data_df.shape

(1000, 2)