In [None]:
import sys

# Report the interpreter the kernel is running on: first the full version
# string, then the structured version tuple, each under its own heading.
for heading, detail in (
    ("Python version", sys.version),
    ("Version info.", sys.version_info),
):
    print(heading)
    print(detail)

In [None]:
# Required installation
!pip install transformers
!pip install onnx
!pip install onnxruntime
!pip install onnxruntime_extensions

In [None]:
import glob
import os
import json
import torch
import torch.nn.functional as F
import onnxruntime
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
from onnx import numpy_helper
import numpy as np

### **GPT2**

There are 3 ONNX models available in the ONNX repo. (https://github.com/onnx/models/tree/main/text/machine_comprehension). **gpt2-lm-head** is chosen for testing the effect of fault injection.  

### **Model details**
**Task**: The task here is 'Text Prediction'. \
**Input to the model**: Sequence of words as a string. Example: "The chair is white and the table is"\
**Model Prediction**: black

The three available ONNX models, and the reason for choosing gpt2-lm-head, are given below:

1. **GPT2** : It outputs only the **last hidden state**. Since it does not output any prediction score, the text prediction task cannot be performed
  - Output of the model: (last_hidden_state, past)
2. **GPT2-LM-HEAD**:  Outputs prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Therefore in the post-processing steps, softmax needs to be applied before converting the tokens to string.
  - Output of the model: (prediction_scores, past)
3. **GPT2-bs**: This GPT-2 model with generation produces the result without any extra code or algorithm. A beam search algorithm is already embedded in the ONNX model, so no post-processing code is needed for inference. However, converting this ONNX model to LLVM IR currently fails (https://github.com/DependableSystemsLab/LLTFI/issues/42). 
  - Output of the model: (Output tokens)

For the reasons above, gpt2-lm-head was chosen for testing fault injection.

**References:** \
https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt-2 \
https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt2-bs


Below is the working code for **gpt2-bs**

In [None]:
# Download the gpt2-lm-head-bs-12 ONNX model
!wget https://github.com/onnx/models/raw/main/text/machine_comprehension/gpt2-bs/model/gpt2-lm-head-bs-12.onnx

In [None]:
# Code for executing the gpt2-lm-head-bs-12.onnx model and perform Text prediction task
from transformers import GPT2Tokenizer
from onnxruntime_extensions import PyOrtFunction

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The tokenizer is configured to pad on the left, reusing the
# end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

gpt2_all = PyOrtFunction.from_model('gpt2-lm-head-bs-12.onnx')
encdict = tokenizer('Today is a beautiful sunny day. This is', padding=True, return_tensors='np')

len_str = 30 # Length of the text you want the model to predict
# Pass len_str itself (previously a hard-coded 30 was passed and the variable
# above was ignored, so changing it had no effect).
outputs = gpt2_all(encdict['input_ids'], encdict['attention_mask'].astype('float32'), len_str)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output of the above cell (Predicted text in bold): 

Today is a beautiful sunny day. This is **the first time in my life that I have been able to see the sun. It is a beautiful day. This is the first time in my life**

# GPT2-lm-head

In [None]:
# Convert text inputs to input.pb files

# Named `prompts` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
prompts = [
    "This chair is white and the table is",
    "It is bright and",
    "I am a doctor and I work at a",
    "I like playing with my",
    "A rose by any other name would smell as",
]

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

for index, prompt in enumerate(prompts):
  tokens = np.array(tokenizer.encode(prompt)) # Shape : (len,)
  input_arr = tokens.reshape(1,1,-1) # Shape : (1,1,len)
  input_tensor = numpy_helper.from_array(input_arr)
  # One serialized tensor per prompt, written as input_0.pb, input_1.pb, ...
  with open("input_{}.pb".format(index), 'wb') as file:
      file.write(input_tensor.SerializeToString())

Execute LLTFI with this input. Below is the code to convert LLTFI output to text

In [None]:

ROOT = os.getcwd()
# Reads all the text files from 'prog_output' directory
PROG_OUT = os.path.join(ROOT, 'prog_output')

# glob yields paths in arbitrary, filesystem-dependent order. Sort them so the
# positions in list_output_np are reproducible from run to run.
# Note: the order is lexicographic ("x10.txt" sorts before "x2.txt").
txtfiles = sorted(glob.glob(os.path.join(PROG_OUT, "*.txt")))

# List to store all the outputs: one inner list per file, holding the 'Data'
# field of every entry of that file's top-level JSON object.
listResArr = []
for filename in txtfiles:
  with open(filename, "r") as read_file:
      resultJson = json.load(read_file)
  # The file is closed as soon as it is parsed; the extraction needs only the dict.
  listResArr.append([value['Data'] for value in resultJson.values()])

# Reshape the output: only the first 'Data' entry of each file is used, and it
# is viewed as (1, 1, -1, 50257); the -1 axis is inferred from the data size.
list_output_np = [
    np.asarray(elem[0]).reshape(1, 1, -1, 50257) for elem in listResArr
]

For one output (one layeroutput.txt file):

In [None]:
# Pick the first entry of list_output_np as the single output to decode.
output_np = list_output_np[0]

In [None]:
# Script to convert numpy output to text
# The prompt below has to be the one whose LLTFI output is stored in output_np.
prompt_ids = tokenizer.encode("It is bright and", add_special_tokens=True)
input_to_model = torch.tensor([[prompt_ids]]) # [1, 1, len]

output = input_to_model[0] # [1, len]
prev = output

# Take the scores at the last sequence position and pick the most likely token.
logits = torch.tensor(output_np[0][:, -1, :])
probs = F.softmax(logits, dim=-1)
_, prev = torch.topk(probs, k=1, dim=-1)
output2 = torch.cat((output, prev), dim=1)
output = output2
input_to_model = output2

# Strip the prompt using the length of the prompt encoded in THIS cell.
# The previous code used len(tokens), a variable left over from another cell;
# on a fresh top-to-bottom run it held a different, longer prompt, so the
# slice came out empty.
output1 = output2[:, len(prompt_ids):].tolist()
for generated_ids in output1:
    text = tokenizer.decode(generated_ids)
    print(text)

**Below is the code to run inference in this jupyter notebook using the ONNX model directly**

In [None]:
!wget https://github.com/onnx/models/raw/main/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx

In [None]:
def flatten(inputs):
    """Wrap ``inputs`` in a one-element list, recursing into lists and tuples.

    A value that is neither a list nor a tuple comes back as ``[value]``.
    A list or tuple comes back as ``[[flatten(e) for e in inputs]]``, i.e. each
    element is wrapped recursively and the result is wrapped once more.
    The nested result is meant to be linearised by ``update_flatten_list``.
    """
    if not isinstance(inputs, (list, tuple)):
        return [inputs]
    wrapped_items = [flatten(item) for item in inputs]
    return [wrapped_items]


def update_flatten_list(inputs, res_list):
    """Append every leaf of the nested ``inputs`` to ``res_list``, depth first.

    Lists and tuples are descended into; anything else is treated as a leaf.
    ``res_list`` is extended in place and the same object is returned.
    """
    for item in inputs:
        if isinstance(item, (list, tuple)):
            update_flatten_list(item, res_list)
        else:
            res_list.append(item)
    return res_list

def to_numpy(x):
    """Return ``x`` as a NumPy array.

    An object whose exact type is ``np.ndarray`` is returned unchanged.
    Anything else is handled through the tensor-style interface
    (``requires_grad``, ``detach()``, ``cpu()``, ``numpy()``): it is detached
    first when it requires grad, then moved to CPU and converted.
    """
    if type(x) is np.ndarray:
        return x
    tensor = x.detach() if x.requires_grad else x
    return tensor.cpu().numpy()

# Sessions already created by inference(), keyed by model path. Loading the
# ONNX file is expensive, and inference() is called once per generated token,
# so each model is loaded only the first time it is requested.
_ORT_SESSIONS = {}


def inference(inputs, model_path="gpt2-lm-head-10.onnx"):
    """Run the ONNX model on ``inputs`` and return the list of raw outputs.

    Args:
        inputs: A tensor/array, or an arbitrarily nested list/tuple of them.
            The leaves are matched, in order, to the model's declared inputs.
        model_path: Path of the ONNX file to run. Defaults to the
            gpt2-lm-head model downloaded by this notebook.

    Returns:
        The result of ``InferenceSession.run(None, ...)``, i.e. all model
        outputs.

    Note:
        The session for a given ``model_path`` is cached for the lifetime of
        the kernel; replace the file on disk and the cached session is NOT
        refreshed until ``_ORT_SESSIONS`` is cleared.
    """
    inputs_flatten = update_flatten_list(flatten(inputs), [])

    ort_session = _ORT_SESSIONS.get(model_path)
    if ort_session is None:
        # Newer ONNX Runtime builds that ship several execution providers
        # require the providers argument to be given explicitly. This notebook
        # installs the CPU package, so the CPU provider is requested.
        # With a CUDA build, use providers=['CUDAExecutionProvider'] instead.
        ort_session = onnxruntime.InferenceSession(
            model_path, providers=["CPUExecutionProvider"]
        )
        _ORT_SESSIONS[model_path] = ort_session

    # Pair the i-th flattened input with the i-th input declared by the model.
    session_inputs = ort_session.get_inputs()
    ort_inputs = {
        session_inputs[i].name: to_numpy(value)
        for i, value in enumerate(inputs_flatten)
    }
    return ort_session.run(None, ort_inputs)

In [None]:
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
import tensorflow as tf  # NOTE(review): not used in this cell; the code below relies on torch only

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text = "It is bright and"
length = 10 # Length of the text you want the model to predict

# Encode the prompt once; the same ids feed the model and give the prompt length.
prompt_ids = tokenizer.encode(text, add_special_tokens=True)
tokens = np.array(prompt_ids) # kept as a notebook-level name that other cells read

input_to_model = torch.tensor([[prompt_ids]]) # [1, 1, len]
output = input_to_model[0] # [1, len]
prev = output
print(output)

# Greedy decoding: each step feeds the whole sequence generated so far back
# into the model and appends the single most likely next token.
for _ in range(length):
  if input_to_model.dim() == 2:
    input_to_model = input_to_model.unsqueeze(0) # [1, len] -> [1, 1, len]
  result = inference(input_to_model)
  logits = torch.tensor(result[0][0][:, -1, :]) # scores at the last position
  probs = F.softmax(logits, dim=-1)
  _, prev = torch.topk(probs, k=1, dim=-1)
  output = torch.cat((output, prev), dim=1)
  input_to_model = output

# `output` (not a variable assigned only inside the loop) is used here, so the
# cell also works when length == 0 and simply decodes an empty continuation.
output2 = output
output1 = output[:, len(prompt_ids):].numpy().tolist()
for generated_ids in output1:
    text = tokenizer.decode(generated_ids)
    print(text)

## Note: The above code uses pytorch. 

Output: sunny in the morning, and the sun is shining