# Fine-tune a DistilBERT model

#### Load dataset from downloads 📥📥

In [1]:
!pip install transformers==4.28.0 datasets evaluate accelerate 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import os
# Load the labeled dataset; the CSV is expected in the current working directory.
df = pd.read_csv("labeled_dataset.csv")

# Show the first rows as a quick sanity check of the columns that were read.
df.head()

Unnamed: 0,annotation_id,annotator,created_at,id,lead_time,question,sentiment,updated_at
0,194,1,2023-10-13T21:02:06.194361Z,195,2.511,Do you have shirts?,Inventory,2023-10-13T21:02:06.194370Z
1,195,1,2023-10-13T21:02:10.562741Z,196,1.604,Do you have t-shirts?,Inventory,2023-10-13T21:02:10.562761Z
2,196,1,2023-10-13T21:02:13.785399Z,197,2.348,I want to buy a short to go out for a party,Inventory,2023-10-13T21:02:13.785410Z
3,197,1,2023-10-13T21:02:16.348390Z,198,1.652,I wanna buy a coffe mug,Inventory,2023-10-13T21:02:16.348404Z
4,198,1,2023-10-13T21:02:20.943221Z,199,3.431,I want to complete my order,Checkout,2023-10-13T21:02:20.943231Z


In [3]:
# --- Run configuration (partly overridable through environment variables) ---

# Fraction of the labeled rows held out for computing evaluation metrics.
test_size = 0.2

# Number of target classes (env var `number_labels`, default 2). Environment
# values are always strings, so cast to int before the value is passed on as
# `num_labels` when the model is created.
number_of_labels = int(os.environ.get('number_labels', 2))

# Column that holds the class name of each row.
label_column_name = 'sentiment'

# Column that holds the text to classify (env var `prompt_column`, default 'question').
text_column_name = os.environ.get('prompt_column', 'question')

In [4]:
# Echo the effective configuration so it is recorded in the notebook output.
print(
    f"The number of labels to be showed are: {number_of_labels} "
    f"with a label colum name: {label_column_name} "
    f"and a prompt column name: {text_column_name} "
    f"(The test size is {test_size})"
)

The number of labels to be showed are: 2 with a label colum name: sentiment and a prompt column name: question (The test size is 0.2)


## Define mappings 

Define a mapping from each category name to an integer label so the model can be trained properly. For example:

```python
category_to_label = {
    'availability': 0,
    'irrelevant': 1,
    'post sale': 2,
    'invoice':3,
    'service':4,
    'pricing':5,
    'general':6,
    'cancelation policy':7,
    'cancel reservation':8
}
```

In [5]:
### For now the label map is defined inline in the notebook.
category_to_label = {
    "Inventory": 0,
    "Checkout": 1,
}

# Translate class names into integer ids. `Series.map` yields NaN for any value
# that is missing from the mapping, so unmapped rows are reported here instead
# of silently staying in the column as strings.
encoded_labels = df[label_column_name].map(category_to_label)
unmapped = df.loc[encoded_labels.isna(), label_column_name].unique().tolist()
if unmapped:
    raise ValueError(f"Labels missing from category_to_label: {unmapped}")

# Add the integer `label` column and remove the columns listed below in one
# step. errors="ignore" keeps the cell re-runnable once they are already gone.
columns_to_drop = ['annotation_id', 'annotator', 'created_at', 'id', 'lead_time', 'updated_at']
df = (
    df.assign(label=encoded_labels.astype("int64"))
    .drop(columns=columns_to_drop, errors="ignore")
)

In [6]:
# Display the class-name -> integer-id mapping for a visual check.
category_to_label

{'Inventory': 0, 'Checkout': 1}

In [7]:
# Preview the frame after the `label` column was added and the unused columns were dropped.
df.head(3)

Unnamed: 0,question,sentiment,label
0,Do you have shirts?,Inventory,0
1,Do you have t-shirts?,Inventory,0
2,I want to buy a short to go out for a party,Inventory,0


In [8]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split, and therefore the metrics computed on
# the held-out part, repeatable across kernel restarts.
df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

In [9]:
from datasets import Dataset

# preserve_index=False stops the (shuffled) pandas index from being stored as
# an extra `__index_level_0__` column in the resulting datasets.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)

In [10]:
from transformers import AutoTokenizer
# Pretrained checkpoint used for both the tokenizer and the model.
# NOTE: `model_name` is rebound further down to the name of the fine-tuned model.
model_name='distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def preprocess_function(examples):
    """Tokenize the text column of a batch of examples with truncation enabled.

    `examples` is indexed by `text_column_name`; the tokenizer output is
    returned unchanged.
    """
    texts = examples[text_column_name]
    return tokenizer(texts, truncation=True)

In [12]:
# Tokenize the train split first, then the test split, both in batched mode.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint as a sequence classifier with `number_of_labels` output classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=number_of_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.we

In [14]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it is handed to the Trainer as `data_collator`.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` is unpacked into (logits, labels). The predicted class of each
    row is the arg-max over the last axis of the logits, and the result of
    `metric.compute` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [16]:
# Name of the fine-tuned model (env var `model_name`, with a fallback default).
# From here on `model_name` no longer holds the pretrained checkpoint id.
model_name = os.environ.get("model_name", "medusa_retail_intent")

In [None]:
# Hyperparameters plus the per-epoch evaluation and logging cadence.
training_args = TrainingArguments(
    output_dir="./output",
    hub_model_id=model_name,
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
)

# Wire the model, data, tokenizer, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
### Fine-tune the model: runs the training configured on `trainer`.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6568,0.353137,1.0
2,0.246,0.052877,1.0
3,0.0362,0.012131,1.0
4,0.0107,0.006282,1.0
5,0.0062,0.005072,1.0


TrainOutput(global_step=20, training_loss=0.19116783225908876, metrics={'train_runtime': 20.2182, 'train_samples_per_second': 7.419, 'train_steps_per_second': 0.989, 'total_flos': 649918174656.0, 'train_loss': 0.19116783225908876, 'epoch': 5.0})

In [None]:
# Show the name that the save and upload steps below will use.
print(model_name)

medusa_retail_intent


In [None]:
## Save the fine-tuned PyTorch model into a local directory named after `model_name`
trainer.save_model(model_name)

In [25]:
!pip install onnx onnxoptimizer -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import onnx
import onnxoptimizer

# Reload the fine-tuned DistilBERT model and tokenizer from the directory that
# was written by `trainer.save_model(model_name)`.
model_checkpoint = model_name
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

# Example sentence; it is only used to trace the graph during export.
text = "Do you have red t shirts?"
input_data = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors="pt")

# Export the PyTorch model to ONNX.
# Without `dynamic_axes` the exported graph is fixed to the exact shape of the
# dummy input (one sentence of this particular token length). Marking the batch
# and sequence dimensions as dynamic lets the exported model accept inputs of
# other lengths. Input and output names are unchanged.
onnx_path = "model.onnx"
dummy_input = input_data["input_ids"]
torch.onnx.export(
    model,
    (dummy_input,),
    onnx_path,
    input_names=['input_ids'],
    output_names=['logits'],
    dynamic_axes={
        'input_ids': {0: 'batch', 1: 'sequence'},
        'logits': {0: 'batch'},
    },
)

# Load the exported ONNX model back from disk
onnx_model = onnx.load(onnx_path)

# Run the onnxoptimizer passes over the exported graph
optimized_model = onnxoptimizer.optimize(onnx_model)

# Write the optimized model as serialized bytes
optimized_onnx_path = "optimized_model.onnx"
with open(optimized_onnx_path, "wb") as f:
    f.write(optimized_model.SerializeToString())


In [28]:
## Target S3 bucket (env var `bucket_name`, with a fallback default)
bucket_name = os.environ.get('bucket_name', "ecommerce-medusa")

In [29]:
## AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY should be set as Env variables.
## `os.environ.get` returns None for a variable that is not set.
key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

In [30]:
import boto3

# Build a single session from the credentials read from the environment and
# derive the S3 client from it, so the credentials are passed once. The
# previous `from boto3 import session` was dropped because the name `session`
# is rebound to the Session object on the next line.
session = boto3.session.Session(aws_access_key_id=key_id, aws_secret_access_key=secret_key)
s3_client = session.client('s3')

In [33]:
# Upload the optimized ONNX file to S3 under models/intent/<model_name>/model.onnx
s3_client.upload_file(
    Filename='optimized_model.onnx',
    Bucket=bucket_name,
    Key=f"models/intent/{model_name}/model.onnx",
)

In [34]:
## Delete the local artifacts now that the model has been uploaded
import shutil

# Existence checks keep this cell re-runnable: a path that is already gone no
# longer aborts the cleanup and leaves the remaining artifacts behind.

# Directories: the saved PyTorch model and the Trainer output directory.
for directory in (model_name, "output"):
    if os.path.isdir(directory):
        shutil.rmtree(directory)

# Files: both ONNX exports and the input CSV.
# NOTE(review): labeled_dataset.csv is the file read at the top of the notebook;
# once it is removed the notebook cannot be re-run until the file is provided
# again. Confirm this is intended.
for file_path in ("optimized_model.onnx", "model.onnx", "labeled_dataset.csv"):
    if os.path.exists(file_path):
        os.remove(file_path)

print(f"{model_name} pytorch has been deleted.")
print("ONNX model has been deleted.")


medusa_retail_intent pytorch has been deleted.
ONNX model has been deleted.
