# Fine-tune a BERT model

#### Load dataset from downloads 📥📥

In [1]:
!pip install transformers==4.28.0 datasets evaluate accelerate 

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m329.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m257.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m331.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [3]:
import pandas as pd
import os
# Name of the labeled CSV export; it is looked up relative to the current
# working directory.
labeled_dataset = "labeled_dataset.csv"

df = pd.read_csv(labeled_dataset)

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,annotation_id,annotator,created_at,id,lead_time,question,sentiment,updated_at
0,1120,1,2023-11-08T20:41:06.680849Z,1165,2.084,"I ordered two pairs of skis, 173cm, and they s...",postSale,2023-11-08T20:41:06.680863Z
1,283,1,2023-10-30T13:20:25.196662Z,287,2.081,"What does ""lithotripsy"" mean?",irrelevant,2023-10-30T13:20:25.196675Z
2,1295,1,2023-11-10T15:55:32.460520Z,1340,1.637,I bought a summer skirt for my daughter in a s...,postSale,2023-11-10T15:55:32.460532Z
3,1085,1,2023-11-03T23:19:37.431473Z,1130,1.527,I appreciated the user-friendly payment platfo...,feedback,2023-11-03T23:19:37.431494Z
4,1307,1,2023-11-10T15:57:10.625237Z,1352,2.595,"I acquired a Rowdy hoodie in size small, but I...",postSale,2023-11-10T15:57:10.625259Z


In [4]:
# Fraction of the labeled rows held out for computing evaluation metrics.
test_size = 0.2
# Number of target classes, later passed as `num_labels` when the model is
# built. Environment variables are always strings, so the value is coerced to
# int; without the cast an override such as number_labels=6 would yield "6".
number_of_labels = int(os.environ.get('number_labels', 6))
# Column holding the category name of each row.
label_column_name = 'sentiment'
# Column holding the text to classify (overridable through the environment).
text_column_name = os.environ.get('prompt_column', 'question')
# Base name used for the saved model directory and the exported ONNX file.
model_name = "intent"

In [5]:
# Echo the effective configuration so it is recorded in the notebook output.
print(
    f"The number of labels to be showed are: {number_of_labels}"
    f" with a label colum name: {label_column_name}"
    f" and a prompt column name: {text_column_name}"
    f" (The test size is {test_size})"
)

The number of labels to be showed are: 6 with a label colum name: sentiment and a prompt column name: question (The test size is 0.2)


## Define mappings 

Define a mapping from each category name to an integer label so that the model can be trained properly. For example:

```python
category_to_label = {
    'availability': 0,
    'irrelevant': 1,
    'post sale': 2,
    'invoice':3,
    'service':4,
    'pricing':5,
    'general':6,
    'cancelation policy':7,
    'cancel reservation':8
}
```

In [6]:
### For now the map is being defined within the notebook

# Category name (as it appears in the label column) -> integer class id.
category_to_label={
 'inventory': 0,
 'checkout': 1,
 'irrelevant': 2,
 'conversational': 3,
 'feedback': 4,
 'postSale': 5
}

# Encode the label column. Series.map yields NaN for any category that is
# missing from the mapping, which lets us fail fast here instead of handing
# un-encoded string labels to the trainer.
encoded_labels = df[label_column_name].map(category_to_label)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_categories = sorted(
        df.loc[unmapped_rows, label_column_name].astype(str).unique()
    )
    raise ValueError(
        f"Values in '{label_column_name}' missing from category_to_label: "
        f"{unknown_categories}"
    )
df['label'] = encoded_labels.astype("int64")

# Keep only the columns used downstream (text, category name, label).
# errors='ignore' makes the cell safe to re-run after the columns are gone.
df = df.drop(
    columns=['annotation_id', 'annotator', 'created_at', 'id', 'lead_time', 'updated_at'],
    errors='ignore',
)

In [7]:
# Display the category -> label mapping for reference.
category_to_label

{'inventory': 0,
 'checkout': 1,
 'irrelevant': 2,
 'conversational': 3,
 'feedback': 4,
 'postSale': 5}

In [8]:
# Preview the frame after the label column was added and unused columns dropped.
df.head(3)

Unnamed: 0,question,sentiment,label
0,"I ordered two pairs of skis, 173cm, and they s...",postSale,5
1,"What does ""lithotripsy"" mean?",irrelevant,2
2,I bought a summer skirt for my daughter in a s...,postSale,5


In [9]:
from sklearn.model_selection import train_test_split


# A fixed random_state keeps the train/test partition identical across runs,
# so the reported metrics are comparable between executions.
df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

In [10]:
from datasets import Dataset

# Convert both splits to Hugging Face datasets. preserve_index=False stops the
# shuffled pandas index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)

In [11]:
from transformers import AutoTokenizer
# Checkpoint that both the tokenizer and the classification model start from.
base_model_name = "distilbert-base-uncased"

# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
def preprocess_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: Mapping that holds the texts under ``text_column_name``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts,
        called with ``truncation=True``. No padding is requested here.
    """
    texts = examples[text_column_name]
    return tokenizer(texts, truncation=True)

In [13]:
# Tokenize both splits in batches, train first and then test.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/386 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoModelForSequenceClassification

# Load the base checkpoint with a sequence-classification head sized to the
# configured number of labels.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=number_of_labels,
)

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [15]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

# Collator that pads examples when batches are assembled; the tokenization
# step above only truncates and does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Accuracy metric from the `evaluate` library; used by compute_metrics below.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Where outputs go / how the model is named
    output_dir="./output",
    hub_model_id=model_name,
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and log once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
)

# Wire model, data, collator and metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [18]:
### Fine-tune the model; evaluation and logging run once per epoch, as set in training_args
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6542,0.41716,0.917526
2,0.1907,0.316821,0.917526
3,0.1432,0.248782,0.938144
4,0.0686,0.230845,0.948454
5,0.0679,0.255307,0.958763


TrainOutput(global_step=245, training_loss=0.224959135055542, metrics={'train_runtime': 167.5709, 'train_samples_per_second': 11.518, 'train_steps_per_second': 1.462, 'total_flos': 16476807985920.0, 'train_loss': 0.224959135055542, 'epoch': 5.0})

In [19]:
## Save the fine-tuned PyTorch model under the local path given by model_name
trainer.save_model(model_name)

In [20]:
!pip install onnx onnxoptimizer -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import onnx
import onnxoptimizer

# Reload the fine-tuned DistilBERT model and tokenizer from the local
# directory written by trainer.save_model(model_name).
model_checkpoint = model_name
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
# Make sure the traced graph reflects inference behaviour.
model.eval()

# Example input used only to trace the model during export.
text = "Do you have red t shirts?"
input_data = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors="pt")

# Export the PyTorch model to ONNX.
# dynamic_axes marks the batch and sequence dimensions as variable; without it
# the exported graph is fixed to the shape of the example input above and
# cannot be fed sentences with a different number of tokens.
onnx_filename = f"{model_name}.onnx"
dummy_input = input_data["input_ids"]
torch.onnx.export(
    model,
    (dummy_input,),
    onnx_filename,
    input_names=['input_ids'],
    output_names=['logits'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence_length'},
        'logits': {0: 'batch_size'},
    },
)

# Load the exported graph and run the default onnxoptimizer passes over it.
onnx_model = onnx.load(onnx_filename)
optimized_model = onnxoptimizer.optimize(onnx_model)

# Persist the optimized graph next to the raw export.
optimized_onnx_filename = "optimized_model.onnx"
onnx.save(optimized_model, optimized_onnx_filename)


  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



In [22]:
## Destination bucket (required env variable) and key prefix (optional, with default)
bucket_name = os.environ['bucket_name']
model_path = os.getenv('model_base_dir', "ecommerce-medusa")

In [23]:
## Credentials are read from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
## env variables; each value is None when its variable is not set.
key_id = os.getenv('AWS_ACCESS_KEY_ID')
secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')

In [24]:
import boto3
from datetime import datetime

# Timestamp captured once; it is used later to name the uploaded dataset copy.
now_date = datetime.now()

# S3 client built with the credentials read from the environment.
s3_client = boto3.client(
    's3',
    aws_access_key_id=key_id,
    aws_secret_access_key=secret_key,
)

In [25]:
# Upload the optimized ONNX model and a timestamped copy of the dataset to S3
model_key = f"{model_path}/bins/{onnx_filename}"
dataset_key = f"{model_path}/datasets/dataset-{now_date.isoformat()}.csv"

s3_client.upload_file(optimized_onnx_filename, bucket_name, model_key)
s3_client.upload_file(labeled_dataset, bucket_name, dataset_key)

In [26]:
## Delete directory in Jupyter Notebook
import shutil

# Remove every local artifact of this run, in order: the saved model
# directory, both ONNX files, the dataset CSV, and the trainer output directory.
cleanup_steps = [
    (shutil.rmtree, model_name),
    (os.remove, optimized_onnx_filename),
    (os.remove, onnx_filename),
    (os.remove, labeled_dataset),
    (shutil.rmtree, "output"),
]
for remove, target in cleanup_steps:
    remove(target)