In [1]:
!pip install transformers
!pip install datasets evaluate accelerate optuna optimum
!pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git
!pip install onnxruntime-gpu

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.3 MB/s[0m eta [36m0:00:0

In [2]:
import os
import time
import shutil
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
from datasets import Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoConfig
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import AutoQuantizationConfig

In [None]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [3]:
df = pd.read_csv("/content/helpdest_dataset.csv",encoding='utf-8')

In [4]:
df.head()

Unnamed: 0,text,label
0,The customer described problems with blurry im...,2
1,The customer was having issues with login cred...,2
2,The customer complained that the product they ...,2
3,The customer asked about the status of their d...,0
4,The customer needed help connecting their new ...,2


In [5]:
def finetune(df):
  train , validation = train_test_split(df , test_size=0.09 , random_state = 42 , stratify = df["label"])
  train.to_csv("train_help.csv" , index=False)
  validation.to_csv("val_help.csv",index=False)

  class_names = ['Negative','Neutral','Positive']
  ft=Features({'text':Value(dtype='string',id=None), 'label': ClassLabel(num_classes=3, names=class_names)})
  dataset=load_dataset('csv',data_files={'train':'train_help.csv','validation':'val_help.csv'},features=ft)

  model_ckpt = "distilbert-base-uncased"
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

  def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

  helpdesk_encoded = dataset.map(tokenize, batched=True, batch_size=None)
  helpdesk_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  num_labels = 3
  model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

  config = (AutoConfig.from_pretrained(model_ckpt,
                                    num_labels=len(class_names),
                                    id2label={i: label for i, label in enumerate(class_names)},
                                    label2id={label: i for i, label in enumerate(class_names)}))

  model.config = config


  def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

  batch_size = 2
  logging_steps = len(helpdesk_encoded["train"]) // batch_size
  model_name = "Venkatesh4342/distilbert-helpdesk-sentiment"
  training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=6,
    learning_rate=2e-5,
    evaluation_strategy ='steps',
    eval_steps=100,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end= True,
    logging_steps=logging_steps,
    gradient_checkpointing=True,
    push_to_hub=True)


  trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=helpdesk_encoded["train"],
                  eval_dataset=helpdesk_encoded["validation"],
                  tokenizer=tokenizer)

  trainer.train()
  trainer.save_model("fine_tuned_model")

  onnx_model = ORTModelForSequenceClassification.from_pretrained("fine_tuned_model", export=True)
  quantizer = ORTQuantizer.from_pretrained(onnx_model)
  dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
  model_quantized_path = quantizer.quantize(
    save_dir="model",
    quantization_config=dqconfig,
  )
  time.sleep(15)
  api.upload_folder(
    folder_path="model",
    repo_id="Venkatesh4342/quantized-helpdesk",
    repo_type="model"
)
  shutil.rmtree("fine_tuned_model")
  os.remove("train_help.csv")
  os.remove("val_help.csv")

In [6]:
finetune(df)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.37104,0.888889,0.88819
200,No log,0.20487,0.922222,0.921342
300,No log,0.112782,0.977778,0.977775
400,No log,0.27732,0.911111,0.908428
500,0.327400,0.113064,0.955556,0.955364
600,0.327400,0.087544,0.988889,0.988886


Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.1+cu118
  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: model (external data format: False)
Configuration saved in model/ort_config.json
