In [53]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer
import emoji
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import Trainer,TrainingArguments
from datasets import Dataset
from transformers import DataCollatorWithPadding
import joblib

In [47]:
def create_dataset(filename):
    data = []
    sentence = []
    emojis = []

    with open(f'/kaggle/input/emojifydata-en/{filename}.txt', 'r') as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                if sentence:
                    sent = " ".join([w for w in sentence if w not in ['<START>', '<STOP>']])
                    emjs = "".join(emojis)
                    data.append((sent, emoji.emojize(emjs, language='alias')))
                    sentence = []
                    emojis = []
            else:
                if len(line.split()) == 2:
                    word, tag = line.split()
                    sentence.append(word)
                    if tag != 'O':
                        emojis.append(tag)

        # Handle last sentence if file doesn’t end with a newline
        if sentence:
            sent = " ".join([w for w in sentence if w not in ['<START>', '<STOP>']])
            emjs = "".join(emojis)
            data.append((sent, emoji.emojize(emjs, language='alias')))

    df = pd.DataFrame(data, columns=['Text', 'Emoji'])
    return df

In [3]:
train_df=create_dataset('train')

99514776it [01:39, 999657.67it/s] 


In [4]:
train_df.head(10)

Unnamed: 0,Text,Emoji
0,CeeC is going to be another Tboss What is 45 m...,😂
1,This gif kills me Death is literally gushing t...,😩
2,LOVE TEST Raw Real JaDine,💜
3,i swear we dont gotta look it finds us,😂
4,We would like to wish everyone a very Happy Ne...,🎉
5,15000 retweets a new song song off “ Swaecatio...,🗣️
6,just know ilysm k bye friend,💜
7,Too glam to give a damn,✨
8,🏼 🏼 fuck that sicko,👏👏
9,Can I marry someone who understands all these ...,😭


In [None]:
def extract_emoji(text):
    return [c for c in text if c in emoji.EMOJI_DATA]
train_df['clean emoji']=train_df['Emoji'].apply(extract_emoji)

In [6]:
train_df.head(10)

Unnamed: 0,Text,Emoji,clean emoji
0,CeeC is going to be another Tboss What is 45 m...,😂,[😂]
1,This gif kills me Death is literally gushing t...,😩,[😩]
2,LOVE TEST Raw Real JaDine,💜,[💜]
3,i swear we dont gotta look it finds us,😂,[😂]
4,We would like to wish everyone a very Happy Ne...,🎉,[🎉]
5,15000 retweets a new song song off “ Swaecatio...,🗣️,[🗣]
6,just know ilysm k bye friend,💜,[💜]
7,Too glam to give a damn,✨,[✨]
8,🏼 🏼 fuck that sicko,👏👏,"[👏, 👏]"
9,Can I marry someone who understands all these ...,😭,[😭]


In [7]:
train_df =  train_df[train_df['clean emoji'].map(len) > 0].sample(n=500_000, random_state=42).reset_index(drop=True)

In [8]:
train_df.shape

(500000, 3)

In [9]:
emojis_in_data=set()
for i in train_df.index:
    emj=train_df.iloc[i]['clean emoji']
    for j in emj:
        emojis_in_data.update(j)
    

In [10]:
emojis_in_data

{'‼',
 '☺',
 '♀',
 '♂',
 '♥',
 '✔',
 '✨',
 '❤',
 '➡',
 '🌟',
 '🎉',
 '🏆',
 '👀',
 '👇',
 '👉',
 '👌',
 '👍',
 '👏',
 '💀',
 '💕',
 '💖',
 '💙',
 '💛',
 '💜',
 '💥',
 '💪',
 '💯',
 '🔥',
 '🗣',
 '😁',
 '😂',
 '😉',
 '😊',
 '😍',
 '😎',
 '😘',
 '😢',
 '😩',
 '😭',
 '😳',
 '🙄',
 '🙌',
 '🙏',
 '🚨',
 '🤔',
 '🤣',
 '🤦',
 '🤷'}

In [11]:
train_df = train_df[train_df['clean emoji'].map(len) > 0].reset_index(drop=True)
train_df
print(train_df.shape)

(500000, 3)


In [12]:
mlb=MultiLabelBinarizer()
emoji_label=mlb.fit_transform(train_df['clean emoji'])

emoji_id_to_label={i:e for i,e in enumerate(mlb.classes_)}
emoji_label_to_id={e:i for i,e in emoji_id_to_label.items()}

In [13]:
train_df["labels"] = list(emoji_label)

In [14]:
tokenizer=AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)
num_labels=emoji_label.shape[1]
model=AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
dataset = Dataset.from_pandas(train_df[["Text", "labels"]])

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
    tokens = tokenizer(
        example["Text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    tokens["labels"] = example["labels"]
    return tokens
    
tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4)

In [17]:
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_dataset = tokenized_dataset.map(
    lambda x: {"labels": [list(map(float, l)) for l in x["labels"]]},
    batched=True
)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

In [49]:
class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        for f in features:
            f['labels'] = torch.tensor(f['labels'], dtype=torch.float32)
        return super().__call__(features)


In [19]:
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [20]:
data_collator = CustomDataCollator(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir='tweet-emoji',
    per_device_train_batch_size=16,
    num_train_epochs=2,
    save_strategy='epoch',
    logging_steps=1000,
    # evaluation_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


  trainer = Trainer(
  f['labels'] = torch.tensor(f['labels'], dtype=torch.float32)


Step,Training Loss
1000,0.1167
2000,0.0878
3000,0.0838
4000,0.0821
5000,0.0793
6000,0.0786
7000,0.0776
8000,0.0772
9000,0.0757
10000,0.0748


  f['labels'] = torch.tensor(f['labels'], dtype=torch.float32)


TrainOutput(global_step=62500, training_loss=0.0674023611755371, metrics={'train_runtime': 13154.7956, 'train_samples_per_second': 76.018, 'train_steps_per_second': 4.751, 'total_flos': 6.5804931072e+16, 'train_loss': 0.0674023611755371, 'epoch': 2.0})

In [21]:
! mkdir /kaggle/working/model

In [23]:
model_dir='model'
model.save_pretrained(model_dir,save_serialization=True)
tokenizer.save_pretrained(model_dir)
joblib.dump(mlb,'model\mlb_emoji_encoder.pkl')

['model\\mlb_emoji_encoder.pkl']

In [24]:
joblib.dump(mlb,'/kaggle/working/model/mlb_emoji_encoder.pkl')

mkdir: missing operand
Try 'mkdir --help' for more information.


['/kaggle/working/model/mlb_emoji_encoder.pkl']

In [26]:
mlb.classes_

array(['‼', '☺', '♀', '♂', '♥', '✔', '✨', '❤', '➡', '🌟', '🎉', '🏆', '👀',
       '👇', '👉', '👌', '👍', '👏', '💀', '💕', '💖', '💙', '💛', '💜', '💥', '💪',
       '💯', '🔥', '🗣', '😁', '😂', '😉', '😊', '😍', '😎', '😘', '😢', '😩', '😭',
       '😳', '🙄', '🙌', '🙏', '🚨', '🤔', '🤣', '🤦', '🤷'], dtype=object)

In [27]:
import shutil
shutil.make_archive('tweet_emoji_bert', 'zip', '/kaggle/working/model')

'/kaggle/working/tweet_emoji_bert.zip'

## Evaluation

In [29]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.

In [65]:
import evaluate
from sklearn.metrics import accuracy_score,f1_score

In [31]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='micro')
    return {"accuracy": acc, "f1": f1}

In [32]:
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("model")

In [38]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,  
    compute_metrics=compute_metrics
)

  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [39]:
import os
os.environ["WANDB_DISABLED"] = "true"

train_metrics = trainer.evaluate(eval_dataset=tokenized_dataset)
print("Training Set Metrics:", train_metrics)

  f['labels'] = torch.tensor(f['labels'], dtype=torch.float32)


Training Set Metrics: {'eval_loss': 0.05261430889368057, 'eval_model_preparation_time': 0.0033, 'eval_accuracy': 0.398562, 'eval_f1': 0.5596929174318587, 'eval_runtime': 2075.0598, 'eval_samples_per_second': 240.957, 'eval_steps_per_second': 30.12}


In [None]:
valid_df=create_dataset('test')

In [None]:
valid_df['clean emoji']=valid_df['Emoji'].apply(extract_emoji)
emoji_label=mlb.transform(valid_df['clean emoji'])
valid_df["labels"] = list(emoji_label)

val_dataset = Dataset.from_pandas(valid_df[["Text", "labels"]])
val_tokenized_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)
val_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_tokenized_dataset = val_tokenized_dataset.map(
    lambda x: {"labels": [list(map(float, l)) for l in x["labels"]]},
    batched=True
)
val_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

val_metrics = trainer.evaluate(eval_dataset=val_tokenized_dataset)
print("Validation Set Metrics:", val_metrics)

# Prediction Part

In [140]:
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("model")
mlb=joblib.load('/kaggle/working/model/mlb_emoji_encoder.pkl')

In [141]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [45]:
def predict_emojis(text):
    model.eval()
    inputs=tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs=model(**inputs)
        print(outputs.logits)
        probs=torch.sigmoid(outputs.logits)
        print(probs)
        predictions=(probs>=0.3).int().numpy()
    print(predictions)

    return mlb.inverse_transform(predictions)

In [43]:
predict_emojis("It is gonna be fun today, will play games and do fun activities")

tensor([[-9.9818, -3.2426, -7.5848, -9.5882, -5.2063, -6.5351, -3.4494, -3.4749,
         -8.4590, -5.3543, -4.9825, -7.8210, -5.3048, -6.4848, -6.7298, -4.4016,
         -3.0211, -5.8498, -7.4705, -2.6714, -3.8847, -4.0045, -4.1654, -3.9560,
         -7.2476, -5.9317, -6.0126, -6.0635, -7.9244, -2.0577, -2.8466, -2.5008,
         -0.2568, -3.1581, -3.9901, -3.1193, -6.1196, -5.9622, -4.9209, -6.3456,
         -4.9971, -5.3569, -4.7934, -8.4494, -5.0017, -5.1919, -9.6236, -6.9423]])
tensor([[4.6230e-05, 3.7592e-02, 5.0787e-04, 6.8531e-05, 5.4521e-03, 1.4494e-03,
         3.0786e-02, 3.0036e-02, 2.1194e-04, 4.7054e-03, 6.8100e-03, 4.0108e-04,
         4.9432e-03, 1.5242e-03, 1.1933e-03, 1.2109e-02, 4.6481e-02, 2.8721e-03,
         5.6934e-04, 6.4679e-02, 2.0140e-02, 1.7908e-02, 1.5287e-02, 1.8780e-02,
         7.1138e-04, 2.6469e-03, 2.4416e-03, 2.3208e-03, 3.6169e-04, 1.1328e-01,
         5.4859e-02, 7.5801e-02, 4.3615e-01, 4.0774e-02, 1.8161e-02, 4.2316e-02,
         2.1946e-03, 2.567

[('😊',)]

In [46]:
predict_emojis("It is gonna be boring today, I don't have anything to do")

tensor([[-10.3065,  -6.8796,  -7.9452,  -9.8666,  -8.4167,  -9.4749,  -6.8766,
          -6.6593, -10.2658,  -8.7952,  -7.2954,  -9.4924,  -6.0826,  -8.7549,
          -9.4072,  -6.9320,  -6.5513,  -6.2663,  -5.2185,  -6.2692,  -6.9855,
          -7.2521,  -7.4664,  -7.4758,  -7.7659,  -7.4456,  -5.9887,  -6.6328,
          -7.1367,  -5.2754,  -2.7389,  -6.2015,  -4.4402,  -5.4425,  -6.3196,
          -6.8769,  -2.4257,  -0.9126,  -0.7196,  -5.1986,  -1.3077,  -6.6778,
          -5.7267,  -8.6202,  -4.1181,  -5.3393,  -6.4791,  -7.3667]])
tensor([[3.3415e-05, 1.0275e-03, 3.5424e-04, 5.1877e-05, 2.2110e-04, 7.6748e-05,
         1.0306e-03, 1.2804e-03, 3.4801e-05, 1.5144e-04, 6.7823e-04, 7.5414e-05,
         2.2771e-03, 1.5766e-04, 8.2122e-05, 9.7507e-04, 1.4262e-03, 1.8957e-03,
         5.3865e-03, 1.8901e-03, 9.2437e-04, 7.0817e-04, 5.7166e-04, 5.6632e-04,
         4.2378e-04, 5.8367e-04, 2.5007e-03, 1.3148e-03, 7.9476e-04, 5.0901e-03,
         6.0714e-02, 2.0223e-03, 1.1656e-02, 4.310

[('😭',)]

## Finding correct threshold

In [48]:
valid_df=create_dataset('dev')

24885251it [00:24, 1033217.78it/s]


In [62]:
valid_df['clean emoji']=valid_df['Emoji'].apply(extract_emoji)
valid_df = valid_df[valid_df['clean emoji'].map(len) > 0].sample(n=10000, random_state=42).reset_index(drop=True)

emoji_label=mlb.transform(valid_df['clean emoji'])
valid_df["labels"] = list(emoji_label)

val_dataset = Dataset.from_pandas(valid_df[["Text", "labels"]])

val_tokenized_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)
val_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_tokenized_dataset = val_tokenized_dataset.map(
    lambda x: {"labels": [list(map(float, l)) for l in x["labels"]]},
    batched=True
)

val_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
data_collator = CustomDataCollator(tokenizer=tokenizer)

valid_loader=DataLoader(val_tokenized_dataset,batch_size=32,collate_fn=data_collator)

model.eval()
all_logits=[]
all_labels=[]

device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
with torch.no_grad():
    for batch in valid_loader:
        input_ids=batch['input_ids'].to(model.device)
        attention_mask=batch['attention_mask'].to(model.device)
        labels=batch['labels'].to(model.device)

        outputs=model(input_ids=input_ids,attention_mask=attention_mask)
        logits=outputs.logits

        all_logits.append(logits.cpu())
        all_labels.append(labels.cpu())

logits=torch.cat(all_logits).numpy()
labels=torch.cat(all_labels).numpy()

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  f['labels'] = torch.tensor(f['labels'], dtype=torch.float32)


In [63]:
def tune_thresholds(y_true,y_probs):
    thresholds=[]
    for i in range(y_true.shape[1]):
        if y_true[:,i].sum()==0:
            threshold.append(0.5)
            continue
        best_thresh=0.5
        best_f1=0
        for thresh in np.arange(0.1,0.9,0.05):
            preds=(y_probs[:,i]>=thresh).astype(int)
            score=f1_score(y_true[:,i],preds,zero_division=0)
            if score>best_f1:
                best_f1=score
                best_thresh=thresh
        thresholds.append(best_thresh)
    return np.array(thresholds)

In [66]:
tuned_thresholds_=tune_thresholds(labels,logits)

In [70]:
missing_classes = np.where(labels.sum(axis=0) == 0)[0]
print(f"Classes not present in validation set: {missing_classes}")

Classes not present in validation set: []


In [72]:
tuned_thresholds_

array([0.15, 0.1 , 0.1 , 0.1 , 0.1 , 0.15, 0.15, 0.1 , 0.1 , 0.15, 0.25,
       0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.15, 0.3 , 0.5 , 0.1 , 0.1 ,
       0.5 , 0.1 , 0.5 , 0.1 , 0.1 , 0.1 , 0.1 , 0.5 , 0.1 , 0.5 , 0.1 ,
       0.1 , 0.5 , 0.1 , 0.1 , 0.35, 0.1 , 0.1 , 0.1 , 0.1 , 0.2 , 0.15,
       0.25, 0.5 , 0.1 , 0.1 ])

In [142]:
model.to('cpu')
def predict_emojis(text):
    thresholds=np.array([0.15, 0.1 , 0.1 , 0.1 , 0.1 , 0.15, 0.15, 0.1 , 0.1 , 0.15, 0.25,
       0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.15, 0.3 , 0.5 , 0.1 , 0.1 ,
       0.5 , 0.1 , 0.5 , 0.1 , 0.1 , 0.1 , 0.1 , 0.5 , 0.1 , 0.5 , 0.1 ,
       0.1 , 0.5 , 0.1 , 0.1 , 0.35, 0.1 , 0.1 , 0.1 , 0.1 , 0.2 , 0.15,
       0.25, 0.5 , 0.1 , 0.1 ])
    model.eval()
    inputs=tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    predictions=[]
    with torch.no_grad():
        outputs=model(**inputs)
        probs=torch.sigmoid(outputs.logits).squeeze(0).numpy()

    predicted_labels=(probs>=thresholds).astype(int).reshape(1,-1)

    return mlb.inverse_transform(predicted_labels)

In [143]:
predict_emojis("It is gonna be boring today, I don't have anything to do")

[('😭', '🙄')]

In [144]:
predict_emojis("It is gonna be fun today, will play games and do fun activities. Lets go!!!")

[('💪', '🔥')]

# Quantize

In [1]:
! pip install onnx onnxruntime optimum

Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting optimum
  Downloading optimum-1.26.1-py3-none-any.whl.metadata (16 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11->optimum)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11->optimum)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11->optimum)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11->optimum)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusol

In [21]:
! pip install --upgrade optimum[onnxruntime] transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=1.2.1->optimum[onnxruntime])
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
  

In [22]:
from transformers.onnx import export
from pathlib import Path
from transformers.onnx.features import FeaturesManager
from onnxruntime.quantization import quantize_dynamic,QuantType
import onnxruntime as ort
import onnx
from optimum.exporters.tasks import TasksManager
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import numpy as np
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer


In [3]:
model=AutoModelForSequenceClassification.from_pretrained("ashish-001/tweet-emoji-predictor")

config.json:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [4]:
tokenizer=AutoTokenizer.from_pretrained("ashish-001/tweet-emoji-predictor")

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [None]:
PYTORCH_MODEL_DIR = "ashish-001/tweet-emoji-predictor"  
ONNX_UNQUANTIZED_DIR = Path("onnx_unquantized")
QUANTIZED_MODEL_DIR = Path("onnx_quantized_dynamic")

print("STAGE 1: Exporting PyTorch model to ONNX ---")


model = ORTModelForSequenceClassification.from_pretrained(
    PYTORCH_MODEL_DIR,
    export=True
)
tokenizer = AutoTokenizer.from_pretrained(PYTORCH_MODEL_DIR)


model.save_pretrained(ONNX_UNQUANTIZED_DIR)
tokenizer.save_pretrained(ONNX_UNQUANTIZED_DIR)

print(f"Unquantized ONNX model saved to: {ONNX_UNQUANTIZED_DIR}")


print("STAGE 2: Applying Dynamic Quantization ---")


quantizer = ORTQuantizer.from_pretrained(ONNX_UNQUANTIZED_DIR)


dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)


quantizer.quantize(
    save_dir=QUANTIZED_MODEL_DIR,
    quantization_config=dqconfig,
)

print(f"Dynamically quantized model saved to: {QUANTIZED_MODEL_DIR}")


print("STAGE 3: Running Inference ---")


quantized_model_path = QUANTIZED_MODEL_DIR / "model_quantized.onnx"

session = ort.InferenceSession(str(quantized_model_path))


text = "This is a sample text for inference."
inputs = tokenizer(text, return_tensors="np")


onnx_inputs = {
    "input_ids": inputs["input_ids"].astype(np.int64),
    "attention_mask": inputs["attention_mask"].astype(np.int64),
}


logits = session.run(None, onnx_inputs)[0]

print("Inference successful!")
print("Logits shape:", logits.shape)
probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
print("Probabilities:", probabilities)

## Comparing finetuned model and onnxruntime

In [34]:
import time
import torch

In [28]:
model=AutoModelForSequenceClassification.from_pretrained("ashish-001/tweet-emoji-predictor")
session = ort.InferenceSession(str(quantized_model_path))

In [31]:
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [36]:
text="Sample input for timing comparison"
input_pt=tokenizer(text,return_tensors="pt")

for _ in range(5):
    with torch.no_grad():
        _=model(**input_pt)

# Measure
start_time=time.time()
for _ in range(100):
    with torch.no_grad():
        _=model(**input_pt)
end_time=time.time()

avg_pt_time=(end_time-start_time)/100
print(f"Average PyTorch inference time: {avg_pt_time:.6f} sec")
    

Average PyTorch inference time: 0.079958 sec


In [41]:
input_np=tokenizer(text,return_tensors="np")
onnx_inputs = {
    "input_ids": input_np["input_ids"].astype(np.int64),
    "attention_mask": input_np["attention_mask"].astype(np.int64),
}

for _ in range(5):
    with torch.no_grad():
        _=session.run(None,onnx_inputs)

# Measure
start_time=time.time()
for _ in range(100):
    with torch.no_grad():
        _=session.run(None,onnx_inputs)
end_time=time.time()

avg_onnx_time=(end_time-start_time)/100
print(f"Average ONNX inference time: {avg_onnx_time:.6f} sec")
    

Average ONNX inference time: 0.026949 sec


In [45]:
relative_speed=avg_pt_time/avg_onnx_time
print(f"ONNX is {relative_speed:.2f}x faster than PyTorch")

ONNX is 2.97x faster than PyTorch


In [48]:
speedup=((avg_pt_time-avg_onnx_time)/(avg_pt_time))*100
print(f"ONNX model is {speedup:.2f}% faster than PyTorch model")

ONNX model is 66.30% faster than PyTorch model
