In [7]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import re

import typing as tp

import os
import torchview
import torchinfo

from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    pipeline, 
    AutoModelForSeq2SeqLM, Trainer, TrainingArguments,
)
import datasets

import peft

import evaluate

# Seed the global NumPy and PyTorch random number generators.
# The trailing `;` keeps the notebook from echoing torch.manual_seed's return value.
np.random.seed(0)
torch.manual_seed(0);

In [8]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Load the BART tokenizer from the "AiratNazmiev/text2emoji_tokenizer" checkpoint;
# `preprocess` uses it to encode both the input text and the emoji labels.
tokenizer = BartTokenizer.from_pretrained("AiratNazmiev/text2emoji_tokenizer")

In [10]:
def preprocess(example):
    """Tokenize a single text/emoji example for seq2seq training.

    Args:
        example: Mapping with a "text" entry (model input) and an "emoji"
            entry (target sequence).

    Returns:
        The tokenizer output for the input text, extended with a "labels"
        entry that holds the token ids of the tokenized emoji target.
    """
    # Source side: the plain text, truncated by the tokenizer.
    encoded_source = tokenizer(example["text"], truncation=True)

    # Target side: the emoji string is passed as `text_target`.
    encoded_target = tokenizer(text_target=example["emoji"], truncation=True)

    encoded_source["labels"] = encoded_target["input_ids"]
    return encoded_source

Let's combine a random subset of the big dataset with high-quality examples generated by `gpt` and `deepseek` models over multiple rounds; their outputs are stored in several CSV files.

In [11]:
data_root = 'data'

# Read the raw corpus, then discard rows with missing values and the 'topic' column.
text2emoji_dataset = (
    pd.read_csv(os.path.join(data_root, 'text2emoji.csv'))
    .dropna()
    .drop(columns='topic')
)

# Draw 20000 distinct row positions from the global NumPy RNG and keep only those rows.
selected_ids = np.random.choice(len(text2emoji_dataset), size=20000, replace=False)
text2emoji_selected_dataset = text2emoji_dataset.iloc[selected_ids]

# Persist the sampled subset without the index column.
clean_csv_path = os.path.join(data_root, 'text2emoji_clean.csv')
text2emoji_selected_dataset.to_csv(clean_csv_path, index=False)

In [12]:
generated_root = os.path.join('data', 'generated')

# The sampled subset comes first; every generated CSV file is appended after it.
text2emoji_dataset_list = [text2emoji_selected_dataset]

# os.listdir returns entries in arbitrary order, so sort the names to keep the
# row order of the combined dataset reproducible across machines and runs.
for generated_path in sorted(os.listdir(generated_root)):
    # Skip anything that is not a CSV file (e.g. notebook checkpoint folders or
    # OS metadata files) instead of letting pd.read_csv fail on it.
    if not generated_path.lower().endswith('.csv'):
        continue
    text2emoji_gen_dataset = pd.read_csv(os.path.join(generated_root, generated_path)).dropna()
    text2emoji_dataset_list.append(text2emoji_gen_dataset)

This is the new dataset that will be used for training.

In [13]:
# Stack all parts into one frame with a fresh 0..n-1 index, then write it to disk
# without the index column.
text2emoji_dataset_final = pd.concat(text2emoji_dataset_list, ignore_index=True)
final_csv_path = os.path.join(data_root, 'text2emoji_generated.csv')
text2emoji_dataset_final.to_csv(final_csv_path, index=False)

It is more convenient to download the dataset from the HF Hub, where I have uploaded it:

In [20]:
# Fraction of the examples held out for validation.
val_ratio = 0.3

# Take the 'train' split of the dataset as the full pool of examples.
hub_dataset = datasets.load_dataset('AiratNazmiev/text2emoji_generated')
base_dataset = hub_dataset['train']

# Seeded train/validation split; the held-out part is returned under the "test" key.
split_dataset = base_dataset.train_test_split(test_size=val_ratio, seed=0)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

(train_dataset, val_dataset)

(Dataset({
     features: ['text', 'emoji'],
     num_rows: 28329
 }),
 Dataset({
     features: ['text', 'emoji'],
     num_rows: 12141
 }))

In [21]:
base_dataset

Dataset({
    features: ['text', 'emoji'],
    num_rows: 40470
})

In [22]:
train_cache_dir = 'text2emoji_tokenized_train'
val_cache_dir = 'text2emoji_tokenized_val'

if os.path.exists(train_cache_dir) and os.path.exists(val_cache_dir):
    # Both caches are present: reuse the tokenized datasets stored on disk.
    tokenized_val_dataset = datasets.load_from_disk(val_cache_dir)
    tokenized_train_dataset = datasets.load_from_disk(train_cache_dir)
else:
    # At least one cache is missing: tokenize both splits example by example and store them.
    tokenized_val_dataset = val_dataset.map(preprocess, batched=False)
    tokenized_train_dataset = train_dataset.map(preprocess, batched=False)
    tokenized_val_dataset.save_to_disk(val_cache_dir)
    tokenized_train_dataset.save_to_disk(train_cache_dir)

# Note: the raw 'text' and 'emoji' columns are intentionally left in the tokenized datasets.

In [23]:
tokenized_val_dataset

Dataset({
    features: ['text', 'emoji', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12141
})

We will use LoRA to fine-tune the model.

In [None]:
# Pretrained seq2seq checkpoint that the LoRA adapters are attached to.
model = AutoModelForSeq2SeqLM.from_pretrained("AiratNazmiev/text2emoji-bart-base")

# Names of the sub-modules that receive LoRA adapters
# (presumably the attention projections and feed-forward layers; verify against the model).
lora_target_modules = ["k_proj", "v_proj", "q_proj", "out_proj", "fc1", "fc2"]

lora_settings = dict(
    task_type="SEQ_2_SEQ_LM",
    r=4,
    lora_alpha=8,
    lora_dropout=0.01,
    target_modules=lora_target_modules,
    init_lora_weights='olora',
    lora_bias=False,
)
lora_config = peft.LoraConfig(**lora_settings)

# Wrap the base model with the adapters described by the config.
lora_model = peft.get_peft_model(model, lora_config)

In [25]:
# Materialize the parameter list once so that both totals are computed from the same tensors.
all_parameters = list(lora_model.parameters())

# pn: every parameter of the wrapped model; tpn: only those with requires_grad set.
pn = sum(param.numel() for param in all_parameters)
tpn = sum(param.numel() for param in all_parameters if param.requires_grad)

print(f"Total number of parameters: {pn/10**6:.2f}M\nLora ft parameters: {tpn/10**6:.2f}M")

Total number of parameters: 141.94M
Lora ft parameters: 0.81M


In [26]:
# Checkpointing: where and how often model states are written.
# NOTE(review): with the "epoch" strategy, save_steps is presumably ignored — confirm in the docs.
checkpoint_args = dict(
    output_dir="./model_ft",
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_steps=1,
    save_total_limit=2,
    save_safetensors=True,
    save_only_model=False,
    load_best_model_at_end=False,
)

# Optimization: micro-batches of size 1 with 64 gradient accumulation steps.
optimization_args = dict(
    num_train_epochs=5,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=200,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,
    label_smoothing_factor=0.,
    optim="adamw_torch",
    bf16=True,
    use_cpu=False,
    seed=42,
    #max_grad_norm=1.
)

# Logging: one log record per step, progress bars enabled.
logging_args = dict(
    log_level="error",
    logging_strategy="steps",
    logging_steps=1,
    disable_tqdm=False,
)

# Evaluation: once per epoch; dataset columns the model does not use are not dropped.
# NOTE(review): eval_steps is presumably ignored with the "epoch" strategy — confirm in the docs.
evaluation_args = dict(
    eval_strategy="epoch",
    eval_steps=1,
    remove_unused_columns=False,
)

training_args = TrainingArguments(
    **checkpoint_args,
    **optimization_args,
    **logging_args,
    **evaluation_args,
)

We'll track accuracy as the validation metric.

In [27]:
# Accuracy metric loaded through the `evaluate` library; `compute_metrics` delegates to it.
accuracy_metrics = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute token-level accuracy over the non-padded label positions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as handed over by the Trainer.
            ``predictions`` is either the logits array of shape
            ``(batch, seq_len, vocab)`` or a tuple whose first element is that
            array (encoder-decoder models may return extra outputs alongside
            the logits). ``labels`` has shape ``(batch, seq_len)``.

    Returns:
        The result of ``accuracy_metrics.compute`` (a dict with an
        ``"accuracy"`` entry), evaluated on flattened token ids.
    """
    logits, labels = eval_pred

    # Keep only the logits when the model returned several outputs.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # -100 marks padded / ignored label positions; they must not count towards
    # accuracy. Boolean indexing also flattens both arrays to 1-D, which is the
    # layout the accuracy metric expects.
    valid_positions = labels != -100

    return accuracy_metrics.compute(
        predictions=predictions[valid_positions],
        references=labels[valid_positions],
    )

In [28]:
# Collect everything the Trainer needs: the LoRA-wrapped model, the run
# configuration, both tokenized splits and the validation metric callback.
trainer_kwargs = {
    "model": lora_model,
    "args": training_args,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Call PEFT's merge_and_unload() on the LoRA-wrapped model and save the returned
# model to the 'merged_model' directory.
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained('merged_model')