dataset: https://www.kaggle.com/datasets/programmerrdai/genz-slang-pairs-1k


In [1]:
!pip install transformers datasets sentence-transformers torch pandas scikit-learn accelerate evaluate bert_score sacrebleu rouge_score -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, pipeline
from sentence_transformers import SentenceTransformer
import torch.nn as nn
import torch.nn.functional as F
import evaluate
from tqdm import tqdm

In [3]:
os.environ["WANDB_DISABLED"] = "true"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

df = pd.read_csv("genz_dataset.csv")
df = df.rename(columns={'gen_z': 'source_text', 'normal': 'target_text'})
print(f"Dataset loaded. Total rows: {len(df)}")

# Ensure data types
df['source_text'] = df['source_text'].astype(str)
df['target_text'] = df['target_text'].astype(str)

# Split Data
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def preprocess_function(examples):
    # Add "Translate slang to formal: " to force translation behavior
    inputs = ["Translate slang to formal: " + text for text in examples["source_text"]]
    targets = examples["target_text"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Using device: cuda
Dataset loaded. Total rows: 1005


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/703 [00:00<?, ? examples/s]



Map:   0%|          | 0/302 [00:00<?, ? examples/s]

In [4]:
model_baseline = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
# Fix BART's default limitation of 20 tokens
model_baseline.config.max_length = 128
model_baseline.to(device)

args_baseline = Seq2SeqTrainingArguments(
    output_dir="./bart_baseline_results",
    num_train_epochs=15,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None
)

trainer_baseline = Seq2SeqTrainer(
    model=model_baseline,
    args=args_baseline,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_baseline),
)

print("Starting Baseline Training...")
trainer_baseline.train()

baseline_path = "./bart_baseline_final"
trainer_baseline.save_model(baseline_path)
tokenizer.save_pretrained(baseline_path)
print("Baseline Model Saved.")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer_baseline = Seq2SeqTrainer(


Starting Baseline Training...


Epoch,Training Loss,Validation Loss
1,6.4699,0.084884
2,0.0726,0.045245
3,0.0447,0.046065
4,0.0332,0.044166
5,0.0302,0.045166
6,0.0269,0.045323
7,0.0217,0.044782
8,0.0196,0.047607
9,0.0179,0.048795
10,0.0156,0.049585


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Baseline Model Saved.


In [None]:
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
semantic_model.to(device)
semantic_model.eval()


class CustomSemanticTrainer(Seq2SeqTrainer):

    def __init__(self, *args, semantic_model=None, semantic_weight_max=0.7, warmup_steps=2000, **kwargs):
        super().__init__(*args, **kwargs)
        self.semantic_model = semantic_model
        self.semantic_weight_max = semantic_weight_max
        self.warmup_steps = warmup_steps
        self.current_step = 0
        self.proj = torch.nn.Linear(
            self.model.config.d_model,
            semantic_model.get_sentence_embedding_dimension()
        ).to(self.model.device)


    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):

        outputs = model(**inputs, output_hidden_states=True)
        ce_loss = outputs.loss

        # Decoder sentence embedding
        decoder_emb = outputs.decoder_hidden_states[-1].mean(dim=1)
        decoder_proj = self.proj(decoder_emb)

        # Target sentence embedding
        labels = inputs["labels"].clone()
        labels[labels == -100] = tokenizer.pad_token_id
        target_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

        with torch.no_grad():
            tgt_emb = self.semantic_model.encode(
                target_texts,
                convert_to_tensor=True
            ).to(decoder_proj.device)
        tgt_emb = tgt_emb.detach().clone()
        # Contrastive (InfoNCE) semantic loss
        temperature = 0.05
        sim = torch.matmul(decoder_proj, tgt_emb.T) / temperature
        contrastive_labels = torch.arange(sim.size(0)).to(sim.device)

        semantic_loss = F.cross_entropy(sim, contrastive_labels)



        # Warm-up semantic loss weight
        self.current_step += 1
        warmup_factor = min(self.current_step / self.warmup_steps, 1.0)
        semantic_weight = warmup_factor * self.semantic_weight_max

        total_loss = ce_loss + semantic_weight * semantic_loss

        return (total_loss, outputs) if return_outputs else total_loss



def freeze_lower_layers(model, num_layers=4):
    for i in range(num_layers):
        for name, param in model.model.encoder.layers[i].named_parameters():
            param.requires_grad = False
    print(f"Frozen {num_layers} encoder layers.")

model_proposed = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
freeze_lower_layers(model_proposed, num_layers=4)
model_proposed.config.max_length = 128
model_proposed.to(device)


trainer_proposed = CustomSemanticTrainer(
    model=model_proposed,
    args=args_baseline,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_proposed),
    semantic_model=semantic_model,     # <— MiniLM
    semantic_weight_max=0.3,
    warmup_steps=2000
)


print("Starting Proposed Model Training (Semantic Loss)...")
trainer_proposed.train()

# Save
proposed_path = "./bart_proposed_final"
trainer_proposed.save_model(proposed_path)
tokenizer.save_pretrained(proposed_path)
print("Proposed Model Saved.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Frozen 4 encoder layers.


  super().__init__(*args, **kwargs)


Starting Proposed Model Training (Semantic Loss)...


Epoch,Training Loss,Validation Loss
1,6.6246,0.346996
2,0.4594,0.184903
3,0.1503,0.165023
4,0.1157,0.149867
5,0.0979,0.117825
6,0.0963,0.109963
7,0.0785,0.116257
8,0.061,0.109684
9,0.0772,0.10225
10,0.0464,0.091037


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Proposed Model Saved.


In [6]:
translator_base = pipeline("translation", model=baseline_path, tokenizer=baseline_path, device=0 if device=="cuda" else -1)
translator_prop = pipeline("translation", model=proposed_path, tokenizer=proposed_path, device=0 if device=="cuda" else -1)

test_slang = "Translate slang to formal: thats cool bro no cap"

print(f"Input: {test_slang}\n")

out_base = translator_base(test_slang, max_length=128)[0]['translation_text']
print(f"Baseline Output: {out_base}")

out_prop = translator_prop(test_slang, max_length=128)[0]['translation_text']
print(f"Proposed Output: {out_prop}")

Device set to use cuda:0
Device set to use cuda:0


Input: Translate slang to formal: thats cool bro no cap

Baseline Output: I really like your new haircut.
Proposed Output: Hey, I really like your new haircut.


In [None]:
bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

test_data = dataset_dict['test']

print(f"Evaluating on {len(test_data)} test sentences...")

def evaluate_model(model_path, dataset, name="Model"):
    print(f"\n--- Evaluating {name} ---")

    # Load Translator
    translator = pipeline("translation", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

    # Prepare Inputs (Add prefix)
    inputs = ["Translate slang to formal: " + text for text in dataset["source_text"]]
    references = dataset["target_text"]

    # Generate Predictions (Batch processing for speed)
    print("Generating translations...")
    predictions = []
    # Using batch_size=8 for speed
    for out in tqdm(translator(inputs, batch_size=8, max_length=128)):
        predictions.append(out['translation_text'])

    # Calculate BLEU (Lexical Precision)
    bleu_result = bleu.compute(predictions=predictions, references=references)

    # Calculate BERTScore (Semantic Similarity)
    bert_result = bertscore.compute(predictions=predictions, references=references, lang="en")

    # Average the BERTScore (F1)
    bert_f1 = sum(bert_result['f1']) / len(bert_result['f1'])

    results = {
        "Model": name,
        "BLEU": round(bleu_result['score'], 2),
        "BERTScore F1": round(bert_f1, 4)
    }

    return results, predictions

# Evaluate Baseline
baseline_metrics, baseline_preds = evaluate_model(baseline_path, test_data, name="Baseline (BART)")

# Evaluate Proposed
proposed_metrics, proposed_preds = evaluate_model(proposed_path, test_data, name="Proposed (Semantic Loss)")

results_df = pd.DataFrame([baseline_metrics, proposed_metrics])

print("\n\n================ PERFORMANCE COMPARISON ================")
print(results_df)
print("========================================================")

print("\n--- QUALITATIVE ANALYSIS (Examples) ---")
for i in range(5):
    print(f"\nInput: {test_data['source_text'][i]}")
    print(f"Reference: {test_data['target_text'][i]}")
    print(f"Baseline:  {baseline_preds[i]}")
    print(f"Proposed:  {proposed_preds[i]}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating on 302 test sentences...

--- Evaluating Baseline (BART) ---


Device set to use cuda:0


Generating translations...


100%|██████████| 302/302 [00:00<00:00, 952390.83it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluating Proposed (Semantic Loss) ---


Device set to use cuda:0


Generating translations...


100%|██████████| 302/302 [00:00<00:00, 1052059.64it/s]




                      Model   BLEU  BERTScore F1
0           Baseline (BART)  67.02        0.9805
1  Proposed (Semantic Loss)  68.78        0.9808

--- QUALITATIVE ANALYSIS (Examples) ---

Input: Gonna snag a latte before hitting up class, fr.
Reference: I'm going to grab a coffee before heading to class.
Baseline:  I'm just going to grab a coffee before heading to class.
Proposed:  I'm just going to grab a coffee before heading to class.

Input: I'm just vibing with my friends after class.
Reference: I'm just casually hanging out with my friends after school.
Baseline:  I'm just hanging out with my friends after school.
Proposed:  I'm just hanging out with my friends after school.

Input: I'm hella worn out today and just wanna chill after work, fr.
Reference: I'm really tired today and just want to relax after work.
Baseline:  I'm really tired today and just want to relax after work.
Proposed:  I'm really tired today and just want to relax after work.

Input: Yo, wanna vibe and chi

In [8]:
import shutil

print("Creating model archives...")

# Zip baseline model
shutil.make_archive("bart_baseline_final", 'zip', "./bart_baseline_final")
print("Finished creating bart_baseline_final.zip")

# Zip proposed model
shutil.make_archive("bart_proposed_final", 'zip', "./bart_proposed_final")
print("Finished Creating bart_proposed_final.zip")

Creating model archives...
Finished creating bart_baseline_final.zip
Finished Creating bart_proposed_final.zip
