In [None]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
import pandas as pd

# Read the tab-separated training file, shuffle all rows with a fixed seed,
# and keep at most the first 50,000 rows of the shuffled frame.
data = (
    pd.read_csv('train_en.txt', sep='\t')
    .sample(frac=1, random_state=42)
    .head(50000)
)



In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
sentences = data['Sentence'].values.tolist()


In [None]:
# Tokenise all training sentences in one call: pad to a common length and
# truncate anything longer than 128 tokens.
encodings = tokenizer(sentences, padding=True, truncation=True, max_length=128)

In [None]:
# Binary targets: 1 for rows whose Style is exactly 'toxic', 0 for every other value.
labels = [1 if label == 'toxic' else 0 for label in data['Style'].values]

# Sanity check: if no row (or every row) matches 'toxic', the classifier can
# reach zero loss by always predicting one class and the reported losses are
# meaningless. Fail early and show which values the column really contains.
if len(set(labels)) < 2:
    raise ValueError(
        "Training labels contain a single class; expected both 'toxic' and "
        "non-toxic rows. Values found in data['Style']: "
        + repr(list(data['Style'].unique()[:10]))
    )


In [None]:
class ToxicDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    ``encodings`` is any mapping exposing ``.items()`` whose values can be
    indexed by sample position; ``labels`` is an indexable sequence with one
    entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn the idx-th entry of every encoding field into a tensor and
        # attach the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
dataset = ToxicDataset(encodings, labels)


In [None]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyper-parameters for the first fine-tuning run (no checkpoints are saved,
# no external experiment tracker is used).
training_args = TrainingArguments(
    output_dir='toxic_bert',
    report_to='none',
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.005,
    optim="adamw_torch",
    per_device_train_batch_size=16,
    logging_steps=2000,
    # Mixed-precision training needs a GPU; enabling it only when CUDA is
    # present keeps this cell runnable on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    save_strategy="no",
)

In [None]:
trainer1 = Trainer(model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer)

  trainer1 = Trainer(model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer)


In [None]:
trainer1.train()

Step,Training Loss
2000,0.0024
4000,0.0
6000,0.0
8000,0.0


TrainOutput(global_step=9375, training_loss=0.0005050603214899698, metrics={'train_runtime': 1324.283, 'train_samples_per_second': 113.269, 'train_steps_per_second': 7.079, 'total_flos': 9866664576000000.0, 'train_loss': 0.0005050603214899698, 'epoch': 3.0})

In [None]:
# Load the tab-separated test split and tokenise its sentences: pad to a
# common length and truncate anything longer than 128 tokens.
test_data = pd.read_csv("test_en.txt", sep="\t")
test_sentences = test_data["Sentence"].values.tolist()
test_encodings = tokenizer(test_sentences, padding=True, truncation=True, max_length=128)

In [None]:
# Binary targets for the test split (1 when Style is exactly "toxic", else 0),
# wrapped together with the encodings for use by the Trainer.
test_labels = [int(lbl == "toxic") for lbl in test_data["Style"].values]
test_dataset = ToxicDataset(test_encodings, test_labels)

In [None]:
test_results1 = trainer1.predict(test_dataset)
print("MODEL 1 TEST RESULTS:", test_results1.metrics)

# Without a compute_metrics callback the metrics above only hold loss and
# throughput. Derive class predictions from the logits so accuracy and the
# predicted-class distribution are visible as well (a single predicted class
# is a red flag for degenerate labels).
test_preds1 = np.argmax(test_results1.predictions, axis=-1)
print("MODEL 1 TEST ACCURACY:", float((test_preds1 == test_results1.label_ids).mean()))
print("MODEL 1 PREDICTED CLASS COUNTS:", np.bincount(test_preds1, minlength=2).tolist())

MODEL 1 TEST RESULTS: {'test_loss': 1.6137919089942443e-07, 'test_runtime': 95.8068, 'test_samples_per_second': 447.4, 'test_steps_per_second': 55.925}


----------------------------------------------MODEL 2----------------------------------------------

In [None]:
model2 = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Second configuration: lower learning rate, smaller batches, more epochs,
# and no weight decay. Checkpoint saving and external reporting are disabled.
training_args2 = TrainingArguments(
    output_dir="roberta_base_model2",
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.0,
    per_device_train_batch_size=8,
    save_strategy="no",
    report_to="none",
)

In [None]:
trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=dataset,
    tokenizer=tokenizer
)

  trainer2 = Trainer(


In [None]:
trainer2.train()

Step,Training Loss
500,0.0194
1000,0.0001
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0
4000,0.0
4500,0.0
5000,0.0


TrainOutput(global_step=31250, training_loss=0.0003123277931213379, metrics={'train_runtime': 2710.9291, 'train_samples_per_second': 92.219, 'train_steps_per_second': 11.527, 'total_flos': 1.644444096e+16, 'train_loss': 0.0003123277931213379, 'epoch': 5.0})

In [None]:
test_results2 = trainer2.predict(test_dataset)
print("MODEL 2 TEST RESULTS:", test_results2.metrics)

# Without a compute_metrics callback the metrics above only hold loss and
# throughput. Derive class predictions from the logits so accuracy and the
# predicted-class distribution are visible as well (a single predicted class
# is a red flag for degenerate labels).
test_preds2 = np.argmax(test_results2.predictions, axis=-1)
print("MODEL 2 TEST ACCURACY:", float((test_preds2 == test_results2.label_ids).mean()))
print("MODEL 2 PREDICTED CLASS COUNTS:", np.bincount(test_preds2, minlength=2).tolist())

MODEL 2 TEST RESULTS: {'test_loss': 0.0, 'test_runtime': 97.7686, 'test_samples_per_second': 438.423, 'test_steps_per_second': 54.803}


----------------------------------------------MODEL 3----------------------------------------------

In [None]:
model3 = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Third configuration: six epochs at learning rate 2e-5 with batch size 16
# and weight decay 0.01. Checkpoint saving and external reporting are disabled.
training_args3 = TrainingArguments(
    output_dir="roberta_base_model3",
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    save_strategy="no",
    report_to="none",
)


In [None]:
trainer3 = Trainer(
    model=model3,
    args=training_args3,
    train_dataset=dataset,
    tokenizer=tokenizer
)

  trainer3 = Trainer(


In [None]:
trainer3.train()

Step,Training Loss
500,0.0124
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0
4000,0.0
4500,0.0
5000,0.0


TrainOutput(global_step=18750, training_loss=0.00033216716170310974, metrics={'train_runtime': 2305.2891, 'train_samples_per_second': 130.136, 'train_steps_per_second': 8.133, 'total_flos': 1.9733329152e+16, 'train_loss': 0.00033216716170310974, 'epoch': 6.0})

In [None]:
test_results3 = trainer3.predict(test_dataset)
print("MODEL 3 TEST RESULTS:", test_results3.metrics)

# Without a compute_metrics callback the metrics above only hold loss and
# throughput. Derive class predictions from the logits so accuracy and the
# predicted-class distribution are visible as well (a single predicted class
# is a red flag for degenerate labels).
test_preds3 = np.argmax(test_results3.predictions, axis=-1)
print("MODEL 3 TEST ACCURACY:", float((test_preds3 == test_results3.label_ids).mean()))
print("MODEL 3 PREDICTED CLASS COUNTS:", np.bincount(test_preds3, minlength=2).tolist())

MODEL 3 TEST RESULTS: {'test_loss': 0.0, 'test_runtime': 98.4684, 'test_samples_per_second': 435.307, 'test_steps_per_second': 54.413}


Site modeli se znachitelno podobri od modelite od prvata laboratoriska vezhba.
Inicijalniot model ima mnogu mal loss, no modelite so pomal learning rate i povekje epohi imaat 0 loss.

# TASK 2

In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
model1_name = 't5-base'

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model1_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
model1 = AutoModelForSeq2SeqLM.from_pretrained(model1_name)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Draw a reproducible 200-row sample from the test file (shuffle with a fixed
# seed, keep the first 200 rows) and pull out the texts and their Style values.
data = pd.read_csv('test_en.txt', sep='\t').sample(frac=1, random_state=42).head(200)
sentences = data['Sentence'].values.tolist()
labels = data['Style'].values.tolist()

In [10]:
sample_sentence = sentences[4]
sample_sentence

'My wife bought a car here.  I handled the negotiations.  \n\nOur salesman was friendly and helpful.  Once the negotiations began however, our salesman left the picture and we were left to deal with the manager.  This experience was annoying.  I had done my research and shopped around.  The manager must not be accustomed to dealing with informed shoppers because he was obviously lying to me on a number of fronts.  Had my wife not needed the car immediately - and wanted a color that was only available this dealer - I would have moved on to another dealership.  \n\nMy advice would be that if you have to come here be armed with knowledge and info on pricing from other dealers but in general I would probably buy from a different dealer if I had to do it over.'

In [11]:
prompt = f'Classify the following text into either \'TOXIC\' or \'NEUTRAL\': {sample_sentence}'
prompt

"Classify the following text into either 'TOXIC' or 'NEUTRAL': My wife bought a car here.  I handled the negotiations.  \n\nOur salesman was friendly and helpful.  Once the negotiations began however, our salesman left the picture and we were left to deal with the manager.  This experience was annoying.  I had done my research and shopped around.  The manager must not be accustomed to dealing with informed shoppers because he was obviously lying to me on a number of fronts.  Had my wife not needed the car immediately - and wanted a color that was only available this dealer - I would have moved on to another dealership.  \n\nMy advice would be that if you have to come here be armed with knowledge and info on pricing from other dealers but in general I would probably buy from a different dealer if I had to do it over."

In [12]:
tokens = tokenizer(prompt, return_tensors='pt')

In [14]:
output_ids = model1.generate(tokens.input_ids)

In [16]:
examples = 'Text: delete the page and shut up\nClass: TOXIC\nText: I heard it was on the news.\nClass: NEUTRAL\n'


In [17]:
prompt = f'{examples}\nClassify the following text into either \'TOXIC\' or \'NEUTRAL\': {sample_sentence}\nClass:'


In [18]:
embedding_model = SentenceTransformer('all-distilroberta-v1')
embeddings = embedding_model.encode(sentences[:20], batch_size=64, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
embeddings.shape

(20, 768)

In [20]:
query_emb = embedding_model.encode([sample_sentence], batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
util.semantic_search(query_emb, embeddings, top_k=10)

[[{'corpus_id': 4, 'score': 1.0},
  {'corpus_id': 19, 'score': 0.285430371761322},
  {'corpus_id': 17, 'score': 0.28490519523620605},
  {'corpus_id': 1, 'score': 0.26936808228492737},
  {'corpus_id': 2, 'score': 0.23823367059230804},
  {'corpus_id': 11, 'score': 0.232102632522583},
  {'corpus_id': 3, 'score': 0.20569449663162231},
  {'corpus_id': 14, 'score': 0.20216311514377594},
  {'corpus_id': 8, 'score': 0.2015698254108429},
  {'corpus_id': 9, 'score': 0.14951351284980774}]]

In [22]:
t5_name = "google/flan-t5-base"
bart_name = "facebook/bart-large"

In [23]:
t5_tokenizer = AutoTokenizer.from_pretrained(t5_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [24]:
bart_tokenizer = AutoTokenizer.from_pretrained(bart_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [25]:
def classify_t5_zeroshot(text):
    """Zero-shot sentiment guess using ``t5_model``.

    Wraps ``text`` in a positive/negative question prompt, lets the model
    generate up to five new tokens, and returns 1 when the lower-cased decoded
    answer contains "positive", otherwise 0.
    """
    template = 'Is the following review positive or negative?\nReview: "{}"\nAnswer:'
    encoded = t5_tokenizer(template.format(text), return_tensors="pt")
    generated = t5_model.generate(**encoded, max_new_tokens=5)
    decoded = t5_tokenizer.decode(generated[0], skip_special_tokens=True)

    if "positive" in decoded.lower():
        return 1
    return 0


In [26]:
def classify_bart_zeroshot(text):
    """Zero-shot sentiment guess using ``bart_model``.

    Wraps ``text`` in a positive/negative question prompt, lets the model
    generate up to five new tokens, and returns 1 when the lower-cased decoded
    answer contains "positive", otherwise 0.
    """
    template = 'Is the following review positive or negative?\nReview: "{}"\nAnswer:'
    encoded = bart_tokenizer(template.format(text), return_tensors="pt")
    generated = bart_model.generate(**encoded, max_new_tokens=5)
    decoded = bart_tokenizer.decode(generated[0], skip_special_tokens=True)

    if "positive" in decoded.lower():
        return 1
    return 0


In [None]:
pred_t5_zero = [classify_t5_zeroshot(t) for t in sentences]
pred_bart_zero = [classify_bart_zeroshot(t) for t in sentences]


In [None]:
def evaluate(name, preds):
    """Print accuracy, precision, recall and F1 for ``preds`` under a heading.

    Predictions are compared against the module-level ``labels`` list. The
    classifiers in this notebook return integers (1 = "positive", 0 = other),
    while ``labels`` is read from the ``Style`` column and may hold strings;
    scikit-learn rejects a mix of string and numeric labels, so string ground
    truth is converted to 0/1 first.

    Args:
        name: Heading printed above the metrics.
        preds: Sequence of 0/1 predictions, aligned with ``labels``.
    """
    # NOTE(review): assumes the positive class is spelled "positive"
    # (any casing) in the Style column — confirm against the data file.
    y_true = [
        int(lbl.strip().lower() == "positive") if isinstance(lbl, str) else int(lbl)
        for lbl in labels
    ]

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, preds))
    print("Precision:", precision_score(y_true, preds))
    print("Recall:", recall_score(y_true, preds))
    print("F1:", f1_score(y_true, preds))


In [None]:
evaluate("T5 Zero-Shot", pred_t5_zero)
evaluate("BART Zero-Shot", pred_bart_zero)


In [None]:
# Two labelled demonstrations that are prepended to every few-shot prompt.
fewshot_examples = """
Review: "The rice was very flavorful and delicious. I went back for more! yum! "
Sentiment: positive

Review: "This place won't last."
Sentiment: negative
"""

def classify_t5_fewshot(text):
    """Few-shot sentiment guess using ``t5_model``.

    Appends ``text`` as a new review after ``fewshot_examples``, lets the
    model generate up to five new tokens, and returns 1 when the lower-cased
    decoded answer contains "positive", otherwise 0.
    """
    query = '\nReview: "{}"\nSentiment: '.format(text)
    encoded = t5_tokenizer(fewshot_examples + query, return_tensors="pt")
    generated = t5_model.generate(**encoded, max_new_tokens=5)
    decoded = t5_tokenizer.decode(generated[0], skip_special_tokens=True)
    if "positive" in decoded.lower():
        return 1
    return 0


In [None]:
t5_few_preds = [classify_t5_fewshot(x) for x in sentences]

In [None]:
# Two labelled demonstrations that are prepended to every few-shot prompt.
fewshot_examples = """
Review: "The rice was very flavorful and delicious. I went back for more! yum!"
Sentiment: positive

Review: "This place won't last."
Sentiment: negative
"""

def classify_bart_fewshot(text):
    """Few-shot sentiment guess using ``bart_model``.

    Appends ``text`` as a new review after ``fewshot_examples``, generates up
    to five new tokens, and returns 1 when the lower-cased decoded answer
    contains "positive", otherwise 0.

    Args:
        text: The review to classify.

    Returns:
        1 if the generated answer mentions "positive", else 0.
    """
    prompt = fewshot_examples + f'\nReview: "{text}"\nSentiment: '
    # bart_model is a seq2seq model object, not a text-generation pipeline:
    # it cannot be called with a raw string and does not return
    # "generated_text" dicts. Tokenise the prompt and call generate(), the
    # same way the zero-shot BART classifier does. Truncation keeps very long
    # reviews within the tokenizer's configured maximum length.
    inputs = bart_tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = bart_model.generate(**inputs, max_new_tokens=5)
    answer = bart_tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    return 1 if "positive" in answer else 0

In [None]:
bart_few_preds = [classify_bart_fewshot(x) for x in sentences]

In [None]:
def evaluate(true, pred):
    """Return accuracy, precision, recall and F1 for binary sentiment labels.

    Note that this definition replaces any earlier ``evaluate`` in the
    notebook. The classifiers here emit integers (1 = positive, 0 = other)
    while ground truth may be strings; scikit-learn cannot score a mix of
    string and numeric labels, and ``pos_label="POSITIVE"`` never matches an
    integer prediction. Both inputs are therefore normalised to 0/1 and the
    metrics are computed with 1 as the positive class.

    Args:
        true: Ground-truth labels, either 0/1 values or strings where
            "POSITIVE" (any casing) marks the positive class.
        pred: Predicted labels in the same encoding options as ``true``.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1".
    """
    def _to_binary(values):
        # Strings map to 1 only when they spell "POSITIVE"; numbers pass through.
        return [
            int(v.strip().upper() == "POSITIVE") if isinstance(v, str) else int(v)
            for v in values
        ]

    y_true = _to_binary(true)
    y_pred = _to_binary(pred)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, pos_label=1),
        "recall": recall_score(y_true, y_pred, pos_label=1),
        "f1": f1_score(y_true, y_pred, pos_label=1)
    }


In [None]:
# Score every model/prompting combination against the same ground truth and
# collect the metric dicts under a readable name.
results = {
    title: evaluate(labels, predictions)
    for title, predictions in (
        ("T5 Zero-shot", pred_t5_zero),
        ("T5 Few-shot", t5_few_preds),
        ("BART Zero-shot", pred_bart_zero),
        ("BART Few-shot", bart_few_preds),
    )
}

results
