In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "1"


In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig, Trainer
from transformers import pipeline
from sklearn import metrics
import pandas as pd
import datasets as ds
from peft import PeftModel
import numpy as np

In [3]:
test = pd.read_csv('../data/headlines_test.csv')
test2 = pd.read_csv('../data/reddit_test.csv')

dataset = ds.DatasetDict({
    "test": ds.Dataset.from_pandas(test),
    "test2":ds.Dataset.from_pandas(test2)
})

modelname = "../../Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(modelname)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=160, truncation=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

### Headlines model na headlines testih

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit = True
)
model = AutoModelForSequenceClassification.from_pretrained("../training/llama3_headlines", quantization_config=quantization_config)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
prediction = np.argmax(predictions.predictions, axis=-1)
prediction = prediction.tolist()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../../Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
recall = metrics.recall_score(dataset["test"]["label"],prediction)
precision = metrics.precision_score(dataset["test"]["label"],prediction)
f1_score = metrics.f1_score(dataset["test"]["label"],prediction)
accuracy = metrics.accuracy_score(dataset["test"]["label"],prediction)
loss = metrics.log_loss(dataset["test"]["label"],prediction)

print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)

Loss: 2.871402156785014
Accuracy: 0.9203354297693921
Precision: 0.9356548069644209
Recall: 0.896301667875272
f1 score: 0.9155555555555556


In [6]:
kappa = metrics.cohen_kappa_score(dataset["test"]["label"],prediction)
print('Cohens kappa: %f' % kappa)
auc = metrics.roc_auc_score(dataset["test"]["label"],prediction)
print('ROC AUC: %f' % auc)
matrix = metrics.confusion_matrix(dataset["test"]["label"],prediction)
print(matrix)

Cohens kappa: 0.840225
ROC AUC: 0.919493
[[1398   85]
 [ 143 1236]]


### Headlines model na reddit testih

In [7]:
predictions = trainer.predict(test_dataset=tokenized_datasets["test2"])
prediction = np.argmax(predictions.predictions, axis=-1)
prediction = prediction.tolist()

In [8]:
recall = metrics.recall_score(dataset["test2"]["label"],prediction)
precision = metrics.precision_score(dataset["test2"]["label"],prediction)
f1_score = metrics.f1_score(dataset["test2"]["label"],prediction)
accuracy = metrics.accuracy_score(dataset["test2"]["label"],prediction)
loss = metrics.log_loss(dataset["test2"]["label"],prediction)

print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)

Loss: 17.829593876483283
Accuracy: 0.5053333333333333
Precision: 0.5206258890469416
Recall: 0.2419035029742234
f1 score: 0.3303249097472924


In [9]:
kappa = metrics.cohen_kappa_score(dataset["test2"]["label"],prediction)
print('Cohens kappa: %f' % kappa)
auc = metrics.roc_auc_score(dataset["test2"]["label"],prediction)
print('ROC AUC: %f' % auc)
matrix = metrics.confusion_matrix(dataset["test2"]["label"],prediction)
print(matrix)

Cohens kappa: 0.015202
ROC AUC: 0.507636
[[1150  337]
 [1147  366]]


### Reddit model na reddit testih

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("../training/llama3_reddit", quantization_config=quantization_config)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

predictions = trainer.predict(test_dataset=tokenized_datasets["test2"])
prediction = np.argmax(predictions.predictions, axis=-1)
prediction = prediction.tolist()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../../Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
recall = metrics.recall_score(dataset["test2"]["label"],prediction)
precision = metrics.precision_score(dataset["test2"]["label"],prediction)
f1_score = metrics.f1_score(dataset["test2"]["label"],prediction)
accuracy = metrics.accuracy_score(dataset["test2"]["label"],prediction)
loss = metrics.log_loss(dataset["test2"]["label"],prediction)

print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)

Loss: 9.719771863931927
Accuracy: 0.7303333333333333
Precision: 0.7699386503067485
Recall: 0.6635822868473232
f1 score: 0.7128150514731985


In [12]:
kappa = metrics.cohen_kappa_score(dataset["test2"]["label"],prediction)
print('Cohens kappa: %f' % kappa)
auc = metrics.roc_auc_score(dataset["test2"]["label"],prediction)
print('ROC AUC: %f' % auc)
matrix = metrics.confusion_matrix(dataset["test2"]["label"],prediction)
print(matrix)

Cohens kappa: 0.461277
ROC AUC: 0.730917
[[1187  300]
 [ 509 1004]]


### Reddit model na headlines testih

In [13]:
predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
prediction = np.argmax(predictions.predictions, axis=-1)
prediction = prediction.tolist()

In [14]:
recall = metrics.recall_score(dataset["test"]["label"],prediction)
precision = metrics.precision_score(dataset["test"]["label"],prediction)
f1_score = metrics.f1_score(dataset["test"]["label"],prediction)
accuracy = metrics.accuracy_score(dataset["test"]["label"],prediction)
loss = metrics.log_loss(dataset["test"]["label"],prediction)

print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)

Loss: 18.575956935341647
Accuracy: 0.48462613556953177
Precision: 0.45102040816326533
Recall: 0.3205221174764322
f1 score: 0.37473505722763883


In [15]:
kappa = metrics.cohen_kappa_score(dataset["test"]["label"],prediction)
print('Cohens kappa: %f' % kappa)
auc = metrics.roc_auc_score(dataset["test"]["label"],prediction)
print('ROC AUC: %f' % auc)
matrix = metrics.confusion_matrix(dataset["test"]["label"],prediction)
print(matrix)

Cohens kappa: -0.042689
ROC AUC: 0.478872
[[945 538]
 [937 442]]
