https://www.analyticsvidhya.com/blog/2024/06/finetuning-llama-3-for-sequence-classification/

In [22]:
# !pip install -q transformers accelerate trl bitsandbytes datasets evaluate
# !pip install -q peft scikit-learn
# !pip install -U "huggingface_hub[cli]"

In [23]:
with open("../private_/hf_read_token", "r") as f:
  token = f.readline()

hf_read = token

with open("../private_/hf_write_token", "r") as f:
  token = f.readline()

hf_write = token

In [24]:
!huggingface-cli login --token $hf_read

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/arjunsohur/.cache/huggingface/token
Login successful


In [25]:
from convokit import Corpus, download
from datasets import Dataset, DatasetDict

import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification

import numpy as np

In [26]:
from huggingface_hub import HfApi, login

# not sure if the r and w tokens are needed but oh well
login(token=hf_write)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/arjunsohur/.cache/huggingface/token
Login successful


In [27]:
%%capture
corpus = Corpus(filename=download("winning-args-corpus"))

In [72]:
import pandas as pd

ids = corpus.get_utterance_ids()
print("Len of ids", len(ids))

SPEAKER_BLACKLIST = ['DeltaBot','AutoModerator']
training_trios = []

for id in ids:
  ut = corpus.get_utterance(id)
  if ut.reply_to == ut.conversation_id and (ut.meta['success'] == 1 or ut.meta['success'] == 0) and (ut.speaker.id not in SPEAKER_BLACKLIST):
    op = corpus.get_utterance(ut.conversation_id).text
    x = ut.text
    y = ut.meta['success']

    training_trios += [(op, x, y)]

print(len(training_trios))

train_len = len(training_trios)

ones = 0
zeros = 0
total = 0

def formatting_prompts_func(training_trios):
    texts = []
    targets = []

    total = 0
    ones = 0
    zeros = 0

    for trio in training_trios:
        op, x, y = trio
        instruction = "Please determine if the following argument is successful based on the original post.  Output 1 for successful and 0 for unsuccessful.  Only output the one number, NOTHING ELSE."
        input_context = f"Original post: {op}\nArgument: {x}"

        texts.append(input_context)
        targets.append(y)

        if y:
           ones+=1
        else:
           zeros+=1
        total += 1

    return texts, targets, ones, zeros, total

# Format the data
texts, targets, ones, zeros, total = formatting_prompts_func(training_trios)

len_text = len(texts)
len_text = int(len_text*(150/8106))

v_start = int(len_text * 0.8)
v_end = int(len_text * 0.9)

train = {"text": texts[:v_start], "label": targets[:v_start]}
val = {"text": texts[v_start:v_end], "label":targets[v_start:v_end]}
test = {"text": texts[v_end:len_text], "label":targets[v_end:len_text]}

train_ds = Dataset.from_dict(train)
val_ds = Dataset.from_dict(val)
test_ds = Dataset.from_dict(test)

train_df = pd.DataFrame.from_dict(train_ds)
val_df = pd.DataFrame.from_dict(val_ds)
test_df = pd.DataFrame.from_dict(test_ds)

dataset = DatasetDict({
   'train': train_ds,
   'val': val_ds,
   'test': test_ds
})

print(dataset)

Len of ids 293297
8106
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 15
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 15
    })
})


In [73]:
class_weights=(1/train_df.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights

tensor([0.5167, 0.4833])

In [74]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, 
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True, 
    bnb_4bit_compute_dtype = torch.bfloat16 
)

model_name = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,
    device_map='auto'
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

lora_config = LoraConfig(
    r = 16, 
    lora_alpha = 8,
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, 
    bias = 'none',
    task_type = 'SEQ_CLS'
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [76]:
from transformers import AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [77]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [78]:
sentences = test_df.text.tolist()

batch_size = 32  

all_outputs = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i + batch_size]

    inputs = tokenizer(batch_sentences, return_tensors="pt", 
    padding=True, truncation=True, max_length=512)

    inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])
        
final_outputs = torch.cat(all_outputs, dim=0)
test_df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

In [79]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report

def get_metrics_result(test_df):
    y_test = test_df.label
    y_pred = test_df.predictions

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

get_metrics_result(test_df)

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.14      0.20         7
           1       0.50      0.75      0.60         8

    accuracy                           0.47        15
   macro avg       0.42      0.45      0.40        15
weighted avg       0.42      0.47      0.41        15

Balanced Accuracy Score: 0.4464285714285714
Accuracy Score: 0.4666666666666667


In [80]:
def data_preprocesing(row):
    return tokenizer(row['text'], truncation=True, max_length=1000)

tokenized_data = dataset.map(data_preprocesing, batched=True, 
remove_columns=['text'])
tokenized_data.set_format("torch")

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [81]:
from transformers import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [82]:
def compute_metrics(evaluations):
    predictions, labels = evaluations
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),
    'accuracy':accuracy_score(predictions,labels)}

In [83]:
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
import logging

global train_loss
train_loss = []


class CustomTrainer(Trainer):
    def __init__(self, *args, eval_dataset_train=None, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)

        self.eval_dataset_train = eval_dataset_train

        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").long()
        outputs = model(**inputs)
        logits = outputs.get('logits')

        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss
    
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        # Evaluate on validation set
        eval_results = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        # Evaluate on training set
        if self.eval_dataset_train is not None:
            train_results = super().evaluate(self.eval_dataset_train, ignore_keys, metric_key_prefix="train")

            train_loss.append(train_results["train_loss"])
            
            eval_results.update(train_results)
        
        print(f"Mid-train results: Val loss: {eval_results["eval_loss"]:.4f} || Train loss {eval_results["train_loss"]:.4f}")
        
        return eval_results

In [87]:
training_args = TrainingArguments(
    output_dir='persuasion_classification',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    max_steps=2000,  # 500 steps * 4 "epochs" = 2000 total steps
    logging_steps=10,
    weight_decay=0.05,
    evaluation_strategy='steps',
    eval_steps=10,
    save_steps=500,
    load_best_model_at_end=True,
    report_to="none",
    lr_scheduler_type='linear',
    gradient_accumulation_steps=1,
    warmup_steps=500,
    dataloader_drop_last=False,  # Ensure all data is used
    remove_unused_columns=False  # Prevent potential issues with custom datasets
)



In [88]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['val'],
    eval_dataset_train=tokenized_data['train'],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [89]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

train_result = trainer.train()

Step,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
10,1.6801,1.812711,0.522708,0.525
20,2.5874,1.810749,0.522708,0.525
30,1.4209,1.808327,0.522708,0.525
40,1.0513,1.804466,0.522708,0.525
50,1.8181,1.799404,0.522708,0.525
60,2.2964,1.792659,0.531429,0.533333
70,1.3822,1.783392,0.531429,0.533333
80,1.7163,1.777586,0.531429,0.533333
90,2.2607,1.758547,0.531429,0.533333
100,1.2888,1.747182,0.540068,0.541667


Mid-train results: Val loss: 1.2339 || Train loss 1.8127
Mid-train results: Val loss: 1.2332 || Train loss 1.8107
Mid-train results: Val loss: 1.2333 || Train loss 1.8083
Mid-train results: Val loss: 1.2275 || Train loss 1.8045
Mid-train results: Val loss: 1.2250 || Train loss 1.7994
Mid-train results: Val loss: 1.2234 || Train loss 1.7927
Mid-train results: Val loss: 1.2120 || Train loss 1.7834
Mid-train results: Val loss: 1.2156 || Train loss 1.7776
Mid-train results: Val loss: 1.1855 || Train loss 1.7585
Mid-train results: Val loss: 1.1707 || Train loss 1.7472
Mid-train results: Val loss: 1.1758 || Train loss 1.7365
Mid-train results: Val loss: 1.1623 || Train loss 1.7248
Mid-train results: Val loss: 1.1655 || Train loss 1.7107
Mid-train results: Val loss: 1.1412 || Train loss 1.6906
Mid-train results: Val loss: 1.1079 || Train loss 1.6677
Mid-train results: Val loss: 1.0776 || Train loss 1.6477
Mid-train results: Val loss: 1.0728 || Train loss 1.6290
Mid-train results: Val loss: 1.



Mid-train results: Val loss: 1.1390 || Train loss 0.2857
Mid-train results: Val loss: 1.2244 || Train loss 0.1916
Mid-train results: Val loss: 1.8728 || Train loss 0.2670
Mid-train results: Val loss: 1.8503 || Train loss 0.2364
Mid-train results: Val loss: 1.3278 || Train loss 0.1545
Mid-train results: Val loss: 1.1285 || Train loss 0.1526
Mid-train results: Val loss: 1.2537 || Train loss 0.1542
Mid-train results: Val loss: 1.3497 || Train loss 0.1550
Mid-train results: Val loss: 1.3604 || Train loss 0.1268
Mid-train results: Val loss: 1.3433 || Train loss 0.1113
Mid-train results: Val loss: 1.1741 || Train loss 0.0791
Mid-train results: Val loss: 1.1548 || Train loss 0.0818
Mid-train results: Val loss: 1.3230 || Train loss 0.0886
Mid-train results: Val loss: 1.4300 || Train loss 0.0986
Mid-train results: Val loss: 1.5406 || Train loss 0.1125
Mid-train results: Val loss: 1.4481 || Train loss 0.1037
Mid-train results: Val loss: 1.3951 || Train loss 0.0950
Mid-train results: Val loss: 1.



Mid-train results: Val loss: 1.2686 || Train loss 0.0830
Mid-train results: Val loss: 1.3045 || Train loss 0.0830
Mid-train results: Val loss: 1.3376 || Train loss 0.0916
Mid-train results: Val loss: 1.3572 || Train loss 0.1026
Mid-train results: Val loss: 1.3454 || Train loss 0.1046
Mid-train results: Val loss: 1.3299 || Train loss 0.0944
Mid-train results: Val loss: 1.2817 || Train loss 0.0802
Mid-train results: Val loss: 1.2459 || Train loss 0.0710
Mid-train results: Val loss: 1.2568 || Train loss 0.0747
Mid-train results: Val loss: 1.2864 || Train loss 0.0858
Mid-train results: Val loss: 1.3127 || Train loss 0.0849


In [None]:
def generate_predictions(model,df_test):
    sentences = df_test.text.tolist()
    batch_size = 32  
    all_outputs = []

    for i in range(0, len(sentences), batch_size):

        batch_sentences = sentences[i:i + batch_size]

        inputs = tokenizer(batch_sentences, return_tensors="pt", 
        padding=True, truncation=True, max_length=512)

        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') 
        for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])
        
    final_outputs = torch.cat(all_outputs, dim=0)
    df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

generate_predictions(model,test_df)
get_metrics_result(test_df)

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.53      0.49        30
           1       0.50      0.42      0.46        33

    accuracy                           0.48        63
   macro avg       0.48      0.48      0.48        63
weighted avg       0.48      0.48      0.47        63

Balanced Accuracy Score: 0.47878787878787876
Accuracy Score: 0.47619047619047616


In [None]:
generate_predictions(model,train_df)
get_metrics_result(train_df)

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.50      0.51       237
           1       0.56      0.57      0.57       263

    accuracy                           0.54       500
   macro avg       0.54      0.54      0.54       500
weighted avg       0.54      0.54      0.54       500

Balanced Accuracy Score: 0.5360173910253325
Accuracy Score: 0.538


In [None]:
test_df.head(20)

In [None]:
# model.push_to_hub("ArjunSohur/argument_classification")