https://www.analyticsvidhya.com/blog/2024/06/finetuning-llama-3-for-sequence-classification/

In [1]:
!pip install -q transformers accelerate trl bitsandbytes datasets evaluate
!pip install -q peft scikit-learn
!pip install -U "huggingface_hub[cli]"



In [32]:
with open("../private_/hf_read_token", "r") as f:
  token = f.readline()

hf_read = token

with open("../private_/hf_write_token", "r") as f:
  token = f.readline()

hf_write = token

In [33]:
!huggingface-cli login --token $hf_read

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/arjunsohur/.cache/huggingface/token
Login successful


In [4]:
from convokit import Corpus, download
from datasets import Dataset, DatasetDict

import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
from huggingface_hub import HfApi, login

# not sure if the r and w tokens are needed but oh well
login(token=hf_write)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/arjunsohur/.cache/huggingface/token
Login successful


In [5]:
%%capture
corpus = Corpus(filename=download("winning-args-corpus"))

In [6]:
import pandas as pd

ids = corpus.get_utterance_ids()
print("Len of ids", len(ids))

SPEAKER_BLACKLIST = ['DeltaBot','AutoModerator']
training_trios = []

for id in ids:
  ut = corpus.get_utterance(id)
  if ut.reply_to == ut.conversation_id and (ut.meta['success'] == 1 or ut.meta['success'] == 0) and (ut.speaker.id not in SPEAKER_BLACKLIST):
    op = corpus.get_utterance(ut.conversation_id).text
    x = ut.text
    y = ut.meta['success']

    training_trios += [(op, x, y)]

print(len(training_trios))

train_len = len(training_trios)

ones = 0
zeros = 0
total = 0

def formatting_prompts_func(training_trios):
    texts = []
    targets = []

    total = 0
    ones = 0
    zeros = 0

    for trio in training_trios:
        op, x, y = trio
        instruction = "Please determine if the following argument is successful based on the original post.  Output 1 for successful and 0 for unsuccessful.  Only output the one number, NOTHING ELSE."
        input_context = f"Original post: {op}\nArgument: {x}"

        texts.append(input_context)
        targets.append(y)

        if y:
           ones+=1
        else:
           zeros+=1
        total += 1

    return texts, targets, ones, zeros, total

# Format the data
texts, targets, ones, zeros, total = formatting_prompts_func(training_trios)

v_start = int(len(texts) * 0.8)
v_end = int(len(texts) * 0.9)

train = {"text": texts[:v_start], "label": targets[:v_start]}
val = {"text": texts[v_start:v_end], "label":targets[v_start:v_end]}
test = {"text": texts[v_end:], "label":targets[v_end:]}

train_ds = Dataset.from_dict(train)
val_ds = Dataset.from_dict(val)
test_ds = Dataset.from_dict(test)

train_df = pd.DataFrame.from_dict(train_ds)
val_df = pd.DataFrame.from_dict(val_ds)
test_df = pd.DataFrame.from_dict(test_ds)

dataset = DatasetDict({
   'train': train_ds,
   'val': val_ds,
   'test': train_ds
})

print(dataset)

# NOTE: I DON'T WEIGHT THE 0 OR 1, WHICH I SHOULD PROBABLY DO
# IF PERFORMACE IS UNSATISFACTORY, I'LL DO IT

Len of ids 293297
8106
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6484
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 811
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 6484
    })
})


In [7]:
class_weights=(1/train_df.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights

tensor([0.5216, 0.4784])

In [8]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, 
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True, 
    bnb_4bit_compute_dtype = torch.bfloat16 
)

model_name = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,
    device_map='auto'
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.50s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

lora_config = LoraConfig(
    r = 16, 
    lora_alpha = 8,
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, 
    bias = 'none',
    task_type = 'SEQ_CLS'
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [10]:
from transformers import AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [12]:
sentences = test_df.text.tolist()

batch_size = 32  

all_outputs = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i + batch_size]

    inputs = tokenizer(batch_sentences, return_tensors="pt", 
    padding=True, truncation=True, max_length=512)

    inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])
        
final_outputs = torch.cat(all_outputs, dim=0)
test_df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report

def get_metrics_result(test_df):
    y_test = test_df.label
    y_pred = test_df.predictions

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

get_metrics_result(test_df)

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.36      0.41       382
           1       0.53      0.64      0.58       429

    accuracy                           0.51       811
   macro avg       0.50      0.50      0.49       811
weighted avg       0.50      0.51      0.50       811

Balanced Accuracy Score: 0.4996887928825101
Accuracy Score: 0.5080147965474723


In [14]:
def data_preprocesing(row):
    return tokenizer(row['text'], truncation=True, max_length=1000)

tokenized_data = dataset.map(data_preprocesing, batched=True, 
remove_columns=['text'])
tokenized_data.set_format("torch")

Map: 100%|██████████| 6484/6484 [00:07<00:00, 840.08 examples/s]
Map: 100%|██████████| 811/811 [00:01<00:00, 789.87 examples/s]
Map: 100%|██████████| 6484/6484 [00:07<00:00, 844.78 examples/s]


In [15]:
from transformers import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def compute_metrics(evaluations):
    predictions, labels = evaluations
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),
    'accuracy':accuracy_score(predictions,labels)}

In [17]:
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, 
            dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)

        logits = outputs.get('logits')

        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [18]:
training_args = TrainingArguments(
    output_dir = 'persuasion_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 1,
    logging_steps=1,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    report_to="none"
)



In [19]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data['train'],
    eval_dataset = tokenized_data['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights,
)


  self.class_weights = torch.tensor(class_weights,


In [20]:
train_result = trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def generate_predictions(model,df_test):
    sentences = df_test.text.tolist()
    batch_size = 32  
    all_outputs = []

    for i in range(0, len(sentences), batch_size):

        batch_sentences = sentences[i:i + batch_size]

        inputs = tokenizer(batch_sentences, return_tensors="pt", 
        padding=True, truncation=True, max_length=512)

        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') 
        for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])
        
    final_outputs = torch.cat(all_outputs, dim=0)
    df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

generate_predictions(model,test_df)
get_performance_metrics(test_df)

In [None]:
model.push_to_hub("ArjunSohur/argument_classification")