In [1]:
import datasets
datasets.__version__

  from .autonotebook import tqdm as notebook_tqdm


'2.17.1'

In [1]:
import json
import re
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset, DatasetDict
from huggingface_hub import notebook_login
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_int8_training
from trl import SFTTrainer
import evaluate
import numpy as np
from transformers import (
    AutoModelForCausalLM,
LlamaForSequenceClassification,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)


DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
base_model = "NousResearch/Llama-2-7b-hf" #"meta-llama/Llama-2-7b-chat-hf"
new_model = "Llama-2-7b-sentiment-finetune"

  from .autonotebook import tqdm as notebook_tqdm
2024-02-25 22:28:59.701808: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from datasets import load_dataset

train_dataset = load_dataset("tyqiangz/multilingual-sentiments", 'all', split='train')
train_set = train_dataset.train_test_split(test_size=0.02, stratify_by_column="label")['test']
print(train_set)
val_dataset = load_dataset("tyqiangz/multilingual-sentiments", 'all', split='validation')
val_set = val_dataset.train_test_split(test_size=0.1, stratify_by_column="label")['test']
print(val_set)
test_dataset = load_dataset("tyqiangz/multilingual-sentiments", 'all', split='test')
test_set = test_dataset.train_test_split(test_size=0.08, stratify_by_column="label")['test']
print(test_set)

Dataset({
    features: ['text', 'source', 'language', 'label'],
    num_rows: 5408
})
Dataset({
    features: ['text', 'source', 'language', 'label'],
    num_rows: 1086
})
Dataset({
    features: ['text', 'source', 'language', 'label'],
    num_rows: 1158
})


In [3]:
train_labels = list(train_set['label'])
print(train_labels.count(0))
print(train_labels.count(1))
print(train_labels.count(2))

1870
1735
1803


In [4]:
val_labels = list(val_set['label'])
print(val_labels.count(0))
print(val_labels.count(1))
print(val_labels.count(2))

406
315
365


In [5]:
test_labels = list(test_set['label'])
print(test_labels.count(0))
print(test_labels.count(1))
print(test_labels.count(2))

400
365
393


In [7]:
# define label maps
id2label = {0: "Positive", 1: "Neutral", 2:"Negative"}
label2id = {"Positive":0, "Neutral":1, "Negative":2}

# generate classification model from model_checkpoint
# model = AutoModelForSequenceClassification.from_pretrained(
#     base_model, num_labels=3, id2label=id2label, label2id=label2id)

def get_tokenizer(model_name):
    # Load LLaMA tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    return tokenizer

def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        use_safetensors=True,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        load_in_4bit=True,
        #torch_dtype=torch.float16,
        num_labels=3, id2label=id2label, label2id=label2id
    )

    #tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer = get_tokenizer(base_model)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer


model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 'load_in_8bit': False,
 'load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'float16'}

In [9]:
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="SEQ_CLS",
)

model = prepare_model_for_int8_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()



trainable params: 39,989,248 || all params: 6,647,345,152 || trainable%: 0.6015822420168502


In [10]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [11]:
# tokenize training and validation datasets
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_train_set

Map: 100%|██████████| 5408/5408 [00:00<00:00, 6628.18 examples/s]


Dataset({
    features: ['text', 'source', 'language', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5408
})

In [12]:
tokenized_val_set = val_set.map(tokenize_function, batched=True)
tokenized_val_set

Map: 100%|██████████| 1086/1086 [00:00<00:00, 11230.78 examples/s]


Dataset({
    features: ['text', 'source', 'language', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1086
})

In [13]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
auc = evaluate.load("roc_auc", "multiclass")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = torch.nn.functional.softmax(torch.Tensor(predictions),dim=1)
    argmax_predictions = torch.argmax(predictions, axis=1)

    acc_metric = accuracy.compute(predictions=argmax_predictions, references=labels)
    auc_metric = auc.compute(references=labels, prediction_scores=predictions,multi_class='ovr')

    return {"accuracy": round(acc_metric['accuracy'],5) , "auc":round(auc_metric['roc_auc'],5)}

In [15]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

OUTPUT_DIR = "classification_exp"

# %load_ext tensorboard
# %tensorboard --logdir classification_exp/runs

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=50,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=10,
    evaluation_strategy="steps",
    warmup_ratio=0.05,
    save_strategy="steps",
    save_steps = 250,
    save_total_limit = 2,
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    load_best_model_at_end="True",
    seed=42,
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_val_set,
    #peft_config=peft_config,
    #dataset_text_field="text",
    #max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    #formatting_func=formatting_prompts_func,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

trainer.model.save_pretrained(new_model)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Auc
50,1.4837,1.220451,0.36556,0.54677
100,1.0774,1.045757,0.47698,0.70663
150,0.9065,0.955734,0.54144,0.77957
200,0.7996,1.045691,0.47238,0.79831
250,0.7522,0.767317,0.62891,0.83422
300,0.6598,0.811409,0.61142,0.8377
350,0.6119,0.820795,0.61234,0.84996
400,0.5432,0.851727,0.57182,0.85164
450,0.5104,0.86355,0.61142,0.84
500,0.5234,0.792478,0.61971,0.84864


In [162]:
# Loading the base model (Llama2-7b) to check its performance on the test set

# define label maps
id2label = {0: "Positive", 1: "Neutral", 2:"Negative"}
label2id = {"Positive":0, "Neutral":1, "Negative":2}

raw_model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        use_safetensors=True,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        num_labels=3, id2label=id2label, label2id=label2id
    )

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [125]:
def predict_test(test_dataset, model, tokenizer):
    y_pred = []
    y_true = []
    prediction_scores = []
    for i in range(len(test_dataset)):
        file = test_dataset[i]
        prompt = file['text']
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
        output = model(inputs)
        logits = output.logits
        logits = logits.to(torch.float32)
        predictions = torch.nn.functional.softmax(torch.Tensor(logits),dim=1)
        argmax_predictions = torch.argmax(predictions, axis=1)
        
        prediction_scores.append(predictions.detach().cpu().numpy())
        y_pred.append(argmax_predictions.item())
        y_true.append(file['label'])
    return y_true, y_pred, prediction_scores

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
auc = evaluate.load("roc_auc", "multiclass")

# define an evaluation function to pass into trainer later
def compute_test_metrics(predictions, labels):

    predictions = [list(x[0].astype(np.float32)) for x in predictions]
    #print(predictions[:5])
    argmax_predictions = torch.argmax(torch.Tensor(predictions), axis=1)
    #print(argmax_predictions[:5])

    acc_metric = accuracy.compute(predictions=argmax_predictions, references=labels)
    auc_metric = auc.compute(references=labels, prediction_scores=predictions,multi_class='ovr')

    return {"accuracy": round(acc_metric['accuracy'],5) , "auc":round(auc_metric['roc_auc'],5)}

In [158]:
# Checking the performance of base model (Llama2-7b)

y_true, y_pred, prediction_scores = predict_test(test_set, raw_model, tokenizer)

results = compute_test_metrics(prediction_scores, y_true)
results

{'accuracy': 0.33679, 'auc': 0.48849}

In [159]:
# Loading and merging the finetuned (LORA) weights of Llama2

ft_model = PeftModel.from_pretrained(raw_model, new_model)
ft_model = ft_model.merge_and_unload()

In [163]:
# Checking the performance of the finetuned Llama2 Model

y_true, y_pred_ft, prediction_scores = predict_test(test_set, ft_model, tokenizer)

results = compute_test_metrics(prediction_scores, y_true)
results

{'accuracy': 0.62176, 'auc': 0.78785}

In [137]:
lang_list = list(test_set['language'])
eng_indices = [ind for ind, ele in enumerate(lang_list) if ele == 'english']
len(eng_indices)

87

In [138]:
test_set[eng_indices[0]]

{'text': 'Which altogether uncommonly wants so as to go through the candor regarding needful trafficking robots: oEJe ',
 'source': 'sem_eval_2017',
 'language': 'english',
 'label': 2}

In [167]:
import random
id2label = {0: "Positive", 1: "Neutral", 2:"Negative"}

def demo_test(file, model, tokenizer):
    y_pred = []
    y_true = []
    prediction_scores = []
    prompt = file['text']
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
    output = model(inputs)
    logits = output.logits
    logits = logits.to(torch.float32)
    predictions = torch.nn.functional.softmax(torch.Tensor(logits),dim=1)
    argmax_predictions = torch.argmax(predictions, axis=1)
    
    prediction_scores.append(predictions.detach().cpu().numpy())
    y_pred.append(argmax_predictions.item())
    y_true.append(file['label'])
    return y_true, y_pred, prediction_scores


for i in range(5):
    y = random.randrange(87)
    test_data = test_set[eng_indices[y]]
    print(test_data['text'])
    y_true, y_pred, prediction_scores = demo_test(test_data, raw_model, tokenizer)
    print("Ground Truth:", id2label[y_true[0]])
    print("Base Model Prediction:", id2label[y_pred[0]])
    y_true, y_pred, prediction_scores = demo_test(test_data, ft_model, tokenizer)
    print("Finetuned Model Prediction:", id2label[y_pred[0]])
    print("\n")
    
    
    

Really sounds like @user doesn't want this job. Know who did? @user It's why I'm #StillwithHer 
Ground Truth: Neutral
Base Model Prediction: Neutral
Finetuned Model Prediction: Positive


Horrible Uber Go trips! #UBER get the things sorted plz. #UberIndia 
Ground Truth: Negative
Base Model Prediction: Neutral
Finetuned Model Prediction: Negative


@user Never trust Comey😡 if it wasn't for him we would have Hillary 😠 F Comey!! Get rid of him 
Ground Truth: Negative
Base Model Prediction: Neutral
Finetuned Model Prediction: Negative


#2016 is the new #1966 first #brexit then #trumpton now a crap band from #Romford have done for the Italian president #Renzi #5* #5star 
Ground Truth: Negative
Base Model Prediction: Neutral
Finetuned Model Prediction: Negative


#cannabis The Associated PressAssistant cultivator Emily Errico examines cannabis plants grown by Vireo Health of … 
Ground Truth: Neutral
Base Model Prediction: Positive
Finetuned Model Prediction: Positive




In [169]:
#CONCLUSION:

# In this case, identifying a negative comment is much important than neutral/positive comment 
# as the goal is to make LLM understand hateful content

# As we can observe, the base model wrongly identifies most negative comments as neutral

# Whereas the finetuned model understands the negative comments better!

# We can see some misclassifications from the finetuned model as well since it is just finetuned for 10 epochs 
# NOTE that we have just finetuned on a minute subset (~2%) of the training data (due to compute restraints)

# We can get better performance as we upscale the training corpus and the finetuning hyperparameters