In [2]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [4]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.1.2+cu121


In [5]:
filename = "./data/all-data.csv"
df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")
df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [6]:
X_train = list()
X_test = list()

for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=300,
                                    test_size=300, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)


In [15]:
test

Unnamed: 0,sentiment,text
3790,negative,The company decided at the end of 2008 to temp...
4670,negative,down to EUR5 .9 m H1 '09 3 August 2009 - Finni...
4797,negative,The steelmaker said that the drop in profit wa...
2743,negative,Finland-based Stockmann Group has closed seven...
4065,negative,Operating loss before non-recurring items was ...
...,...,...
4219,negative,"More than 14,000 customers were left powerless ."
4814,negative,"Due to the rapid decrease in net sales , perso..."
4059,negative,Finnish retail software developer Aldata Solut...
4720,negative,The fair value of the company 's investment pr...


In [7]:
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

In [16]:
X_test

Unnamed: 0,text
567,Analyze the sentiment of the news headline enc...
1752,Analyze the sentiment of the news headline enc...
995,Analyze the sentiment of the news headline enc...
601,Analyze the sentiment of the news headline enc...
568,Analyze the sentiment of the news headline enc...
...,...
4219,Analyze the sentiment of the news headline enc...
4814,Analyze the sentiment of the news headline enc...
4059,Analyze the sentiment of the news headline enc...
4720,Analyze the sentiment of the news headline enc...


In [8]:
X_train

Unnamed: 0,sentiment,text
3683,neutral,Mr Jortikka is president of the base metal div...
163,positive,Both operating profit and net sales for the 12...
4017,negative,Finnish automation solutions developer Cencorp...
1588,positive,Renzo Piano 's building design will be a wonde...
1799,positive,`` We are proud to contribute to the creation ...
...,...,...
1374,neutral,"The dividend will be paid on April 15 , 2008 t..."
3869,neutral,The new shares entitle their holders to divide...
2766,neutral,Activities range from the development of natur...
1798,positive,"According to Bosse , the present cooperation i..."


In [9]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

In [10]:
X_train

Unnamed: 0,sentiment,text
0,neutral,Mr Jortikka is president of the base metal div...
1,positive,Both operating profit and net sales for the 12...
2,negative,Finnish automation solutions developer Cencorp...
3,positive,Renzo Piano 's building design will be a wonde...
4,positive,`` We are proud to contribute to the creation ...
...,...,...
895,neutral,"The dividend will be paid on April 15 , 2008 t..."
896,neutral,The new shares entitle their holders to divide...
897,neutral,Activities range from the development of natur...
898,positive,"According to Bosse , the present cooperation i..."


In [11]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = """.strip()

In [12]:
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])


In [10]:
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [11]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [12]:
model_name = "../../../../llm-model"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


In [56]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 1, 
                        temperature = 0.0,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [None]:
y_pred = predict(test, model, tokenizer)

In [16]:
evaluate(y_true, y_pred)

Accuracy: 0.373
Accuracy for label 0: 0.027
Accuracy for label 1: 0.937
Accuracy for label 2: 0.157

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.03      0.05       300
           1       0.34      0.94      0.50       300
           2       0.67      0.16      0.25       300

    accuracy                           0.37       900
   macro avg       0.63      0.37      0.27       900
weighted avg       0.63      0.37      0.27       900


Confusion Matrix:
[[  8 287   5]
 [  1 281  18]
 [  0 253  47]]


### fine tuning

In [17]:
peft_config = LoraConfig(
        lora_alpha=16, 
        lora_dropout=0.1,
        r=64,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",                        # directory to save and repository id
    num_train_epochs=3,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

Generating train split: 86 examples [00:00, 627.64 examples/s]
Generating train split: 13 examples [00:00, 681.16 examples/s]


In [18]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("trained-model")

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,0.730605
1,No log,0.661581
2,0.894900,0.657592


In [19]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

In [20]:
y_pred = predict(test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [00:59<00:00, 15.03it/s]


Accuracy: 0.782
Accuracy for label 0: 0.937
Accuracy for label 1: 0.630
Accuracy for label 2: 0.780

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.92       300
           1       0.73      0.63      0.67       300
           2       0.72      0.78      0.75       300

    accuracy                           0.78       900
   macro avg       0.78      0.78      0.78       900
weighted avg       0.78      0.78      0.78       900


Confusion Matrix:
[[281  15   4]
 [ 23 189  88]
 [ 10  56 234]]


In [21]:
evaluation = pd.DataFrame({'text': X_test["text"], 
                           'y_true':y_true, 
                           'y_pred': y_pred},
                         )

In [22]:
evaluation

Unnamed: 0,text,y_true,y_pred
567,Analyze the sentiment of the news headline enc...,positive,positive
1752,Analyze the sentiment of the news headline enc...,positive,positive
995,Analyze the sentiment of the news headline enc...,positive,positive
601,Analyze the sentiment of the news headline enc...,positive,positive
568,Analyze the sentiment of the news headline enc...,positive,positive
...,...,...,...
4219,Analyze the sentiment of the news headline enc...,negative,negative
4814,Analyze the sentiment of the news headline enc...,negative,negative
4059,Analyze the sentiment of the news headline enc...,negative,negative
4720,Analyze the sentiment of the news headline enc...,negative,negative
