# 1-Libraries and Dataset downloading

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                        AutoModelForSequenceClassification,
                        ZeroShotClassificationPipeline,
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split
device='cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = 'archive/twitter_training.csv'
test_path = 'archive/twitter_validation.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_train.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
df_test.columns = ['header', 'entity','labels','text']
df_train.columns = ['header', 'entity','labels','text']

In [4]:
df_train.dropna(inplace=True)
df_train.drop_duplicates(inplace=True)
df_train.isnull().sum()

header    0
entity    0
labels    0
text      0
dtype: int64

In [5]:
df_train.drop(columns=['header'], inplace=True)
df_test.drop(columns=['header'], inplace=True)
df_train.replace(to_replace='Irrelevant', value='Neutral', inplace=True)
df_test.replace(to_replace='Irrelevant', value='Neutral', inplace=True)

In [6]:
df_test.head()

Unnamed: 0,entity,labels,text
0,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Google,Neutral,Now the President is slapping Americans in the...
4,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [7]:
df_test.labels.value_counts()

labels
Neutral     456
Positive    277
Negative    266
Name: count, dtype: int64

In [8]:
# Create a mapping dictionary to streamline LLM classification
sentiment_mapping = {
    'Positive': 0,
    'Neutral': 1,
    'Negative': 2
}

# Apply the mapping to the 'sentiment' column
df_test['sentiment'] = df_test['labels'].map(sentiment_mapping)
df_train['sentiment'] = df_train['labels'].map(sentiment_mapping)

In [9]:
df_test.head()

Unnamed: 0,entity,labels,text,sentiment
0,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,1
1,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,2
2,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",2
3,Google,Neutral,Now the President is slapping Americans in the...,1
4,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,2


Create custom dataset

In [10]:
class EntitySentimentDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {'text': row['text'], 'entity': row['entity']}

# 2. Predicting with pre-trained NLI model

The task is entity-level sentiment analisys with 3 labels. Model should recognise the dependence of a sentence on an entity and predict sentiment based on this.
This task is similar with nli task, because it is necessary to determine the relationship between the entity and the text.

In [15]:
class EvaluationPipeline():
    def __init__(self, data, pipe):
        self.data = data
        self.dataset = EntitySentimentDataset(self.data)
        self.result=[]
        self.pipe = pipe
        print (self.pipe.device)

    def evaluate(self, batch_size: int, candidate_labels: list):
        for out in tqdm(self.pipe(self.dataset, candidate_labels, batch_size=batch_size, truncation="only_first"), total=len(self.dataset)):
            self.result.append(out['labels'][0])

    def calculate_metrics(self):
        # Evaluate accuracy
        accuracy = accuracy_score(self.data['labels'], self.result)
        print(f'Accuracy: {accuracy:.2f}')
        
        # Generate classification report
        class_report = classification_report(self.data['labels'], self.result)
        print('Classification Report:\n', class_report)
        
        # Generate confusion matrix
        conf_matrix = confusion_matrix(self.data['labels'], self.result)
        print('Confusion Matrix:\n', conf_matrix)

        return accuracy, class_report, conf_matrix

We'll have to adapt the pipeline class a bit to our task so that we can use batching

In [12]:
class EntityClassificationPipeline(ZeroShotClassificationPipeline):
    def preprocess(self, inputs, candidate_labels):
        sequences = inputs['text']
        if isinstance(sequences, str):
            sequences = [sequences]

        entities = inputs['entity']
        if isinstance(entities, str):
            entities = [entities]
        hypothesis_templates = [f"the sentiment towards {entity} is {{}}." for entity in entities]
        sequence_pairs = []
        for i in range(len(sequences)):
            sequence_pairs.extend([[sequences[i], hypothesis_templates[i].format(label)] for label in candidate_labels])

        for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)):
            model_input = self._parse_and_tokenize([sequence_pair])

            yield {
                "candidate_label": candidate_label,
                "sequence": sequences[0],
                "is_last": i == len(candidate_labels) - 1,
                **model_input,
            }


Try to solve task with pre-trained bart-large-mnli

In [13]:
model_name = 'facebook/bart-large-mnli'
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                    device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=512)
sentiment_model = EvaluationPipeline(df_test, EntityClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True))
candidate_labels = ['Positive', 'Neutral', 'Negative']
sentiment_model.evaluate(batch_size=16, candidate_labels=candidate_labels)
accuracy1, class_report1, conf_matrix1 = sentiment_model.calculate_metrics()



cuda:0


100%|█████████████████████████████████████████| 999/999 [00:43<00:00, 23.08it/s]

Accuracy: 0.47
Classification Report:
               precision    recall  f1-score   support

    Negative       0.46      0.92      0.62       266
     Neutral       0.57      0.01      0.02       456
    Positive       0.48      0.80      0.60       277

    accuracy                           0.47       999
   macro avg       0.50      0.58      0.41       999
weighted avg       0.52      0.47      0.34       999

Confusion Matrix:
 [[245   0  21]
 [230   4 222]
 [ 53   3 221]]





The result is not impressive, look what we have with two labels (without Neutral)

In [15]:
df_test_2 = df_test[df_test['labels']!='Neutral']
sentiment_model = EvaluationPipeline(df_test_2, EntityClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True))
candidate_labels = ['Positive', 'Negative']
sentiment_model.evaluate(batch_size=16, candidate_labels=candidate_labels)
accuracy1, class_report1, conf_matrix1 = sentiment_model.calculate_metrics()

cuda:0


100%|█████████████████████████████████████████| 543/543 [00:15<00:00, 36.12it/s]

Accuracy: 0.86
Classification Report:
               precision    recall  f1-score   support

    Negative       0.82      0.92      0.87       266
    Positive       0.91      0.81      0.86       277

    accuracy                           0.86       543
   macro avg       0.87      0.86      0.86       543
weighted avg       0.87      0.86      0.86       543

Confusion Matrix:
 [[245  21]
 [ 54 223]]





With two labels model performs well, but we need to solve the problem for 3.

# 3. Llama3 prompting

In [11]:
from huggingface_hub import login
login(token='YOUR_API_KEY')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/dortp58/.cache/huggingface/token
Login successful


In [12]:
# Load model directly
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)

model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            device_map='auto',
                                            torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=512, padding_size="left")
tokenizer.pad_token_id = tokenizer.eos_token_id


In [13]:
pipe = pipeline(task="text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens = 2,
                    temperature=0.1,
                       )  
pipe.device

device(type='cuda', index=0)

In [16]:
class PromptingPipeline(EvaluationPipeline):
    def evaluate(self, generate_prompt, output_handler):
        for row in tqdm(self.dataset, total=len(self.dataset)):
            output = self.pipe(generate_prompt(row))
            answ = output_handler(output)
            if "Positive" in answ:
                self.result.append("Positive")
            elif "Negative" in answ:
                self.result.append("Negative")
            else:
                self.result.append("Neutral")

In [18]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the tweet in square brackets about the entity {data_point["entity"]}, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "Positive" or "Neutral" or "Negative".
            Classify tweets that are not relevant to the entity as "Neutral".

            [{data_point["text"]}] = """.strip()
def output_handler(output):
    return output[0]['generated_text'].split("=")[-1]

In [19]:
prompt_model = PromptingPipeline(df_test, pipe)
prompt_model.evaluate(generate_prompt, output_handler)
accuracy2, class_report2, conf_matrix2 = prompt_model.calculate_metrics()

cuda:0


  0%|                                                   | 0/999 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  1%|▍                                         | 10/999 [00:02<03:10,  5.18it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████| 999/999 [03:02<00:00,  5.48it/s]

Accuracy: 0.50
Classification Report:
               precision    recall  f1-score   support

    Negative       0.71      0.36      0.48       266
     Neutral       0.56      0.49      0.52       456
    Positive       0.39      0.65      0.48       277

    accuracy                           0.50       999
   macro avg       0.55      0.50      0.49       999
weighted avg       0.55      0.50      0.50       999

Confusion Matrix:
 [[ 96  89  81]
 [ 29 223 204]
 [ 10  88 179]]





Better than previous model, try prompting with instructions

In [17]:
def generate_prompt(row):
    return [
        {"role": "system", 
         "content": '''You are a helpful assistant designed to output sentiment classification labels.
All questions are about entity-wise sentiment analysis on tweets in English. You will analyze the sentiment regarding one volitional entity at the time, inspecting a Norwegian text that is provided as the introduction. 
The reply should contain the sentiment label only, chosen from this list: ‘[’Positive’, ’Neutral’, ’Negative’ ]‘. 
’Positive’ label is used to show support, approval or a positive attitude towards the entity.
’Negative’ label is usef to expresses criticism, disapproval, or negativity towards the entity.
’Neutral’ is the most common label, if tweet is not relevant to the entity use this label.
You should not refer to common knowledge about an entity, but strictly analyze the sentiment conveyed in the given text.
If both positive or negative sentiments exist, you must decide what is the prevalent or overall strongest sentiment conveyed in the text regarding the enity in question.'''},
        {"role": "user", "content":  f"""Analyze the sentiment of the tweet '{row['text']}' about the entity '{row["entity"]}'""".strip()}]
    
def output_handler(output):
    return output[0]['generated_text'][2]['content']

In [20]:
prompt_model.result=[]
prompt_model.evaluate(generate_prompt, output_handler)
accuracy3, class_report3, conf_matrix3 = prompt_model.calculate_metrics()

  0%|                                                   | 0/999 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  1%|▍                                          | 9/999 [00:00<01:11, 13.85it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████| 999/999 [00:59<00:00, 16.85it/s]

Accuracy: 0.43
Classification Report:
               precision    recall  f1-score   support

    Negative       0.42      0.83      0.55       266
     Neutral       0.43      0.07      0.12       456
    Positive       0.45      0.65      0.53       277

    accuracy                           0.43       999
   macro avg       0.43      0.51      0.40       999
weighted avg       0.43      0.43      0.35       999

Confusion Matrix:
 [[220  27  19]
 [227  32 197]
 [ 83  15 179]]





The results got worse. Maybe the model is too small, but a bigger one won't fit on my computer.
Lets try fine-tune Llama in next notebook.