# Importing Necessary Libraries.

In [1]:
pip install -r "requirement.txt"

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting accelerate==0.31.0
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 KB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets==2.19.2
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 KB[0m [31m23.6 MB/s[0m 

In [2]:
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          pipeline,
                          DataCollatorWithPadding,
                          TrainingArguments,
                          Trainer)
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config_data = json.load(open("config.json"))
HF_TOKEN = config_data["HF_TOKEN"]

In [5]:
model_name = "meta-llama/Meta-Llama-3.1-8B"

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)

In [7]:
master_train_df = pd.read_csv("train.csv")

In [8]:
train_df = master_train_df.copy()

In [9]:
print("Train data:")
display(train_df.head())

Train data:


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


## Pre-processing Data

In [10]:
train_df.rename(columns = {'score':'label'}, inplace = True)

In [11]:
train_df.head()

Unnamed: 0,essay_id,full_text,label
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [12]:
train_df.label.value_counts()

label
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

In [13]:
train_df['label'] = train_df['label'] - 1

In [14]:
train_df.label.value_counts()

label
2    6280
1    4723
3    3926
0    1252
4     970
5     156
Name: count, dtype: int64

In [15]:
# Splitting the dataframe into 4 separate dataframes based on the labels
label_0_df = train_df[train_df['label'] == 0]
label_1_df = train_df[train_df['label'] == 1]
label_2_df = train_df[train_df['label'] == 2]
label_3_df = train_df[train_df['label'] == 3]
label_4_df = train_df[train_df['label'] == 4]
label_5_df = train_df[train_df['label'] == 5]

# Shuffle each label dataframe
label_0_df = label_0_df.sample(frac=1).reset_index(drop=True)
label_1_df = label_1_df.sample(frac=1).reset_index(drop=True)
label_2_df = label_2_df.sample(frac=1).reset_index(drop=True)
label_3_df = label_3_df.sample(frac=1).reset_index(drop=True)
label_4_df = label_4_df.sample(frac=1).reset_index(drop=True)
label_5_df = label_5_df.sample(frac=1).reset_index(drop=True)

# Splitting each label dataframe into train, test, and validation sets
label_0_train = label_0_df.iloc[:876]
label_0_test = label_0_df.iloc[876:]

label_1_train = label_1_df.iloc[:3306]
label_1_test = label_1_df.iloc[3306:]

label_2_train = label_2_df.iloc[:4396]
label_2_test = label_2_df.iloc[4396:]

label_3_train = label_3_df.iloc[:2748]
label_3_test = label_3_df.iloc[2748:]

label_4_train = label_4_df.iloc[:679]
label_4_test = label_4_df.iloc[679:]

label_5_train = label_5_df.iloc[:109]
label_5_test = label_5_df.iloc[109:]

# Concatenating the splits back together
train_df = pd.concat([label_0_train, label_1_train, label_2_train, label_3_train,label_4_train,label_5_train])
test_df = pd.concat([label_0_test, label_1_test, label_2_test, label_3_test,label_4_test,label_5_test])
# Shuffle the dataframes to ensure randomness
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [16]:
train_df.drop(columns = ['essay_id'],inplace = True)
test_df.drop(columns = ['essay_id'],inplace = True)

In [17]:
train_df.label.value_counts(normalize = True)

label
2    0.362886
1    0.272907
3    0.226845
0    0.072313
4    0.056051
5    0.008998
Name: proportion, dtype: float64

In [18]:
test_df.label.value_counts(normalize = True)

label
2    0.362796
1    0.272867
3    0.226844
0    0.072405
4    0.056037
5    0.009051
Name: proportion, dtype: float64

In [19]:
from datasets import DatasetDict, Dataset

# Converting pandas DataFrames into Hugging Face Dataset objects:
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)

# Combine them into a single DatasetDict
dataset = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})
dataset

DatasetDict({
    train: Dataset({
        features: ['full_text', 'label'],
        num_rows: 12114
    })
    test: Dataset({
        features: ['full_text', 'label'],
        num_rows: 5193
    })
})

In [20]:
import torch

class_weights=(1/train_df.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights

tensor([0.0900, 0.0239, 0.0179, 0.0287, 0.1161, 0.7234])

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=6,
    token = HF_TOKEN,
    device_map='auto'
)

Downloading shards: 100%|██████████| 4/4 [22:29<00:00, 337.30s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
lora_config = LoraConfig(
    r = 16,
    lora_alpha = 8,
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = 'SEQ_CLS'
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True,token = HF_TOKEN)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [24]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

## Inferencing without training LLAMA 3(8B) on down-stream task

In [25]:
sentences = test_df.full_text.tolist()

batch_size = 20

all_outputs = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i + batch_size]

    inputs = tokenizer(batch_sentences, return_tensors="pt",padding= True,truncation=True, max_length=7500)
    inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])

final_outputs = torch.cat(all_outputs, dim=0)
test_df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

#### In this project, we are using cohen's kappa score. This metrics measures agreement between two evaluators which is adjusted for agreement occured by coincidence.

In [26]:
def get_metrics_result(test_df):
    from sklearn.metrics import cohen_kappa_score
    labels = test_df.label
    predictions = test_df.predictions
    
    print('qwk:',cohen_kappa_score(labels,predictions,weights = 'quadratic'))


get_metrics_result(test_df)

qwk: 0.062300837140532916


## Training LLAMA 3 on down-stream task

In [27]:
def data_preprocesing(row):
    return tokenizer(row['full_text'], truncation=True, max_length=7500)

tokenized_data = dataset.map(data_preprocesing, batched=True,
remove_columns=['full_text'])
tokenized_data.set_format("torch")

Map: 100%|██████████| 12114/12114 [00:02<00:00, 5200.63 examples/s]
Map: 100%|██████████| 5193/5193 [00:00<00:00, 5249.79 examples/s]


In [28]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 12114
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5193
    })
})

In [29]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
from sklearn.metrics import cohen_kappa_score
def compute_metrics(evaluations):
    predictions, labels = evaluations
    predictions = np.argmax(predictions, axis=1)
    return {'qwk': cohen_kappa_score(labels,predictions,weights = 'quadratic')}

In [31]:
import torch
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights,
            dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop('labels').long()
        labels = labels.to(self.args.device)

        outputs = model(**inputs)

        logits = outputs.get('logits')

        if self.class_weights is not None:
            loss = torch.nn.functional.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = torch.nn.functional.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [32]:
training_args = TrainingArguments(
    output_dir = 'sentiment_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 3,
    per_device_eval_batch_size = 3,
    num_train_epochs = 1,
    logging_steps=1,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    report_to="none"
)



In [33]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data['train'],
    eval_dataset = tokenized_data['test'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights
)

train_result = trainer.train()

  self.class_weights = torch.tensor(class_weights,


Epoch,Training Loss,Validation Loss,Qwk
1,0.9559,0.855144,0.821329



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B.


## Inferencing after training LLAMA 3 on down-stream task

In [36]:
model_name = "sentiment_classification/checkpoint-4038/"

In [37]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=6,
    token = HF_TOKEN,
    device_map='auto'
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True,token = HF_TOKEN)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [39]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [40]:
sentences = test_df.full_text.tolist()

batch_size = 20

all_outputs = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i + batch_size]

    inputs = tokenizer(batch_sentences, return_tensors="pt",padding= True,truncation=True, max_length=7500)
    inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])

final_outputs = torch.cat(all_outputs, dim=0)
test_df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

In [41]:
def get_metrics_result(test_df):
    from sklearn.metrics import cohen_kappa_score
    labels = test_df.label
    predictions = test_df.predictions
    
    print('qwk:',cohen_kappa_score(labels,predictions,weights = 'quadratic'))

get_metrics_result(test_df)

qwk: 0.8218515324130835
