Here, we use the Llama-3 8B model, which has an 8k-token context length. I do not think we need to make any changes to the dataset, because no essay should exceed 8k tokens.

In [None]:
pip install -r "requirement.txt"

In [None]:
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          pipeline,
                          DataCollatorWithPadding,
                          TrainingArguments,
                          Trainer)
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report

In [None]:
# Read the Hugging Face access token from the local JSON config file.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open until garbage collection.
with open("config.json", encoding="utf-8") as config_file:
    config_data = json.load(config_file)
HF_TOKEN = config_data["HF_TOKEN"]

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"

In [None]:
# Quantisation settings handed to the model loader: 4-bit loading with the
# "nf4" quantisation type, double quantisation enabled, bfloat16 compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
master_train_df = pd.read_csv("train.csv")

In [None]:
train_df = master_train_df.copy()

In [None]:
print("Train data:")
display(train_df.head())

In [None]:
# Rename the target column from "score" to "label".
train_df = train_df.rename(columns={'score': 'label'})

In [None]:
train_df.head()

In [None]:
train_df.label.value_counts()

In [None]:
# Shift every label down by one (the split and model below use labels 0-5).
train_df['label'] -= 1

In [None]:
train_df.label.value_counts()

In [None]:
# Per-label train/test split: the rows of each of the six labels are shuffled
# and cut at a fixed row count; the head goes to train, the tail to test.
#
# A fixed seed makes the split reproducible. Without it every run produces a
# different test set, so a checkpoint reloaded in a later session could end up
# being evaluated on essays it was trained on.
SPLIT_SEED = 42

# Number of rows of each label that go to the training split; the remaining
# rows of that label form the test split.
TRAIN_ROWS_PER_LABEL = {0: 876, 1: 3306, 2: 4396, 3: 2748, 4: 679, 5: 109}

train_parts = []
test_parts = []
for label_value, n_train_rows in TRAIN_ROWS_PER_LABEL.items():
    label_rows = train_df[train_df['label'] == label_value]
    label_rows = label_rows.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
    train_parts.append(label_rows.iloc[:n_train_rows])
    test_parts.append(label_rows.iloc[n_train_rows:])

# Concatenate the per-label pieces back into one frame per split.
train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

# Shuffle again so rows are no longer grouped by label.
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

In [None]:
# Remove the essay identifier column from both splits.
train_df = train_df.drop(columns=['essay_id'])
test_df = test_df.drop(columns=['essay_id'])

In [None]:
train_df.label.value_counts(normalize = True)

In [None]:
test_df.label.value_counts(normalize = True)

In [None]:
from datasets import DatasetDict, Dataset

# Wrap each pandas split as a Hugging Face Dataset and collect them in a
# single DatasetDict keyed by split name.
split_frames = {'train': train_df, 'test': test_df}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})
dataset

In [None]:
import torch

# Inverse-frequency class weights: take each label's share of the training
# split (ordered by label), invert it, and normalise so the weights sum to 1.
label_share = train_df.label.value_counts(normalize=True).sort_index()
inverse_share = torch.tensor((1 / label_share).tolist())
class_weights = inverse_share / inverse_share.sum()
class_weights

In [None]:
# Load the base checkpoint with a six-way classification head, quantised
# according to `bnb_config`, authenticating with the Hugging Face token.
model_load_options = {
    'num_labels': 6,
    'quantization_config': bnb_config,
    'device_map': 'auto',
    'token': HF_TOKEN,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_load_options)

In [None]:
# LoRA adapter settings: rank 16, alpha 8, 5% dropout, no bias training,
# applied to the four attention projection modules, for sequence classification.
lora_options = {
    'task_type': 'SEQ_CLS',
    'r': 16,
    'lora_alpha': 8,
    'lora_dropout': 0.05,
    'bias': 'none',
    'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
}
lora_config = LoraConfig(**lora_options)

# Prepare the quantised model for training first, then attach the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [None]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
    add_prefix_space=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Copy the tokenizer's padding id into the model config, turn `use_cache` off
# and set `pretraining_tp` to 1.
for config_field, config_value in (
    ('pad_token_id', tokenizer.pad_token_id),
    ('use_cache', False),
    ('pretraining_tp', 1),
):
    setattr(model.config, config_field, config_value)

## Training model

In [None]:
def data_preprocesing(row):
    """Tokenise the ``full_text`` field of ``row``.

    Uses the module-level ``tokenizer`` with truncation enabled and a
    maximum length of 7500 tokens, and returns the tokenizer's output.
    """
    essay_text = row['full_text']
    encoded = tokenizer(essay_text, truncation=True, max_length=7500)
    return encoded

# Tokenise every split in batches and drop the raw text column, then have the
# dataset return torch tensors.
tokenized_data = dataset.map(
    data_preprocesing,
    remove_columns=['full_text'],
    batched=True,
)
tokenized_data.set_format("torch")

In [None]:
tokenized_data

In [None]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import cohen_kappa_score
def compute_metrics(evaluations):
    """Compute quadratic weighted kappa for an evaluation pass.

    Args:
        evaluations: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores; the arg-max along axis 1 is taken as the
            predicted class.

    Returns:
        A dict with the single key ``'qwk'``.
    """
    class_scores, gold_labels = evaluations
    predicted_labels = np.argmax(class_scores, axis=1)
    kappa = cohen_kappa_score(gold_labels, predicted_labels, weights = 'quadratic')
    return {'qwk': kappa}

In [None]:
import torch
class CustomTrainer(Trainer):
    """Trainer that optionally applies per-class weights to the cross-entropy loss.

    Args:
        *args: Positional arguments forwarded to ``Trainer``.
        class_weights: Optional sequence, array or tensor with one weight per
            class. When ``None`` the loss is unweighted.
        **kwargs: Keyword arguments forwarded to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor accepts lists, arrays and existing tensors alike;
            # torch.tensor() on an existing tensor emits a copy-construct warning.
            self.class_weights = torch.as_tensor(
                class_weights, dtype=torch.float32
            ).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Args:
            model: The model being trained.
            inputs: Batch dict; its ``'labels'`` entry is removed before the
                forward pass so the loss is computed here.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so Trainer versions that pass this
                argument do not fail with a TypeError; it is not used.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop('labels').long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Align labels and weights with the logits rather than with
        # args.device: the model may place its output on a different device,
        # and cross_entropy needs the weight dtype to match the logits.
        labels = labels.to(logits.device)
        weights = self.class_weights
        if weights is not None:
            weights = weights.to(device=logits.device, dtype=logits.dtype)

        # weight=None gives the plain unweighted cross-entropy.
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weights)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters: one epoch, batch size 3 per device for both
# training and evaluation, evaluation and checkpointing once per epoch,
# logging every step, no external reporting.
training_options = {
    'output_dir': 'sentiment_classification',
    'num_train_epochs': 1,
    'learning_rate': 1e-4,
    'weight_decay': 0.01,
    'per_device_train_batch_size': 3,
    'per_device_eval_batch_size': 3,
    'logging_steps': 1,
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'report_to': "none",
}
training_args = TrainingArguments(**training_options)

In [None]:
# Assemble the class-weighted trainer from the model, tokenised splits,
# padding collator and metric function, then run training.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=collate_fn,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

train_result = trainer.train()

## Inference

In [None]:
model_name = "sentiment_classification/checkpoint-2019/"

In [None]:
# Reload the model for inference from the path now held in `model_name`,
# using the same quantisation settings and six-way head as in training.
# NOTE(review): that path is a Trainer checkpoint of a PEFT-wrapped model, so
# it presumably holds adapter weights rather than a full model - confirm that
# this call restores both the adapters and the trained classification head.
inference_load_options = {
    'num_labels': 6,
    'quantization_config': bnb_config,
    'device_map': 'auto',
    'token': HF_TOKEN,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **inference_load_options)

In [None]:
# Load the tokenizer from the same path as the inference model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
    add_prefix_space=True,
)

# Reuse the end-of-sequence token as the padding token, as in training.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Apply the same config overrides used for training to the reloaded model:
# padding id from the tokenizer, `use_cache` off, `pretraining_tp` set to 1.
for config_field, config_value in (
    ('pad_token_id', tokenizer.pad_token_id),
    ('use_cache', False),
    ('pretraining_tp', 1),
):
    setattr(model.config, config_field, config_value)

In [None]:
# Batched inference over the test split; the arg-max class of each essay is
# stored in a new `predictions` column of `test_df`.
sentences = test_df.full_text.tolist()

batch_size = 32

# Send inputs to the device the model reports for itself instead of assuming
# "cuda", and resolve it once rather than on every batch.
inference_device = model.device

all_outputs = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i + batch_size]

    inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=7500)
    inputs = {k: v.to(inference_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Keep only the logits and move them to host memory straight away so
    # nothing from the batch stays on the accelerator. The float32 cast is
    # lossless for lower-precision logits and does not change the arg-max.
    all_outputs.append(outputs['logits'].float().cpu())

final_outputs = torch.cat(all_outputs, dim=0)
test_df['predictions'] = final_outputs.argmax(dim=1).numpy()

In [None]:
def get_metrics_result(test_df):
    """Print the quadratic weighted kappa of predictions against labels.

    Args:
        test_df: DataFrame providing a ``label`` column and a
            ``predictions`` column.
    """
    from sklearn.metrics import cohen_kappa_score

    qwk = cohen_kappa_score(test_df.label, test_df.predictions, weights = 'quadratic')
    print('qwk:', qwk)


get_metrics_result(test_df)