### Install Dependencies

In [None]:
# Quietly install/upgrade the packages this notebook relies on for fine-tuning and logging.
%pip install -q -U trl numpy torch peft transformers  datasets bitsandbytes wandb

In [None]:
%%bash
# Optional: log in to Weights & Biases.
# `%%bash` is a cell magic and has to be the very first line of the cell, so
# this note sits below it (bash treats these lines as comments as well).
# Replace <API> with your own key and never commit a real key with the notebook.
wandb login --relogin "<API>"

### Import required classes

In [None]:
!pip install seaborn -q
!pip install --upgrade numpy


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import wandb
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from scipy.special import softmax
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
from transformers import set_seed, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig

### Load the model

In [None]:
# Hugging Face access token.
# NOTE: this is a placeholder - fill it in locally and never commit a real token.
token='<HF_Token>'

# Seed the Python, NumPy and PyTorch RNGs before the model is built so that any
# randomly initialised weights are reproducible on "Restart & Run All".
set_seed(42)

# Quantization configuration (4-bit loading, float16 compute).
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    # The option is spelled `bnb_4bit_quant_type`; the previous misspelling
    # (`bnb_4bit_qunat_type`) is not a recognised option, so "nf4" was never
    # actually selected.
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
)

model_name = 'Model'

# Pass the same token used for the model so both downloads are authenticated alike.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# NOTE(review): the BOS token is reused as the padding token - confirm this is
# intended for the chosen checkpoint.
tokenizer.pad_token = tokenizer.bos_token
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2, # Change according to your case, it is hate / non-hate in our work.
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=token
    )
# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

# Initial trainable parameters of our model.
def count_trainable_params(model):
    """Return the total number of elements in the parameters that require gradients.

    Args:
        model: any object exposing ``parameters()``, whose items provide
            ``requires_grad`` and ``numel()``.

    Returns:
        The summed ``numel()`` of every parameter with ``requires_grad`` set
        (0 when there is none).
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Count the trainable parameters of the freshly loaded model and print the
# figure with thousands separators.
num_params = count_trainable_params(model)
formatted_num_params = format(num_params, ",")
print(f"Number of trainable parameters: {formatted_num_params}")

**Testing outputs of the model**

In [None]:
input_text = ["Hi ra ela unnav?", "Hi ra ela ఉన్నావ్"]

# Tokenize the input text and move the tensors to the device the model lives on,
# so the forward pass does not mix CPU inputs with weights on another device.
tokenized = tokenizer(input_text[1], return_tensors="pt", padding=True, truncation=True)
tokenized = tokenized.to(model.device)

# Pass the tokenized input through the model. This is only a smoke test, so
# gradient tracking is switched off.
with torch.no_grad():
    output = model(**tokenized)

# Extract the logits (previously the whole model output object was shown).
logits = output.logits
logits

## Loading the dataset

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Read the spreadsheet, keep its first 500 rows, and discard the rows in which
# 'Pre_Processed_English_text' is missing.
file_path = "ML_Project_main_hate_fake.xlsx"
df = (
    pd.read_excel(file_path)
    .head(500)
    .dropna(subset=['Pre_Processed_English_text'], how='all')
)

def contains_english_word(text):
    """Tell whether ``str(text)`` contains a run of ASCII letters bounded by word boundaries.

    Args:
        text: any value; it is converted with ``str()`` before matching.

    Returns:
        True if the pattern is found anywhere in the string, otherwise False.
    """
    match = re.search(r'\b[a-zA-Z]+\b', str(text))
    return match is not None

# Filter rows with at least one English word in 'Pre_Processed_English_text'.
# .copy() yields an independent frame, so adding a column below does not raise
# pandas' SettingWithCopyWarning on a filtered selection.
df = df[df['Pre_Processed_English_text'].apply(contains_english_word)].copy()

# Build the model input text. Both parts are cast to str and a missing
# 'Summary' becomes an empty string; without this, one NaN summary would turn
# the whole concatenated text into NaN.
df['New'] = (
    "Query : " + df["Pre_Processed_English_text"].astype(str)
    + " Reason : " + df["Summary"].fillna("").astype(str)
)

# Rename the relevant columns for the whole dataframe
df = df.rename(columns={'New': 'text', 'Hate': 'labels'})

# Keep only "text" and "labels" columns in the whole dataframe
df = df[['text', 'labels']]

# Split the data into training (90%) and testing (10%) sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Save the training dataframe to train.csv
train_df.to_csv('train.csv', index=False)

# Save the testing dataframe to test.csv
test_df.to_csv('test.csv', index=False)

In [None]:
# Load train.csv and test.csv as the 'train' and 'test' splits.
dataset = load_dataset(
    'csv',
    data_files={'train': ['train.csv'], 'test': ['test.csv']},
)


def tokenize(examples):
    """Run the tokenizer over the "text" entries of a batch, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits in batched mode.
train_dataset, test_dataset = (
    dataset[split].map(tokenize, batched=True) for split in ('train', 'test')
)
dataset

### Configure training_args and peft_config

In [None]:
# Training schedule - adjust these values to suit your own use case.
epochs, batch_size, gradient_accumulation_steps = 10, 5, 4

# Run label; the output directory carries the same name.
model_version = "openchat_3.5_QLoRA"
model_dir = model_version

# LoRA settings for the sequence-classification task: rank 16, alpha 64,
# dropout 0.2, no bias training, adapters on the listed projection modules.
peft_config = LoraConfig(
    task_type='SEQ_CLS',
    r=16,
    lora_alpha=64,
    lora_dropout=0.2,
    bias="none",
    target_modules=[
        "up_proj", "o_proj", "v_proj", "gate_proj",
        "q_proj", "down_proj", "k_proj",
    ],
)

# Arguments whose names are the same across transformers releases.
training_kwargs = dict(
    run_name=model_version,
    logging_dir=f"{model_dir}/logs",
    output_dir=model_dir,
    logging_steps=100,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=2e-5,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    save_strategy="epoch",
    fp16=True,
)

# Evaluate once per epoch. Newer transformers releases call this argument
# `eval_strategy` and deprecate `evaluation_strategy`; the install cell pulls
# the latest release, so try the new name first and fall back to the old one
# when the installed version does not know it (unknown keyword -> TypeError).
try:
    training_args = TrainingArguments(eval_strategy="epoch", **training_kwargs)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="epoch", **training_kwargs)
# Turn on gradient checkpointing, run PEFT's k-bit training preparation, then
# wrap the result with the LoRA configuration defined above.
model.gradient_checkpointing_enable()
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# Trainable-parameter count once LoRA has been applied.
num_params = count_trainable_params(model)
formatted_num_params = format(num_params, ",")
print(f"Number of trainable parameters: {formatted_num_params}")

### Train the model

In [None]:
# Evaluation metrics for this use case (macro-F1 is the essential one; log-loss is optional).
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys "accuracy" and "macro_f1".
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    references = p.label_ids
    return {
        "accuracy": accuracy_score(references, predicted_classes),
        "macro_f1": f1_score(references, predicted_classes, average='macro'),
    }

# Assemble the Trainer from the model, arguments, data splits and metric function.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Open the Weights & Biases run used to track this training:
# `project` is where the run is stored, `name` is the run name.
wandb.init(project="HOLD-Final", name=model_version)

# Launch fine-tuning.
trainer.train()

In [20]:
# Save the trained model into the "model" sub-folder of the run's output directory.
trainer.save_model(f'{model_dir}/model')

### Performance of the model on test data

In [None]:
# You can also load the best model from the checkpoints, refer inference/inference7BLoRA.ipynb
from tqdm import tqdm
predicted = []

# Switch to inference mode so dropout layers are inactive (the LoRA config sets
# a non-zero lora_dropout) and predictions are deterministic.
model.eval()

# Without no_grad every forward pass would keep its autograd graph alive through
# the tensors stored in `predicted`, steadily growing memory use.
with torch.no_grad():
    for text in tqdm(test_dataset['text']):
        # Tokenize the text and create a batch with a single data point,
        # placed on the same device as the model.
        tokenized = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        tokenized = tokenized.to(model.device)

        # Perform inference on the single data point
        output = model(**tokenized)
        logits = output.logits
        logits = logits.float()

        # Calculate class probabilities
        class_probabilities = torch.nn.functional.softmax(logits, dim=1)

        # Keep results on the CPU so accelerator memory is released per sample.
        predicted.append(class_probabilities.cpu())
concatenated_tensor = torch.cat(predicted)
predicted = concatenated_tensor.detach().cpu().numpy()
predicted

In [None]:
def get_classification_report(p, y, class_names=('Non-Hate', 'Hate')):
    """Score predicted probabilities against true labels and plot the confusion matrix.

    Args:
        p: array-like of shape (n_samples, n_classes) holding class probabilities.
        y: array-like of true labels; because predictions are the arg-max
            column of ``p``, labels are compared as column indices (0, 1, ...).
        class_names: tick labels for the confusion matrix, one per class, in
            column order of ``p``. Defaults to the two classes used in this work.

    Returns:
        dict with "F1_Score" (macro-averaged F1) and "Log_Loss". The same dict
        is also printed, as before.
    """
    probabilities = np.asarray(p)
    labels = np.asarray(y)
    class_names = list(class_names)
    # Spell out the full label set so the metrics keep a fixed shape even when
    # the evaluated split happens to miss one of the classes.
    class_ids = list(range(len(class_names)))

    # Predicted class = most probable column
    thresholded_predictions = np.argmax(probabilities, axis=1)

    f1 = f1_score(labels, thresholded_predictions, average='macro')
    logloss = log_loss(labels, probabilities, labels=class_ids)

    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(labels, thresholded_predictions, labels=class_ids)

    # Plot confusion matrix on an explicit axes
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    plt.show()

    report = {"F1_Score": f1, "Log_Loss": logloss}
    print(report)
    # Return the metrics too; previously the caller's `metrics` was always None.
    return report

metrics = get_classification_report(predicted, test_dataset['labels'])