In [2]:
##Imports:

import torch  
from datasets import load_dataset, Dataset 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments  
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification  
from torch.utils.data import DataLoader, Dataset
import numpy as np  
import pandas as pd
import evaluate


In [3]:
# Fetch every split of the LIAR dataset; trust_remote_code=True lets the
# dataset's own loading script run.
datasets = load_dataset(path="liar", trust_remote_code=True)

# Show the available splits, their columns and their row counts.
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 10269
    })
    test: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1283
    })
    validation: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1284
    })
})


In [4]:
# Preview five training rows, drawn after a seeded shuffle so the sample is
# the same on every run.
sample_rows = datasets["train"].shuffle(seed=42).select(range(5))
dff = pd.DataFrame(sample_rows)
dff


Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,3358.json,1,Thirty-five states have accepted high-speed in...,"stimulus,transportation",raymond-lahood,"Secretary, U.S. Department of Transportation",Illinois,republican,0.0,1.0,1.0,1.0,0.0,a speech to the American Association of State ...
1,8048.json,2,"Since I was elected, crime rates have been at ...",crime,bill-foster,"Mayor, St. Petersburg",Florida,republican,1.0,0.0,2.0,2.0,0.0,a campaign brochure
2,8280.json,5,Warren Buffett recently said Scrap Obamacare a...,"health-care,pundits",blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,posts on the Internet
3,481.json,2,"Eliminating earmarks ""would make barely a drop...",federal-budget,bob-barr,"Runs a consulting firm, Liberty Strategies LLC",Georgia,libertarian,0.0,0.0,0.0,1.0,0.0,a news conference announcing his Libertarian c...
4,12651.json,4,"Rather than work to secure the border, (Marco ...",immigration,carlos-beruff,Developer,Florida,republican,3.0,1.0,0.0,0.0,0.0,a statement to press


In [5]:
# Drop every column except `statement` (model input) and `label` (target).
columns_to_remove = [
    'id',
    'subject',
    'speaker',
    'job_title',
    'state_info',
    'party_affiliation',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    'context',
]

# remove_columns is applied to the whole DatasetDict, i.e. to every split.
dataset = datasets.remove_columns(column_names=columns_to_remove)

In [6]:
# Base checkpoint used for both the tokenizer and the classifier.
model_ck="distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_ck)

# The classification head is not part of the checkpoint and is initialised
# randomly, so seed torch first; otherwise the baseline accuracy (and the LoRA
# initialisation that follows) differs on every Restart & Run All.
torch.manual_seed(42)

# Six-way classification head: the `label` column holds class ids 0-5.
model = AutoModelForSequenceClassification.from_pretrained(model_ck,num_labels =6)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_fun(example):
    """Tokenize the ``statement`` field of ``example`` with the notebook's tokenizer.

    ``example`` is whatever ``map`` hands over (a batch of rows when called
    with ``batched=True``). Sequences are truncated and padded to the
    tokenizer's maximum length, so every encoded row has the same length.
    Returns the tokenizer's output unchanged.
    """
    statements = example["statement"]
    return tokenizer(statements, truncation=True, padding="max_length")

# Tokenize every split in batches; the tokenizer's outputs are added as new columns.
tokenized_dataset = dataset.map(function=tokenize_fun, batched=True)



In [8]:
class Liardataset(Dataset):
    """Map-style PyTorch dataset over one tokenized split.

    ``tokenized_data`` is either

    * a row-indexable dataset, where ``tokenized_data[i]`` returns a mapping
      for row ``i`` and ``len(tokenized_data)`` is the number of rows, or
    * a plain ``dict`` of equal-length columns.

    In both cases the fields ``input_ids``, ``attention_mask`` and ``label``
    must be present. Each item is a dict of ``torch.long`` tensors keyed
    ``input_ids``, ``attention_mask`` and ``labels``.
    """

    _FIELDS = ("input_ids", "attention_mask", "label")

    def __init__(self,tokenized_data):
        self.data = tokenized_data

    def __len__(self):
        # A dict of columns reports its number of keys from len(), so the row
        # count has to come from one of the columns instead.
        if isinstance(self.data, dict):
            return len(self.data["input_ids"])
        return len(self.data)

    def __getitem__(self,idx):
        if isinstance(self.data, dict):
            row = {field: self.data[field][idx] for field in self._FIELDS}
        else:
            # Fetch the row once. Indexing by column name first
            # (data["input_ids"][idx]) can materialise the entire column on
            # every call, which makes each epoch quadratic in the dataset size.
            row = self.data[idx]
        return {
            "input_ids": torch.tensor(row["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(row["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(row["label"], dtype=torch.long),
        }


In [9]:
batch_size = 16

# Training split: reshuffled on every pass.
train_dataset = Liardataset(tokenized_dataset["train"])
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Validation and test splits keep their original order.
val_dataset = Liardataset(tokenized_dataset["validation"])
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

test_dataset = Liardataset(tokenized_dataset["test"])
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Sanity check: pull a single collated batch and inspect it.
batch = next(iter(train_dataloader))
print(batch)


{'input_ids': tensor([[  101,  2758,  1996,  ...,     0,     0,     0],
        [  101,  2758,  1037,  ...,     0,     0,     0],
        [  101,  2343,  8112,  ...,     0,     0,     0],
        ...,
        [  101,  2062,  2084,  ...,     0,     0,     0],
        [  101,  5392,  2816,  ...,     0,     0,     0],
        [  101,  2758, 12163,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 2, 2, 2, 4, 1, 0, 4, 1, 0, 4, 1, 1, 4, 3, 1])}


In [10]:
# Baseline: accuracy of the not-yet-fine-tuned model on the validation split.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
metric = evaluate.load("accuracy")

model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        # Move every tensor of the batch onto the model's device.
        batch = {key: vals.to(device) for key, vals in batch.items()}
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        logits = outputs.logits
        # Predicted class = index of the largest logit.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

accuracy = metric.compute()
print(f"Pretrained Model Accuracy: {accuracy['accuracy']:.4f}")

Pretrained Model Accuracy: 0.1885


In [11]:
# LoRA adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules= ["q_lin","v_lin","k_lin"],
    bias="none",
    task_type="SEQ_CLS"
)

# get_peft_model() injects the adapter layers into `model` in place, so
# re-running this cell would adapt an already-adapted model. The base model
# itself carries no `lora_config` attribute and has no `unload()`, so the
# cleanup has to go through the wrapper left behind by the previous run,
# and only when that wrapper really wraps the current `model`.
_previous_peft_model = globals().get("peft_model")
if _previous_peft_model is not None and _previous_peft_model.get_base_model() is model:
    model = _previous_peft_model.unload()

peft_model = get_peft_model(model,lora_config)

peft_model.to(device)

# Report how many parameters remain trainable after wrapping.
peft_model.print_trainable_parameters()

trainable params: 816,390 || all params: 67,774,476 || trainable%: 1.2046


In [12]:
# Print every parameter name registered on `model`, one per line.
for param_name, _ in model.named_parameters():
    print(param_name)
 

distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.base_layer.weight
distilbert.transformer.layer.0.attention.q_lin.base_layer.bias
distilbert.transformer.layer.0.attention.q_lin.lora_A.default.weight
distilbert.transformer.layer.0.attention.q_lin.lora_B.default.weight
distilbert.transformer.layer.0.attention.k_lin.base_layer.weight
distilbert.transformer.layer.0.attention.k_lin.base_layer.bias
distilbert.transformer.layer.0.attention.k_lin.lora_A.default.weight
distilbert.transformer.layer.0.attention.k_lin.lora_B.default.weight
distilbert.transformer.layer.0.attention.v_lin.base_layer.weight
distilbert.transformer.layer.0.attention.v_lin.base_layer.bias
distilbert.transformer.layer.0.attention.v_lin.lora_A.default.weight
distilbert.transformer.layer.0.attention.v_lin.lora_B.default.weight
distilbert.transformer.layer.0.a

In [13]:
# Evaluate and checkpoint once per epoch; log the training loss every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False
)

# Train the PEFT wrapper rather than the bare base model. The base model only
# happens to contain the adapters because get_peft_model() injects them in
# place; handing the wrapper to the Trainer makes the intent explicit and lets
# checkpoints be written as adapter checkpoints instead of full-model ones.
trainer = Trainer(
    model = peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6951,1.706794
2,1.6649,1.675118
3,1.5225,1.674011


TrainOutput(global_step=3852, training_loss=1.6634717696923704, metrics={'train_runtime': 804.0906, 'train_samples_per_second': 38.313, 'train_steps_per_second': 4.791, 'total_flos': 4158476678393856.0, 'train_loss': 1.6634717696923704, 'epoch': 3.0})

In [15]:

accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last axis of ``logits``. Returns whatever
    ``accuracy_metric.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(references=labels, predictions=predicted_ids)


# Build a second Trainer around the already-trained model, this time with an
# accuracy callback, and use it for evaluation only.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Metrics returned by evaluate() are prefixed with "eval_".
eval_results = trainer.evaluate()
accuracy = eval_results["eval_accuracy"]
print(f"Trained Model Accuracy: {accuracy:.4f}")


Trained Model Accuracy: 0.2578
