# Political leaning classification: fine-tuning DistilBERT with LoRA

In [1]:
!pip install transformers datasets peft evaluate



In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
model_checkpoint = "distilbert-base-uncased"

# Class index -> class name. The reverse map is derived from it so the two
# dictionaries cannot drift apart.
id2label = {0: "UNDEFINED", 1: "LEFT", 2: "RIGHT", 3: "CENTER"}
label2id = {name: index for index, name in id2label.items()}

# Sequence-classification model with one output per entry of id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

import os

# Location of the articles CSV. Set the POLITICAL_LEANING_CSV environment
# variable to run the notebook on another machine; the default is the path
# the notebook was originally run with.
DATA_FILE = os.environ.get(
    "POLITICAL_LEANING_CSV",
    "/Users/ilseoplee/NLPizza_final_project/2017_1.csv",
)

df = load_dataset("csv", data_files=DATA_FILE)
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 146718
    })
})

In [5]:
# Hold out 10% of the rows as the "test" split. The fixed seed makes the
# split, and therefore the reported metrics, reproducible across runs.
# NOTE: this rebinds `df` from the loaded DatasetDict to the split one, so run
# it once per load; re-running it would split the training part again.
df = df["train"].train_test_split(test_size=0.1, seed=42)

In [6]:
# Tokenizer matching the checkpoint. No extra options are passed:
# `add_prefix` is not a tokenizer setting and did not affect tokenization.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
def tokenize_function(examples):
    """Tokenize a batch of article leads and attach integer class labels.

    Args:
        examples: Batch of columns as passed by ``map(..., batched=True)``;
            the "lead" and "political_leaning" columns are read.

    Returns:
        The tokenizer output for the leads with an added "labels" entry
        holding the ``label2id`` index of every row.

    Raises:
        KeyError: If a "political_leaning" value is not a key of ``label2id``.
    """
    text = examples["lead"]
    labels = examples["political_leaning"]

    # Truncate from the left, i.e. keep the end of leads longer than max_length.
    tokenizer.truncation_side = "left"

    # No padding here: the DataCollatorWithPadding given to the Trainer pads
    # each training batch to its own longest sequence. Padding inside map()
    # would pad every row to the longest lead of the whole map batch and store
    # those pad tokens in the dataset.
    tokenized_inputs = tokenizer(text, truncation=True, max_length=512)

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs


In [8]:
# Guarantee a padding token: when the tokenizer has none, register "[PAD]"
# and grow the model's embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)

In [9]:
def preprocess_data(dataset):
    """Keep only the rows whose "lead" field is a string.

    ``None`` is not a ``str``, so rows with a missing lead are dropped by the
    same check.

    Args:
        dataset: Object exposing ``filter(predicate)``.

    Returns:
        Whatever ``dataset.filter`` returns for the string-lead predicate.
    """

    def _has_text_lead(row):
        return isinstance(row["lead"], str)

    return dataset.filter(_has_text_lead)
# Drop rows with a missing or non-string "lead"; `df` is rebound to the filtered result.
df = preprocess_data(df)

Filter:   0%|          | 0/132046 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14672 [00:00<?, ? examples/s]

In [10]:
# Apply tokenize_function in batches; the result gains the input_ids,
# attention_mask and labels columns next to the original ones.
tokenized_dataset = df.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/114318 [00:00<?, ? examples/s]

Map:   0%|          | 0/12625 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 114318
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12625
    })
})

In [11]:
# Padding collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy from a (predictions, labels) pair.

    NOTE(review): ``compute_metrics`` is defined again further down in this
    notebook; the later definition is the one in effect when the Trainer is
    built.

    Args:
        p: Pair ``(predictions, labels)``; predictions hold one score per
            class along axis 1.

    Returns:
        The dict produced by the accuracy metric, i.e. ``{"accuracy": value}``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute() already returns {"accuracy": value}; wrapping it in
    # another dict would nest the metric instead of reporting a number.
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
text_list = [
    "It was good.",
    "Not a fan, don't recommended",
    "Better than the first one.",
    "Women have the right to choose and abortion should be allowed.",
]

# Set device: prefer CUDA, then Apple-silicon MPS, then fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)
# Inference mode: dropout off, so repeated runs give the same predictions.
model.eval()

print("Untrained model")
# Only predictions are needed, so skip building the autograd graph.
with torch.no_grad():
    for text in text_list:
        # Move inputs to the same device as the model.
        inputs = tokenizer(text, return_tensors="pt").to(device)
        logits = model(**inputs).logits  # Forward pass
        predictions = torch.argmax(logits, dim=-1)
        print(f"{text} - {id2label[predictions.item()]}")

Untrained model
It was good. - UNDEFINED
Not a fan, don't recommended - CENTER
Better than the first one. - LEFT
Women have the right to choose and abortion should be allowed. - LEFT


In [14]:
# LoRA settings for sequence classification: rank-4 adapters on the modules
# named "q_lin", scaling factor 32, adapter dropout 0.01.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"],
)

In [15]:
# Wrap the base model according to peft_config and report how many parameters are trainable.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap an already-wrapped model — restart the kernel instead.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329


In [16]:
# Hyper-parameters a reader may want to tune.
lr = 1e-3
batch_size = 10
num_epochs = 5

# Evaluate and checkpoint once per epoch, then reload the best checkpoint
# when training finishes.
# NOTE(review): newer transformers releases may expect `eval_strategy` in
# place of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}lora-txt",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [17]:
def compute_metrics(eval_pred):  # Used by the Trainer during evaluation
    """Return accuracy and weighted precision / recall / F1.

    Args:
        eval_pred: Pair ``(predictions, labels)`` supplied by the Trainer;
            the predicted class is the arg-max over the last axis.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=-1)

    # 'weighted' averages the per-class scores by class support, which
    # accounts for class imbalance.
    return {
        'accuracy': accuracy_score(references, predicted_ids),
        'precision': precision_score(references, predicted_ids, average='weighted'),
        'recall': recall_score(references, predicted_ids, average='weighted'),
        'f1': f1_score(references, predicted_ids, average='weighted'),
    }


In [18]:
# Assemble the Trainer: LoRA-wrapped model, the train / held-out splits, and
# the helpers that batch the data and score the predictions.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(


In [19]:
# Run training; with the arguments above this evaluates and checkpoints after every epoch.
trainer.train()

  0%|          | 0/57160 [00:00<?, ?it/s]

{'loss': 0.9303, 'grad_norm': 4.505564212799072, 'learning_rate': 0.0009912526242127362, 'epoch': 0.04}
{'loss': 0.7897, 'grad_norm': 3.0711629390716553, 'learning_rate': 0.0009825052484254723, 'epoch': 0.09}
{'loss': 0.7629, 'grad_norm': 3.4180080890655518, 'learning_rate': 0.0009737578726382086, 'epoch': 0.13}
{'loss': 0.747, 'grad_norm': 2.914144992828369, 'learning_rate': 0.0009650104968509448, 'epoch': 0.17}
{'loss': 0.7477, 'grad_norm': 3.478550910949707, 'learning_rate': 0.000956263121063681, 'epoch': 0.22}
{'loss': 0.7469, 'grad_norm': 5.787277698516846, 'learning_rate': 0.0009475157452764171, 'epoch': 0.26}
{'loss': 0.7385, 'grad_norm': 3.0911436080932617, 'learning_rate': 0.0009387683694891532, 'epoch': 0.31}
{'loss': 0.7319, 'grad_norm': 3.49379825592041, 'learning_rate': 0.0009300209937018894, 'epoch': 0.35}
{'loss': 0.7153, 'grad_norm': 2.440849542617798, 'learning_rate': 0.0009212736179146256, 'epoch': 0.39}
{'loss': 0.7083, 'grad_norm': 3.048267126083374, 'learning_rate'

  0%|          | 0/1263 [00:00<?, ?it/s]

{'eval_loss': 0.6697376370429993, 'eval_accuracy': 0.7455049504950495, 'eval_precision': 0.7488754292354503, 'eval_recall': 0.7455049504950495, 'eval_f1': 0.7463705486154194, 'eval_runtime': 129.7641, 'eval_samples_per_second': 97.292, 'eval_steps_per_second': 9.733, 'epoch': 1.0}
{'loss': 0.6845, 'grad_norm': 3.5698649883270264, 'learning_rate': 0.0007988103568929321, 'epoch': 1.01}
{'loss': 0.6769, 'grad_norm': 10.188803672790527, 'learning_rate': 0.0007900629811056683, 'epoch': 1.05}
{'loss': 0.6794, 'grad_norm': 5.290374755859375, 'learning_rate': 0.0007813156053184045, 'epoch': 1.09}
{'loss': 0.685, 'grad_norm': 2.6466569900512695, 'learning_rate': 0.0007725682295311407, 'epoch': 1.14}
{'loss': 0.6594, 'grad_norm': 2.8511807918548584, 'learning_rate': 0.0007638208537438769, 'epoch': 1.18}
{'loss': 0.678, 'grad_norm': 4.404359817504883, 'learning_rate': 0.0007550734779566131, 'epoch': 1.22}
{'loss': 0.6711, 'grad_norm': 4.073235034942627, 'learning_rate': 0.0007463261021693492, 'ep

  0%|          | 0/1263 [00:00<?, ?it/s]

{'eval_loss': 0.6217897534370422, 'eval_accuracy': 0.7588118811881188, 'eval_precision': 0.7681203733288533, 'eval_recall': 0.7588118811881188, 'eval_f1': 0.7570002372018653, 'eval_runtime': 199.3867, 'eval_samples_per_second': 63.319, 'eval_steps_per_second': 6.334, 'epoch': 2.0}
{'loss': 0.6388, 'grad_norm': 6.76192569732666, 'learning_rate': 0.0005976207137858643, 'epoch': 2.01}
{'loss': 0.6192, 'grad_norm': 2.8097987174987793, 'learning_rate': 0.0005888733379986004, 'epoch': 2.06}
{'loss': 0.6391, 'grad_norm': 9.754402160644531, 'learning_rate': 0.0005801259622113367, 'epoch': 2.1}
{'loss': 0.6286, 'grad_norm': 3.873807668685913, 'learning_rate': 0.0005713785864240728, 'epoch': 2.14}
{'loss': 0.62, 'grad_norm': 3.5799458026885986, 'learning_rate': 0.000562631210636809, 'epoch': 2.19}
{'loss': 0.627, 'grad_norm': 3.3360211849212646, 'learning_rate': 0.0005538838348495452, 'epoch': 2.23}
{'loss': 0.6474, 'grad_norm': 7.667523384094238, 'learning_rate': 0.0005451364590622813, 'epoch':

  0%|          | 0/1263 [00:00<?, ?it/s]

{'eval_loss': 0.578837513923645, 'eval_accuracy': 0.7765544554455446, 'eval_precision': 0.7857904484585567, 'eval_recall': 0.7765544554455446, 'eval_f1': 0.7793301970423513, 'eval_runtime': 128.9481, 'eval_samples_per_second': 97.908, 'eval_steps_per_second': 9.795, 'epoch': 3.0}
{'loss': 0.612, 'grad_norm': 5.217955112457275, 'learning_rate': 0.00039643107067879635, 'epoch': 3.02}
{'loss': 0.57, 'grad_norm': 6.749016761779785, 'learning_rate': 0.00038768369489153255, 'epoch': 3.06}
{'loss': 0.557, 'grad_norm': 2.5166800022125244, 'learning_rate': 0.00037893631910426874, 'epoch': 3.11}
{'loss': 0.5807, 'grad_norm': 6.777468204498291, 'learning_rate': 0.00037018894331700494, 'epoch': 3.15}
{'loss': 0.5871, 'grad_norm': 4.157129764556885, 'learning_rate': 0.00036144156752974114, 'epoch': 3.19}
{'loss': 0.5865, 'grad_norm': 2.075314521789551, 'learning_rate': 0.0003526941917424772, 'epoch': 3.24}
{'loss': 0.5877, 'grad_norm': 4.232352256774902, 'learning_rate': 0.0003439468159552134, 'epo

  0%|          | 0/1263 [00:00<?, ?it/s]

{'eval_loss': 0.5534246563911438, 'eval_accuracy': 0.783920792079208, 'eval_precision': 0.7895896753683193, 'eval_recall': 0.783920792079208, 'eval_f1': 0.7859587409132387, 'eval_runtime': 127.2861, 'eval_samples_per_second': 99.186, 'eval_steps_per_second': 9.923, 'epoch': 4.0}
{'loss': 0.5488, 'grad_norm': 4.975074291229248, 'learning_rate': 0.0001952414275717285, 'epoch': 4.02}
{'loss': 0.528, 'grad_norm': 4.00095272064209, 'learning_rate': 0.00018649405178446465, 'epoch': 4.07}
{'loss': 0.5371, 'grad_norm': 2.7952828407287598, 'learning_rate': 0.00017774667599720085, 'epoch': 4.11}
{'loss': 0.5316, 'grad_norm': 2.629671335220337, 'learning_rate': 0.00016899930020993704, 'epoch': 4.16}
{'loss': 0.5672, 'grad_norm': 8.424694061279297, 'learning_rate': 0.00016025192442267318, 'epoch': 4.2}
{'loss': 0.5497, 'grad_norm': 2.5696334838867188, 'learning_rate': 0.00015150454863540938, 'epoch': 4.24}
{'loss': 0.5524, 'grad_norm': 9.766387939453125, 'learning_rate': 0.00014275717284814558, 'e

  0%|          | 0/1263 [00:00<?, ?it/s]

{'eval_loss': 0.5454385280609131, 'eval_accuracy': 0.7908910891089109, 'eval_precision': 0.7984121773063945, 'eval_recall': 0.7908910891089109, 'eval_f1': 0.7928343452975406, 'eval_runtime': 132.377, 'eval_samples_per_second': 95.372, 'eval_steps_per_second': 9.541, 'epoch': 5.0}
{'train_runtime': 17045.3811, 'train_samples_per_second': 33.533, 'train_steps_per_second': 3.353, 'train_loss': 0.6242901519764713, 'epoch': 5.0}


TrainOutput(global_step=57160, training_loss=0.6242901519764713, metrics={'train_runtime': 17045.3811, 'train_samples_per_second': 33.533, 'train_steps_per_second': 3.353, 'total_flos': 7.682690758385664e+16, 'train_loss': 0.6242901519764713, 'epoch': 5.0})

<!--  -->

In [20]:
# Prefer CUDA, then Apple-silicon MPS, then fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = model.to(device)
# Training may leave the model in train mode; switch dropout off so the
# predictions are deterministic.
model.eval()
print("Trained model predictions")

# Only predictions are needed, so skip building the autograd graph.
with torch.no_grad():
    for text in text_list:
        # Same input handling as the untrained-model cell: pass input_ids and
        # attention_mask together.
        inputs = tokenizer(text, return_tensors="pt").to(device)
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)
        print(f"{text} - {id2label[predictions.item()]}")

Using device: mps
Trained model predictions
It was good. - LEFT
Not a fan, don't recommended - CENTER
Better than the first one. - LEFT
Women have the right to choose and abortion should be allowed. - LEFT


In [None]:
output_model_file = "pytorch_distilbert_imbd.bin"
output_vocab_file = "vocab_distilbert_imbd.bin"  # NOTE(review): not used by any statement in this cell
adapter_dir = "LORA_distilBERT_LEAD_adapter"

# Save model
# NOTE(review): this pickles the whole model object; loading it back needs
# the same class definitions to be importable and runs pickle on load, so
# only load such files from a trusted source.
model_to_save = model
torch.save(model_to_save, output_model_file)

# Save tokenizer vocabulary in the current directory
tokenizer.save_vocabulary(".")  # Current directory

# Save model state dictionary
torch.save(model.state_dict(), "LORA_distilBERT_LEAD.pth")

# Also write copies that can be reloaded with from_pretrained(): the model in
# its save_pretrained format and the complete tokenizer, not only the vocab.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

print("All files saved")

All files saved
