In [9]:
# !pip install transformers
# !pip install datasets
# !pip install scipy transformers
!pip install torch
!pip install pandas
!pip install datasets



In [10]:
# import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, ClassLabel
import pandas as pd

csv_file = "all_years_merged.csv"
df = pd.read_csv(csv_file)

label_mapping = {'left': 0, 'center': 1, 'right': 2}
df['label'] = df['Leaning'].map(label_mapping)

df = df[['Text', 'label']]

dataset = Dataset.from_pandas(df.rename(columns={'Text': 'text', 'label': 'label'}))
dataset = dataset.cast_column('label', ClassLabel(names=['left', 'center', 'right']))

Casting the dataset:   0%|          | 0/948 [00:00<?, ? examples/s]

In [4]:
dataset[4]

{'text': 'Those hopes were quickly dashed.From his earliest days in office to his last, Obama as president was light years away from Obama on the campaign stump.',
 'label': 0}

In [11]:
#!conda create -n bert_env python=3.9
#!conda activate bert_env
#!pip install "numpy<2" scipy transformers datasets pandas
!pip install evaluate



In [None]:
# dataset["train"]

In [12]:
import evaluate
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = dataset["train"].map(
    tokenize_function,
    batched=True,
    load_from_cache_file=True,
    batch_size=16,
)

eval_dataset = dataset["test"].map(
    tokenize_function,
    batched=True,
    load_from_cache_file=True,
    batch_size=16,
)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    acc = accuracy_score(labels, predictions)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "macro_f1": f1,
    }

training_args = TrainingArguments(
    learning_rate=3e-5,
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to=[],
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")


Map:   0%|          | 0/758 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Macro F1
1,1.0761,1.061742,0.442105,0.502602,0.440476,0.419035
2,0.8989,0.874085,0.710526,0.735748,0.710317,0.715919
3,0.763,0.668123,0.8,0.812215,0.799934,0.801932


Evaluation Results: {'eval_loss': 0.6681230068206787, 'eval_accuracy': 0.8, 'eval_precision': 0.8122151433472188, 'eval_recall': 0.7999338624338623, 'eval_macro_f1': 0.80193241037326, 'eval_runtime': 1.5837, 'eval_samples_per_second': 119.971, 'eval_steps_per_second': 15.154, 'epoch': 3.0}


In [None]:
trainer.evaluate()

{'eval_loss': 1.0963804721832275,
 'eval_accuracy': 0.3050847457627119,
 'eval_runtime': 96.0192,
 'eval_samples_per_second': 0.614,
 'eval_steps_per_second': 0.083,
 'epoch': 2.0}

In [None]:
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')