# Fine-tune BERT with train and eval datasets

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import json
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

from preprocessor import Preprocessor
from data_loading import DataLoader_Data


In [2]:
# Load and preprocess the training dataset
train_file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_train.json'
train_data_loader = DataLoader_Data(train_file_path)

# Load and preprocess the validation dataset
val_file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_train.json'
val_data_loader = DataLoader_Data(val_file_path)


preprocessor = Preprocessor(language='english', remove_urls=True, remove_special_characters=True,
                            remove_stopwords=True, remove_noise_words=True, remove_emojis=True,
                            apply_stemming=False, apply_lemmatization=False)

# Preprocess the training data
for item in train_data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, evidence_entry in enumerate(item['evidence']):
        item['evidence'][i][2] = preprocessor.preprocess_text(evidence_entry[2])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])

# Preprocess the validation data
for item in val_data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, evidence_entry in enumerate(item['evidence']):
        item['evidence'][i][2] = preprocessor.preprocess_text(evidence_entry[2])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])


In [3]:
def prepare_data(data_loader):
    texts = []
    labels = []
    for item in data_loader.data:
        evidence_text = " ".join([e[2] for e in item['evidence']])
        combined_text = item['rumor'] + " " + evidence_text
        texts.append(combined_text)

        # Convert labels to integers
        if item['label'] == 'REFUTES':
            labels.append(0)
        elif item['label'] == 'SUPPORTS':
            labels.append(1)
        else:  # NOT ENOUGH INFO
            labels.append(2)

    return texts, labels

# Prepare data for training and validation
train_texts, train_labels = prepare_data(train_data_loader)
val_texts, val_labels = prepare_data(val_data_loader)


In [5]:
import torch
from torch.utils.data import Dataset

class RumorDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [6]:
# Assume prepare_data function from earlier is used to get texts and labels
train_texts, train_labels = prepare_data(train_data_loader)
val_texts, val_labels = prepare_data(val_data_loader)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create PyTorch datasets
train_dataset = RumorDataset(train_texts, train_labels, tokenizer, max_len=512)
val_dataset = RumorDataset(val_texts, val_labels, tokenizer, max_len=512)



In [7]:
# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)

In [9]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',         
    logging_steps=10,
)


In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

   
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted',zero_division=1)
    

    accuracy = accuracy_score(labels, predictions)
    
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [12]:
# Train the model.
# NOTE(review): the run recorded below logs grad_norm=nan from the first logging
# step, then loss=0.0 and eval_loss=nan, with identical eval metrics after every
# epoch - presumably the weights became NaN early on; investigate before relying
# on the results of this run.
trainer.train()

  0%|          | 0/120 [00:00<?, ?it/s]

{'loss': 1.0221, 'grad_norm': nan, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.83}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 4.0839, 'eval_samples_per_second': 7.836, 'eval_steps_per_second': 0.979, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.67}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 4.4199, 'eval_samples_per_second': 7.24, 'eval_steps_per_second': 0.905, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.5}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 5.1696, 'eval_samples_per_second': 6.19, 'eval_steps_per_second': 0.774, 'epoch': 3.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 5.2525, 'eval_samples_per_second': 6.092, 'eval_steps_per_second': 0.762, 'epoch': 4.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.1666666666666668e-05, 'epoch': 4.17}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 4.1771, 'eval_samples_per_second': 7.661, 'eval_steps_per_second': 0.958, 'epoch': 5.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 8.333333333333334e-06, 'epoch': 5.83}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 3.6883, 'eval_samples_per_second': 8.676, 'eval_steps_per_second': 1.085, 'epoch': 6.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 6.666666666666667e-06, 'epoch': 6.67}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 3.6718, 'eval_samples_per_second': 8.715, 'eval_steps_per_second': 1.089, 'epoch': 7.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-06, 'epoch': 7.5}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 3.8114, 'eval_samples_per_second': 8.396, 'eval_steps_per_second': 1.049, 'epoch': 8.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.3333333333333333e-06, 'epoch': 8.33}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 5.8174, 'eval_samples_per_second': 5.501, 'eval_steps_per_second': 0.688, 'epoch': 9.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.6666666666666667e-06, 'epoch': 9.17}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_precision': 0.7587890625, 'eval_recall': 0.40625, 'eval_f1': 0.2347222222222222, 'eval_accuracy': 0.40625, 'eval_runtime': 5.8267, 'eval_samples_per_second': 5.492, 'eval_steps_per_second': 0.686, 'epoch': 10.0}
{'train_runtime': 8687.6938, 'train_samples_per_second': 0.111, 'train_steps_per_second': 0.014, 'train_loss': 0.08517306645711263, 'epoch': 10.0}


TrainOutput(global_step=120, training_loss=0.08517306645711263, metrics={'train_runtime': 8687.6938, 'train_samples_per_second': 0.111, 'train_steps_per_second': 0.014, 'total_flos': 252588881018880.0, 'train_loss': 0.08517306645711263, 'epoch': 10.0})

In [13]:
# Run a final evaluation with the trainer and display the returned metrics dict
trainer.evaluate()

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan,
 'eval_precision': 0.7587890625,
 'eval_recall': 0.40625,
 'eval_f1': 0.2347222222222222,
 'eval_accuracy': 0.40625,
 'eval_runtime': 4.2532,
 'eval_samples_per_second': 7.524,
 'eval_steps_per_second': 0.94,
 'epoch': 10.0}

In [14]:
model.save_pretrained("/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/src/models/Rumor_classifier")
tokenizer.save_pretrained("/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/src/models/Rumor_classifier")

('../data/Rumor_classifier/tokenizer_config.json',
 '../data/Rumor_classifier/special_tokens_map.json',
 '../data/Rumor_classifier/vocab.txt',
 '../data/Rumor_classifier/added_tokens.json')