# finetune Bert with train and Eval Datasets

#   Load and Preprocess the Data

In [None]:

from preprocessor import Preprocessor
from data_loading import DataLoader_Data

# Load the dataset
file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_train.json'
data_loader = DataLoader_Data(file_path)

# Initialize the preprocessor
preprocessor = Preprocessor(language='english', remove_urls=True, remove_special_characters=True
                            , remove_stopwords=True , remove_noise_words=True, remove_emojis=True,
                            apply_stemming=False, apply_lemmatization=False)

# Preprocess the rumors and evidence
for item in data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, evidence_entry in enumerate(item['evidence']):
        item['evidence'][i][2] = preprocessor.preprocess_text(evidence_entry[2])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])

#  Pretrain BERT Model for Rumor Classification


In [None]:
# Prepare combined texts and labels for training
texts = []
labels = []

for item in data_loader.data:
    evidence_text = " ".join([e[2] for e in item['evidence']])  # Combine all evidence tweets
    combined_text = item['rumor'] + " " + evidence_text
    texts.append(combined_text)
    
    # Convert labels to integers
    if item['label'] == 'REFUTES':
        labels.append(0)
    elif item['label'] == 'SUPPORTS':
        labels.append(1)
    else:  # NOT ENOUGH INFO
        labels.append(2)

# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)


# Tokenize Preprocessed Data and Create Datasets

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

class RumorDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the datasets using preprocessed texts
train_dataset = RumorDataset(train_texts, train_labels, tokenizer, max_len=512)
val_dataset = RumorDataset(val_texts, val_labels, tokenizer, max_len=512)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [34]:
train_dataset[0]

{'text': '“ Watch Qatari suddenly fell inside supermarket Doha suspicions infected Coronan Ministry Public Health confirms circulated social media shopper falling unconscious result infected Corona virus incorrect point shopper suffered sudden fatigue resulted loss balance good health Ministry calls necessity avoiding spreading rumors ensuring accuracy MOPHQatar Ministry Public Health confirms circulated social media shopper falling unconscious result infected Corona virus incorrect would like point shopper suffered sudden fatigue resulting loss balance good health Ministry calls necessity Avoid spreading rumors accurate Ministry Public Health confirms circulated social media shopper falling unconscious result infection Corona virus incorrect would like point shopper suffered sudden fatigue resulting loss balance good health Ministry calls need avoid spreading rumors investigate Precision MOPHQatar Ministry Public Health confirms circulated social media shopper falling unconscious resu

# Fine-tune BERT Model

In [4]:
from transformers import BertForSequenceClassification, AdamW

# Initialize the BERT model for sequence classification
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model = model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss}")

# Save the fine-tuned model
model.save_pretrained('../model_2/rumor_verification_classifier')
tokenizer.save_pretrained('../model_2/rumor_verification_classifier')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 1.0341541767120361
Epoch 2/10, Loss: 0.8659103115399679
Epoch 3/10, Loss: 0.7162215560674667
Epoch 4/10, Loss: 0.6065972447395325
Epoch 5/10, Loss: 0.5441709607839584
Epoch 6/10, Loss: 0.46504998455444974
Epoch 7/10, Loss: 0.434492955605189
Epoch 8/10, Loss: 0.3829097996155421
Epoch 9/10, Loss: 0.31461478273073834
Epoch 10/10, Loss: 0.25659436732530594


('../model_2/rumor_verification_classifier/tokenizer_config.json',
 '../model_2/rumor_verification_classifier/special_tokens_map.json',
 '../model_2/rumor_verification_classifier/vocab.txt',
 '../model_2/rumor_verification_classifier/added_tokens.json')

In [10]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from feature_extractor_ import FeatureExtractor

# Assuming your FeatureExtractor class is imported or defined in the same notebook
# Initialize the feature extractor using BERT
extractor = FeatureExtractor(method='bert', bert_model_name='bert-base-uncased', batch_size=16)

# Function to extract features for the entire dataset
def extract_features(dataset, extractor):
    features = []
    
    for item in dataset:
        rumor = item['rumor']
        timeline = item['timeline']
        
        # Extract features for the rumor
        rumor_embedding = extractor.fit_transform([rumor])[0]
        
        # Extract features for the timeline entries
        timeline_texts = [entry[2] for entry in timeline]
        timeline_embeddings = extractor.transform(timeline_texts)
        
        features.append({
            'id': item['id'],
            'rumor': rumor,
            'rumor_embedding': rumor_embedding,
            'timeline_embeddings': timeline_embeddings,
            'timeline': timeline,  # Include the timeline entries here
            'true_label': item['label']
        })
    
    return features

# Perform feature extraction on the entire dataset
features = extract_features(data_loader.data, extractor)

# Print a few examples to verify
for item in features[:3]:
    print(f"ID: {item['id']}, Rumor: {item['rumor']}")
    print(f"Rumor Embedding: {item['rumor_embedding'].shape}, Timeline Embeddings: {item['timeline_embeddings'].shape}\n")




ID: AuRED_014, Rumor: “ Urgent Ramallah Ministry Health spokesman Kamal AlShakhra received 2000 doses American “ Moderna ” Corona vaccine batch designated President Abbas Fatah Central Committee VIPs ”
Rumor Embedding: (768,), Timeline Embeddings: (75, 768)

ID: AuRED_037, Rumor: Macron Sky News visit Mrs Fairouz last night visit Jaj Cedar Reserve realized love large section Lebanese people President Republic Fairouz also told appreciation love President reform project President Republic wants implement also salute President Republic efforts patience
Rumor Embedding: (768,), Timeline Embeddings: (1137, 768)

ID: AuRED_085, Rumor: Saudi Arabia evacuated 10 students China plane could accommodate 500 passengers evacuate 170 Yemeni students participating 100 warplanes bomb Yemen May God protect Sultanate Oman sending private plane Yemeni students stranded China good neighbor Oman coOfDSIoD5ry
Rumor Embedding: (768,), Timeline Embeddings: (139, 768)



In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to retrieve top N evidence based on cosine similarity
def retrieve_evidence_from_features(features, top_n=5):
    evidence_results = []
    
    for item in features:
        rumor_embedding = item['rumor_embedding']
        timeline_embeddings = item['timeline_embeddings']
        
        # Calculate cosine similarity between rumor and timeline entries
        similarities = cosine_similarity([rumor_embedding], timeline_embeddings)[0]
        
        # Get the top N most similar timeline entries
        top_indices = np.argsort(similarities)[-top_n:][::-1]
        selected_evidence_texts = [item['timeline'][i][2] for i in top_indices]
        selected_evidence = " ".join(selected_evidence_texts)
        
        evidence_results.append({
            'id': item['id'],
            'rumor': item['rumor'],
            'selected_evidence': selected_evidence,
            'true_label': item['true_label']
        })
    
    return evidence_results

# Perform evidence retrieval using the extracted features
evidence_results = retrieve_evidence_from_features(features)

# Print a few examples to verify
for result in evidence_results[:3]:
    print(f"ID: {result['id']}, Rumor: {result['rumor']}")
    print(f"Selected Evidence: {result['selected_evidence']}\n")


ID: AuRED_014, Rumor: “ Urgent Ramallah Ministry Health spokesman Kamal AlShakhra received 2000 doses American “ Moderna ” Corona vaccine batch designated President Abbas Fatah Central Committee VIPs ”
Selected Evidence: “ Minister Health Dr Mai AlKaila said number convoys sent donation Center reached 5 convoys carrying medical supplies devices medicines Hashemite Charitable Society delivered Ministry Health Palestinian Embassy Amman KSRelief ” Palestinian Minister Health Dr Mai Alkaila 10000 doses Russian Sputnik vaccine arrive today Ministry Health COVID19 SputnikV vaccines Palestine said 730 teams medical health personnel equipped vaccination distributed Ministry Health centers throughout country vaccinate target groups vaccines arrive vaccine palestine COVID19 Minister Health Dr MaiAlkaila inside military hospital Nablus designated treating Covid19 patients various coronavirus treatment centers many governorates today witnessing vaccination campaign health personnel working startin

In [24]:
# Ensure the model is on the correct device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)

# Function to classify rumors using the retrieved evidence, including probability predictions
def classify_rumors_with_probabilities(evidence_results, model, tokenizer, device):
    classification_results = []
    label_map = {0: "REFUTES", 1: "SUPPORTS", 2: "NOT ENOUGH INFO"}

    for item in evidence_results:
        rumor = item['rumor']
        combined_text = rumor + " " + item['selected_evidence']

        # Tokenize and prepare the input for BERT
        inputs = tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=256,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Perform classification with the fine-tuned BERT model
        model.eval()
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = F.softmax(logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            predicted_prob = probabilities[0].cpu().numpy()  # Convert tensor to numpy array

        # Store the classification result
        classification_results.append({
            'id': item['id'],
            'rumor': rumor,
            'predicted_label': label_map[prediction],
            'true_label': item['true_label'],
            'selected_evidence': item['selected_evidence'],
            'predicted_probabilities': predicted_prob  # Include predicted probabilities
        })

    return classification_results


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_dir = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/src/models/rumor_verification_classifier'  # Path where your fine-tuned model is saved
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Ensure the model is on the correct device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)

# Function to classify rumors using the retrieved evidence
def classify_rumors(evidence_results, model, tokenizer, device):
    classification_results = []
    label_map = {0: "REFUTES", 1: "SUPPORTS", 2: "NOT ENOUGH INFO"}

    for item in evidence_results:
        rumor = item['rumor']
        combined_text = rumor + " " + item['selected_evidence']

        # Tokenize and prepare the input for BERT
        inputs = tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=256,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Perform classification with the fine-tuned BERT model
        model.eval()
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            prediction = torch.argmax(logits, dim=1).item()

        # Store the classification result
        classification_results.append({
            'id': item['id'],
            'rumor': rumor,
            'predicted_label': label_map[prediction],
            'true_label': item['true_label'],
            'selected_evidence': item['selected_evidence']
        })

    return classification_results

# Perform classification using the retrieved evidence
classification_results = classify_rumors(evidence_results, model, tokenizer, device)

# Print a few examples to verify
for result in classification_results[:3]:
    print(f"ID: {result['id']}, Rumor: {result['rumor']}")
    print(f"Predicted Label: {result['predicted_label']}, True Label: {result['true_label']}")
    print(f"Selected Evidence: {result['selected_evidence']}\n")


ID: AuRED_014, Rumor: “ Urgent Ramallah Ministry Health spokesman Kamal AlShakhra received 2000 doses American “ Moderna ” Corona vaccine batch designated President Abbas Fatah Central Committee VIPs ”
Predicted Label: REFUTES, True Label: REFUTES
Selected Evidence: “ Minister Health Dr Mai AlKaila said number convoys sent donation Center reached 5 convoys carrying medical supplies devices medicines Hashemite Charitable Society delivered Ministry Health Palestinian Embassy Amman KSRelief ” Palestinian Minister Health Dr Mai Alkaila 10000 doses Russian Sputnik vaccine arrive today Ministry Health COVID19 SputnikV vaccines Palestine said 730 teams medical health personnel equipped vaccination distributed Ministry Health centers throughout country vaccinate target groups vaccines arrive vaccine palestine COVID19 Minister Health Dr MaiAlkaila inside military hospital Nablus designated treating Covid19 patients various coronavirus treatment centers many governorates today witnessing vaccina

In [13]:
from evaluation import Evaluation


In [14]:
# Convert true and predicted labels to integers
label_map = {"REFUTES": 0, "SUPPORTS": 1, "NOT ENOUGH INFO": 2}
true_labels_int = [label_map[result['true_label']] for result in classification_results]
predicted_labels_int = [label_map[result['predicted_label']] for result in classification_results]

# Instantiate the Evaluation class with true and predicted labels
evaluator = Evaluation(y_true=true_labels_int, y_pred=predicted_labels_int)

# Get individual metrics
precision = evaluator.precision()
recall = evaluator.recall()
f1 = evaluator.f1()

# Get all metrics together
metrics = evaluator.all_metrics()

# Print the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Print all metrics at once
print("All Metrics:", metrics)


Precision: 0.7046
Recall: 0.5104
F1 Score: 0.4450
All Metrics: {'precision': 0.7046222810111699, 'recall': 0.5104166666666666, 'f1_score': 0.4450373482726424}
