# Import required libraries


In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset
from feature_extractor_ import FeatureExtractor
from preprocessor import Preprocessor
from data_loading import DataLoader_Data


# Load and preprocess the datasets

In [5]:

# Load the dataset
train_file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_train.json'
train_data_loader = DataLoader_Data(train_file_path)

val_file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_dev.json'
val_data_loader = DataLoader_Data(val_file_path)

# Initialize the preprocessor
preprocessor = Preprocessor(language='english', remove_urls=True, remove_special_characters=True,
                            remove_stopwords=True, remove_noise_words=True, remove_emojis=True,
                            apply_stemming=False, apply_lemmatization=False)

# Preprocess the training data
for item in train_data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, evidence_entry in enumerate(item['evidence']):
        item['evidence'][i][2] = preprocessor.preprocess_text(evidence_entry[2])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])

# Preprocess the validation data
for item in val_data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, evidence_entry in enumerate(item['evidence']):
        item['evidence'][i][2] = preprocessor.preprocess_text(evidence_entry[2])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])


# Evidence Retrieval Stage (Using FeatureExtractor)

In [6]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the BERT feature extractor
feature_extractor = FeatureExtractor(method='bert', bert_model_name='bert-base-uncased', batch_size=16)

def retrieve_evidence(rumor, timeline, top_n=5):
    rumor_embedding = feature_extractor.transform([rumor])
    
    # Compute embeddings for all timeline entries
    timeline_texts = [entry[2] for entry in timeline]
    timeline_embeddings = feature_extractor.transform(timeline_texts)
    
    # Compute cosine similarity between the rumor and all timeline embeddings
    similarities = cosine_similarity(rumor_embedding, timeline_embeddings)[0]
    
    # Get top N most similar timeline entries
    top_indices = np.argsort(similarities)[-top_n:]
    selected_evidence = [timeline[i][2] for i in top_indices]
    
    return " ".join(selected_evidence)




# Rumor Classification Stage

In [7]:
class RumorDataset(Dataset):
    def __init__(self, rumors, labels, tokenizer, max_len=512):
        self.rumors = rumors
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.rumors)

    def __getitem__(self, index):
        rumor = self.rumors[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            rumor,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Prepare the data


In [8]:
def prepare_data(data_loader):
    rumors = []
    labels = []
    for item in data_loader.data:
        # Retrieve top 5 evidence tweets from the timeline
        top_evidence = retrieve_evidence(item['rumor'], item['timeline'])
        combined_text = item['rumor'] + " " + top_evidence
        
        rumors.append(combined_text)
        # Convert labels to integers
        if item['label'] == 'REFUTES':
            labels.append(0)
        elif item['label'] == 'SUPPORTS':
            labels.append(1)
        else:  # NOT ENOUGH INFO
            labels.append(2)

    return rumors, labels

# Prepare data for training and validation
train_rumors, train_labels = prepare_data(train_data_loader)
val_rumors, val_labels = prepare_data(val_data_loader)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create PyTorch datasets
train_dataset = RumorDataset(train_rumors, train_labels, tokenizer, max_len=512)
val_dataset = RumorDataset(val_rumors, val_labels, tokenizer, max_len=512)




# Model Evaluation

In [9]:
# Load the fine-tuned classification model
model = BertForSequenceClassification.from_pretrained("/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/src/models/Rumor_classifier")

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=1)
    accuracy = accuracy_score(labels, predictions)
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate the model
trainer.evaluate()


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': nan,
 'eval_model_preparation_time': 0.0014,
 'eval_precision': 0.7587890625,
 'eval_recall': 0.40625,
 'eval_f1': 0.2347222222222222,
 'eval_accuracy': 0.40625,
 'eval_runtime': 3.9094,
 'eval_samples_per_second': 8.185,
 'eval_steps_per_second': 1.023}