# English data


In [1]:
from preparing.data_loading import DataLoader
from preparing.preprocessor import Preprocessor
from preparing.data_cleaning import DataCleaner
from utils.preprocessing import preprocess_data
from utils.feature_extractor import FeatureExtractor
from evaluation.retrieval_evaluation_bm25 import evaluate_recall_at_k_bm25, evaluate_map_bm25
from utils.data_split import load_and_combine_datasets, stratified_split
import numpy as np

# ---------------------------------------------------------------------------
# English: BM25 retrieval baseline
#
# Steps: combine train+dev, clean, preprocess, split on 'label', score every
# test rumor's timeline with BM25, then report Recall@{5,10,15} and MAP.
# ---------------------------------------------------------------------------

# Raw English splits (absolute paths; adjust when running on another machine).
train_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/English_train.json'
dev_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/English_dev.json'

# Merge both files into a single dataset before any filtering.
data = load_and_combine_datasets(train_file, dev_file)

# Cleaning pass, followed by text preprocessing of what remains.
cleaner = DataCleaner()
clean_data = cleaner.remove_invalid_tweets(data)
preprocessor = Preprocessor()
preprocessed_data = preprocess_data(clean_data, preprocessor)

# Split using the 'label' field; only the test portion is evaluated below.
train_data, test_data = stratified_split(preprocessed_data, label_key='label')

# One extractor instance, re-fitted for every rumor's own timeline.
extractor = FeatureExtractor(method="bm25")

for item in test_data:
    query_text = item['rumor']
    timeline = item['timeline']
    # Element 2 of each timeline entry is what gets indexed as the document.
    candidate_texts = [entry[2] for entry in timeline]

    # Fit on this rumor's candidates only; the returned value is not needed.
    extractor.fit_transform(candidate_texts)

    # Score the rumor (as a one-element query batch) against the candidates.
    # NOTE(review): _bm25_transform is underscore-prefixed, i.e. non-public on
    # FeatureExtractor -- confirm whether a public scoring method exists.
    bm25_scores = extractor._bm25_transform([query_text])

    # Attach each candidate's score as a new trailing element of its entry
    # (mutates the entries in place; the evaluators below read test_data).
    for position, entry in enumerate(timeline):
        entry.append(bm25_scores[0][position])

# Recall at the three cut-offs, evaluated in ascending order of k.
recall_at_5, recall_at_10, recall_at_15 = (
    evaluate_recall_at_k_bm25(test_data, cutoff) for cutoff in (5, 10, 15)
)

# Mean average precision over the same scored test set.
map_score = evaluate_map_bm25(test_data)

# Report.
print(f"Recall@5 : {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")
print(f"Recall@15: {recall_at_15:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")


  from tqdm.autonotebook import tqdm, trange


Recall@5 : 0.6044
Recall@10: 0.7029
Recall@15: 0.7436
Mean Average Precision (MAP): 0.6226


# Arabic data

In [5]:

# ---------------------------------------------------------------------------
# Arabic: BM25 retrieval baseline
#
# Same pipeline as for the English data, pointed at the Arabic files. Uses
# the helpers imported at the top of the notebook.
# ---------------------------------------------------------------------------

# Raw Arabic splits (absolute paths; adjust when running on another machine).
train_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/Arabic_train.json'
dev_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/Arabic_dev.json'

# Merge both files into a single dataset before any filtering.
data = load_and_combine_datasets(train_file, dev_file)

# Cleaning pass, followed by text preprocessing of what remains.
cleaner = DataCleaner()
clean_data = cleaner.remove_invalid_tweets(data)
preprocessor = Preprocessor()
preprocessed_data = preprocess_data(clean_data, preprocessor)

# Split using the 'label' field; only the test portion is evaluated below.
train_data, test_data = stratified_split(preprocessed_data, label_key='label')

# One extractor instance, re-fitted for every rumor's own timeline.
extractor = FeatureExtractor(method="bm25")

for item in test_data:
    query_text = item['rumor']
    timeline = item['timeline']
    # Element 2 of each timeline entry is what gets indexed as the document.
    candidate_texts = [entry[2] for entry in timeline]

    # Fit on this rumor's candidates only; the returned value is not needed.
    extractor.fit_transform(candidate_texts)

    # Score the rumor (as a one-element query batch) against the candidates.
    # NOTE(review): _bm25_transform is underscore-prefixed, i.e. non-public on
    # FeatureExtractor -- confirm whether a public scoring method exists.
    bm25_scores = extractor._bm25_transform([query_text])

    # Attach each candidate's score as a new trailing element of its entry
    # (mutates the entries in place; the evaluators below read test_data).
    for position, entry in enumerate(timeline):
        entry.append(bm25_scores[0][position])

# Recall at the three cut-offs, evaluated in ascending order of k.
recall_at_5, recall_at_10, recall_at_15 = (
    evaluate_recall_at_k_bm25(test_data, cutoff) for cutoff in (5, 10, 15)
)

# Mean average precision over the same scored test set.
map_score = evaluate_map_bm25(test_data)

# Report.
print(f"Recall@5 : {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")
print(f"Recall@15: {recall_at_15:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")


Recall@5 : 0.7833
Recall@10: 0.8222
Recall@15: 0.9000
Mean Average Precision (MAP): 0.7937
