# English data

In [2]:
from preparing.data_loading import DataLoader
from preparing.preprocessor import Preprocessor
from preparing.data_cleaning import DataCleaner
from utils.preprocessing import preprocess_data
from utils.feature_extractor import FeatureExtractor
from utils.similarity_calculation import calculate_similarities
from evaluation.retrieval_evaluation import evaluate_recall_at_k, evaluate_map
from utils.data_split import load_and_combine_datasets, stratified_split
import numpy as np

# Locations of the raw English train and dev splits.
train_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/English_train.json'
dev_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/English_dev.json'

# Merge both splits into a single dataset.
data = load_and_combine_datasets(train_file, dev_file)

# Clean the combined data, then run it through the preprocessor.
cleaner = DataCleaner()
clean_data = cleaner.remove_invalid_tweets(data)
preprocessor = Preprocessor()
preprocessed_data = preprocess_data(clean_data, preprocessor)

# Split on the 'label' key; only the test portion is used in this cell.
train_data, test_data = stratified_split(preprocessed_data, label_key='label')

# Feature extractor configured with the "sbert" method.
extractor = FeatureExtractor(method="sbert")

# Collect the texts to embed. Timeline entries are flattened across all test
# items (in item order) so they can be transformed in one call; the text of a
# timeline entry sits at index 2.
rumor_texts_test = [item['rumor'] for item in test_data]
flat_timeline_entries = [
    entry for item in test_data for entry in item['timeline']
]
timeline_texts_test = [entry[2] for entry in flat_timeline_entries]

rumor_vectors_test = extractor.transform(rumor_texts_test)
timeline_vectors_test = extractor.transform(timeline_texts_test)

# Store each rumor vector on its item under 'rumor_vector'.
for position, item in enumerate(test_data):
    item['rumor_vector'] = rumor_vectors_test[position]

# Append each timeline vector to the entry it was computed from; the flattened
# list preserves the order the texts were gathered in.
for position, entry in enumerate(flat_timeline_entries):
    entry.append(timeline_vectors_test[position])

# Similarities between rumors and their timeline entries.
similarities = calculate_similarities(test_data)

# Recall at k = 5, 10, 15, followed by MAP.
recall_at_5, recall_at_10, recall_at_15 = [
    evaluate_recall_at_k(test_data, similarities, k) for k in (5, 10, 15)
]
map_score = evaluate_map(test_data, similarities)

print(f"Recall@5 : {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")
print(f"Recall@15: {recall_at_15:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")



Recall@5 : 0.6362
Recall@10: 0.7607
Recall@15: 0.7607
Mean Average Precision (MAP): 0.6635


# Arabic data

In [5]:

# Locations of the raw Arabic train and dev splits.
arabic_train_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/Arabic_train.json'
arabic_dev_file = '/Users/alaaeddinalia/Desktop/thesis_submission /Rumor_verification/data/raw/Arabic_dev.json'

# Merge both splits into a single dataset.
data = load_and_combine_datasets(arabic_train_file, arabic_dev_file)

# Clean the combined data, then run it through the preprocessor.
cleaner = DataCleaner()
clean_data = cleaner.remove_invalid_tweets(data)
preprocessor = Preprocessor()
preprocessed_data = preprocess_data(clean_data, preprocessor)

# Split on the 'label' key; only the test portion is used in this cell.
train_data, test_data = stratified_split(preprocessed_data, label_key='label')

# Feature extractor configured with the "sbert" method.
extractor = FeatureExtractor(method="sbert")

# Collect the texts to embed. Timeline entries are flattened across all test
# items (in item order) so they can be transformed in one call; the text of a
# timeline entry sits at index 2.
rumor_texts_test = [item['rumor'] for item in test_data]
flat_timeline_entries = [
    entry for item in test_data for entry in item['timeline']
]
timeline_texts_test = [entry[2] for entry in flat_timeline_entries]

rumor_vectors_test = extractor.transform(rumor_texts_test)
timeline_vectors_test = extractor.transform(timeline_texts_test)

# Store each rumor vector on its item under 'rumor_vector'.
for position, item in enumerate(test_data):
    item['rumor_vector'] = rumor_vectors_test[position]

# Append each timeline vector to the entry it was computed from; the flattened
# list preserves the order the texts were gathered in.
for position, entry in enumerate(flat_timeline_entries):
    entry.append(timeline_vectors_test[position])

# Similarities between rumors and their timeline entries.
similarities = calculate_similarities(test_data)

# Recall at k = 5, 10, 15, followed by MAP.
recall_at_5, recall_at_10, recall_at_15 = [
    evaluate_recall_at_k(test_data, similarities, k) for k in (5, 10, 15)
]
map_score = evaluate_map(test_data, similarities)

print(f"Recall@5 : {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")
print(f"Recall@15: {recall_at_15:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")



Recall@5 : 0.7778
Recall@10: 0.8000
Recall@15: 0.8000
Mean Average Precision (MAP): 0.7085
