In [7]:
import json
from data_loading import DataLoader_Data

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path (it also contains a space
# after "thesis"); the cell only runs as-is on the author's machine. Consider deriving
# it from a configurable data directory.
file_path = '/Users/alaaeddinalia/Desktop/thesis _Rumor_verifiction/Rumor_verification/data/raw/English_train.json'
# DataLoader_Data is the project-local loader imported from data_loading; it is given
# the JSON path and load_data() returns the collection bound to `data`.
# presumably a list of rumor records — `len()` is taken on it below; verify in data_loading.
data_loader = DataLoader_Data(file_path)
data = data_loader.load_data()

# Report how many records were loaded as a quick sanity check.
print(f"Loaded {len(data)} rumors from the dataset.")


Loaded 96 rumors from the dataset.


In [8]:
from language_detector import LanguageDetector
from preprocessor import Preprocessor
from data_cleaning import DatasetCleaner

# Clean the dataset
dataset_cleaner = DatasetCleaner()
cleaned_data = dataset_cleaner.remove_irrelevant_rumors(data)

# The language is detected from the first rumor below, so an empty dataset must be
# reported clearly rather than surfacing as an IndexError on cleaned_data[0].
if len(cleaned_data) == 0:
    raise ValueError("No rumors left after cleaning; cannot detect language or preprocess.")

# Detect language from the first rumor
# NOTE(review): this assumes every rumor in the file shares the first rumor's language.
language_detector = LanguageDetector()
detected_language = language_detector.detect_language(cleaned_data[0]['rumor'])

# Preprocess the dataset
preprocessor = Preprocessor(
    language=detected_language,
    remove_urls=True,
    remove_noise_words=True,
    remove_special_characters=False,
    apply_lemmatization=False
)

# Build NEW records instead of rewriting the items of `cleaned_data` in place.
# Mutating the source items made this cell non-idempotent: a re-run would preprocess
# already-preprocessed text whenever the cleaner hands back the same objects, and any
# later mutation of `preprocessed_data` would also show up in `cleaned_data`.
preprocessed_data = []
for item in cleaned_data:
    processed_item = dict(item)  # shallow copy; only 'rumor' and 'timeline' are replaced
    processed_item['rumor'] = preprocessor.preprocess_text(item['rumor'])

    processed_timeline = []
    for timeline_entry in item['timeline']:
        # Copy the entry so the source entry keeps its raw text; index 2 holds the text.
        processed_entry = list(timeline_entry)
        processed_entry[2] = preprocessor.preprocess_text(timeline_entry[2])
        processed_timeline.append(processed_entry)
    processed_item['timeline'] = processed_timeline

    preprocessed_data.append(processed_item)

# Save preprocessed data
# NOTE(review): absolute, machine-specific output path; the target directory must already exist.
preprocessed_file_path = '/Users/alaaeddinalia/Desktop/thesis _Rumor_verifiction/Rumor_verification/data/processed/English_train_preprocessed_clean.json'
with open(preprocessed_file_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    json.dump(preprocessed_data, f, ensure_ascii=False, indent=4)

print(f"Preprocessed data saved to {preprocessed_file_path}.")


Preprocessed data saved to /Users/alaaeddinalia/Desktop/thesis _Rumor_verifiction/Rumor_verification/data/processed/English_train_preprocessed_clean.json.


In [10]:
from feature_extractor_ import FeatureExtractor
import numpy as np

# Initialize SBERT extractor
extractor = FeatureExtractor(method='sbert', sbert_model_name='paraphrase-multilingual-MiniLM-L12-v2', batch_size=16)

# Collect every text to embed in ONE batch: all rumor texts first, followed by all
# timeline texts flattened in dataset order. The assignment loop below depends on
# exactly this layout.
rumor_texts = [item['rumor'] for item in preprocessed_data]
timeline_texts = [
    timeline_entry[2]
    for item in preprocessed_data
    for timeline_entry in item['timeline']
]
all_texts = rumor_texts + timeline_texts

# assumes fit_transform returns one vector per input text, in input order — TODO confirm
vectors = extractor.fit_transform(all_texts)

# Fail before touching the dataset if the extractor did not return one vector per text;
# otherwise the loop below could stop half-way and leave the data partially updated.
if len(vectors) != len(all_texts):
    raise ValueError(
        f"Expected {len(all_texts)} embeddings but the extractor returned {len(vectors)}."
    )

# Assign SBERT vectors to the data.
# Rumor i lives at vectors[i]; timeline vectors start AFTER all rumor vectors. The
# previous version walked a single running index through rumors and timelines
# interleaved, which does not match the layout of `all_texts` and attached the wrong
# embeddings to every timeline entry and to every rumor after the first.
timeline_vector_index = len(rumor_texts)
for rumor_index, item in enumerate(preprocessed_data):
    item['rumor_vector'] = vectors[rumor_index].tolist()
    for timeline_entry in item['timeline']:
        embedding = vectors[timeline_vector_index].tolist()
        timeline_vector_index += 1

        # Keep the cell idempotent: if the last element already looks like an embedding
        # written by an earlier run (a list of the same dimensionality), replace it
        # instead of appending a second copy.
        # NOTE(review): assumes the raw timeline fields are not lists of that length.
        last_field = timeline_entry[-1]
        if isinstance(last_field, list) and len(last_field) == len(embedding):
            timeline_entry[-1] = embedding
        else:
            timeline_entry.append(embedding)

print("Extracted SBERT embeddings and assigned them to the dataset.")




Extracted SBERT embeddings and assigned them to the dataset.


In [11]:
# Identify relevant tweets based on cosine similarity
from relevant_timeline_retriever import RelevantTimelineRetriever
from sklearn.metrics.pairwise import cosine_similarity

# Project-local retriever, configured to use the 'mean-shift' thresholding technique.
retriever = RelevantTimelineRetriever(
    thresholding_technique='mean-shift',
)

# Step 1: similarity scores between each rumor and its timeline tweets,
# computed by the retriever from the embedded dataset.
timeline_similarities = retriever.calculate_similarities(preprocessed_data)

# Step 2: use those scores to pick the relevant timeline entries for each rumor.
relevant_timeline_entries = retriever.retrieve_relevant_timelines(
    timeline_similarities,
    preprocessed_data,
)

print("Identified relevant timelines based on cosine similarity.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Identified relevant timelines based on cosine similarity.


In [27]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Value forwarded to predict_rumor_stance as `threshold_t2`; named so it can be tuned
# in one place instead of being buried in the loop.
STANCE_THRESHOLD_T2 = 0.5

# Parallel lists of gold and predicted class IDs, one pair per evaluated rumor
true_labels = []
predicted_labels = []

# Mapping from stance label text to the numerical class ID used for scoring
label_mapping_reverse = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}

# Index the gold labels by rumor ID once, instead of scanning `preprocessed_data` for
# every rumor. setdefault keeps the FIRST record for a duplicated ID, which matches the
# first-match behaviour of the linear scan this replaces. A record without a 'label'
# key is treated as "no true label" and skipped with a warning below.
true_label_by_rumor_id = {}
for item in preprocessed_data:
    true_label_by_rumor_id.setdefault(item['id'], item.get('label'))

# NOTE(review): `stance_predictor` is not created in any cell visible in this notebook
# section; on a fresh kernel this cell raises NameError unless it is defined earlier.

# Loop through all rumor entries and their relevant timelines
for rumor_entry in relevant_timeline_entries:
    rumor_text = rumor_entry['rumor_text']
    relevant_timelines = rumor_entry['relevant_timelines']
    rumor_id = rumor_entry['rumor_id']

    # Get the true label by rumor ID
    true_label_text = true_label_by_rumor_id.get(rumor_id)

    if true_label_text is None:
        print(f"Warning: No true label found for rumor ID {rumor_id}")
        continue

    # Convert the true label to its numerical form (done before the prediction call so
    # an unexpected gold label fails before any model work is spent on this rumor)
    true_label_id = label_mapping_reverse[true_label_text]

    # Predict the stance for the rumor and retrieve relevant evidence
    # (the returned evidence is not used by the evaluation itself)
    final_stance, relevant_evidence = stance_predictor.predict_rumor_stance(
        rumor_text, relevant_timelines, threshold_t2=STANCE_THRESHOLD_T2
    )

    # Convert final stance to numerical ID
    predicted_label_id = label_mapping_reverse[final_stance]

    # Append the true and predicted labels for evaluation
    true_labels.append(true_label_id)
    predicted_labels.append(predicted_label_id)

# Metrics over an empty sample are meaningless; report that explicitly instead of
# passing empty lists to scikit-learn.
if not true_labels:
    raise ValueError("No rumors were evaluated; cannot compute evaluation metrics.")

# Calculate evaluation metrics (support-weighted average over the classes present;
# zero_division=0 scores undefined precision/recall as 0 instead of warning)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted', zero_division=0)
accuracy = accuracy_score(true_labels, predicted_labels)

# Output the evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


KeyboardInterrupt: 