In [1]:
import json
from data_loading import DataLoader_Data
from preprocessor import Preprocessor
from language_detector import LanguageDetector
from feature_extractor_ import FeatureExtractor


# Load the dataset
file_path = '/Users/alaaeddinalia/Desktop/thesis _Rumor_verifiction/Rumor_verification/data/raw/English_train.json'
data_loader = DataLoader_Data(file_path)
data = data_loader.load_data()

from language_detector import LanguageDetector
from preprocessor import Preprocessor
from data_cleaning import DatasetCleaner

# Clean the dataset
dataset_cleaner = DatasetCleaner()
cleaned_data = dataset_cleaner.remove_irrelevant_rumors(data)


# Detect language 
language_detector = LanguageDetector()
detected_language = language_detector.detect_language(cleaned_data[0]['rumor'])


preprocessor = Preprocessor(
    language=detected_language,
    remove_urls=False,
    remove_noise_words=False,
    remove_special_characters=False,
    apply_lemmatization=False
)

preprocessed_data = []

for item in cleaned_data:
    
    preprocessed_item = item.copy()
    
    # Preprocess the rumor
    preprocessed_item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    
    # Preprocess each timeline entry's text
    for i, timeline_entry in enumerate(item['timeline']):
        preprocessed_item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])
    
    
    preprocessed_data.append(preprocessed_item)


from feature_extractor_ import FeatureExtractor
import numpy as np


extractor = FeatureExtractor(method='sbert', sbert_model_name='paraphrase-multilingual-MiniLM-L12-v2', batch_size=16)


all_texts = [item['rumor'] for item in preprocessed_data] + \
            [timeline_entry[2] for item in preprocessed_data for timeline_entry in item['timeline']]

vectors = extractor.fit_transform(all_texts)

index = 0
for item in preprocessed_data:
    item['rumor_vector'] = vectors[index].tolist()
    index += 1
    for timeline_entry in item['timeline']:
        timeline_entry.append(vectors[index].tolist())
        index += 1




In [2]:
def compare_with_evidence(relevant_timelines, evidence):
    true_positives = []
    false_negatives = []
    

    retrieved_ids = set([entry['timeline_id'] for entry in relevant_timelines])
    evidence_ids = set([entry[1] for entry in evidence])

    # True positives: timeline entries in both the retrieved timelines and evidence
    true_positives = retrieved_ids.intersection(evidence_ids)

    # False negatives: evidence entries that were missed in the retrieved timelines
    false_negatives = evidence_ids.difference(retrieved_ids)

    return true_positives, false_negatives


In [3]:
from relevant_timeline_retriever import RelevantTimelineRetriever
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd 


retriever_iqr = RelevantTimelineRetriever(thresholding_technique='iqr', use_semi_quartile=True)
retriever_mean_shift = RelevantTimelineRetriever(thresholding_technique='mean-shift', bandwidth=0.2)
retriever_game_theory = RelevantTimelineRetriever(thresholding_technique='game-theory')


aggregated_results = {
    'IQR': {'True Positives (TP)': 0, 'False Negatives (FN)': 0, 'Total Evidence': 0},
    'Mean Shift': {'True Positives (TP)': 0, 'False Negatives (FN)': 0, 'Total Evidence': 0},
    'Game Theory': {'True Positives (TP)': 0, 'False Negatives (FN)': 0, 'Total Evidence': 0},
}

#
for idx, rumor_data in enumerate(preprocessed_data):

    # Calculate similarities
    similarities = retriever_iqr.calculate_similarities([rumor_data]) 

    # Retrieve relevant timelines using different techniques
    relevant_timelines_iqr = retriever_iqr.retrieve_relevant_timelines(similarities, [rumor_data])
    relevant_timelines_mean_shift = retriever_mean_shift.retrieve_relevant_timelines(similarities, [rumor_data])
    relevant_timelines_game_theory = retriever_game_theory.retrieve_relevant_timelines(similarities, [rumor_data])

    # Evidence for the current rumor
    evidence_for_rumor = rumor_data['evidence']
    total_evidence_for_rumor = len(evidence_for_rumor)

    # IQR Technique
    tp_iqr, fn_iqr = compare_with_evidence(relevant_timelines_iqr[0]['relevant_timelines'], evidence_for_rumor)
    aggregated_results['IQR']['True Positives (TP)'] += len(tp_iqr)
    aggregated_results['IQR']['False Negatives (FN)'] += len(fn_iqr)
    aggregated_results['IQR']['Total Evidence'] += total_evidence_for_rumor

    # Mean Shift Technique
    tp_mean_shift, fn_mean_shift = compare_with_evidence(relevant_timelines_mean_shift[0]['relevant_timelines'], evidence_for_rumor)
    aggregated_results['Mean Shift']['True Positives (TP)'] += len(tp_mean_shift)
    aggregated_results['Mean Shift']['False Negatives (FN)'] += len(fn_mean_shift)
    aggregated_results['Mean Shift']['Total Evidence'] += total_evidence_for_rumor

    # Game Theory Technique
    tp_game_theory, fn_game_theory = compare_with_evidence(relevant_timelines_game_theory[0]['relevant_timelines'], evidence_for_rumor)
    aggregated_results['Game Theory']['True Positives (TP)'] += len(tp_game_theory)
    aggregated_results['Game Theory']['False Negatives (FN)'] += len(fn_game_theory)
    aggregated_results['Game Theory']['Total Evidence'] += total_evidence_for_rumor


final_results = pd.DataFrame(aggregated_results).T.reset_index().rename(columns={'index': 'Approach'})

# Display table
print(final_results)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


      Approach  True Positives (TP)  False Negatives (FN)  Total Evidence
0          IQR                  240                    36             276
1   Mean Shift                  270                     6             276
2  Game Theory                  155                   121             276
