# Import Necessary Libraries


In [45]:
from data_loading import DataLoader_Data
from language_detector import LanguageDetector
from preprocessor import Preprocessor
from feature_extractor_ import FeatureExtractor
from similarity_calculator import SimilarityCalculator
from rumor_classifier import RumorClassifier
from evaluation import Evaluation
from sentiment_analyzer import SentimentAnalyzer
from ner_extractor import NERExtractor
import json
import torch
import numpy as np


# Load and Preprocess the Dataset

In [46]:
# Define file paths
file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/raw/English_train.json'
preprocessed_file_path = '/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/processed/English_train_preprocessed.json'


# Load the dataset
data_loader = DataLoader_Data(file_path)
language_detector = LanguageDetector()
preprocessor = Preprocessor(
    language=language_detector.detect_language(data_loader.data[0]['rumor']),
    remove_urls=True,
    remove_special_characters=True,
    remove_stopwords=True,
    remove_noise_words=True,
    remove_emojis=True,
    apply_stemming=True,
    apply_lemmatization=True
)

# Preprocess data
preprocessed_data = []
for item in data_loader.data:
    item['rumor'] = preprocessor.preprocess_text(item['rumor'])
    for i, timeline_entry in enumerate(item['timeline']):
        item['timeline'][i][2] = preprocessor.preprocess_text(timeline_entry[2])
    for j, evidence_entry in enumerate(item['evidence']):
        item['evidence'][j][2] = preprocessor.preprocess_text(evidence_entry[2])
    preprocessed_data.append(item)

# Save preprocessed data
with open(preprocessed_file_path, 'w') as f:
    json.dump(preprocessed_data, f, ensure_ascii=False, indent=4)


 # Perform Sentiment Analysis and Named Entity Recognition (NER)

In [47]:
# Initialize sentiment analyzer and NER extractor
sentiment_analyzer = SentimentAnalyzer()
ner_extractor = NERExtractor()

# Apply sentiment analysis and NER to the preprocessed data
for item in preprocessed_data:
    item['sentiment'] = sentiment_analyzer.analyze_sentiment(item['rumor'])
    item['entities'] = ner_extractor.extract_entities(item['rumor'])


# Feature Extraction using TF_IDF

In [48]:
# Initialize the FeatureExtractor with the TF-IDF method
extractor = FeatureExtractor(method='tfidf')

# Combine all texts for feature extraction
all_texts = [item['rumor'] for item in preprocessed_data] + \
            [timeline_entry[2] for item in preprocessed_data for timeline_entry in item['timeline']] + \
            [evidence_entry[2] for item in preprocessed_data for evidence_entry in item['evidence']]

# Extract TF-IDF vectors
vectors = extractor.fit_transform(all_texts)

# Ensure vectors are a numpy array or sparse matrix (as TF-IDF returns a sparse matrix by default)
if hasattr(vectors, 'toarray'):
    vectors = vectors.toarray()  # Convert sparse matrix to a dense array (optional, based on your needs)

# Check if vectors is a numpy array after conversion
if isinstance(vectors, np.ndarray):
    # Assign vectors to the data
    index = 0
    for item in preprocessed_data:
        item['rumor_vector'] = vectors[index].tolist()  # Assign the TF-IDF vector for the rumor
        index += 1
        
        # Assign vectors for timeline entries
        for timeline_entry in item['timeline']:
            timeline_entry.append(vectors[index].tolist())  # Add the vector to the timeline entry
            index += 1
        
        # Assign vectors for evidence entries
        for evidence_entry in item['evidence']:
            evidence_entry.append(vectors[index].tolist())  # Add the vector to the evidence entry
            index += 1
else:
    print(f"Error: Expected numpy array, but got {type(vectors)}")


In [49]:
rumor_vectors = vectors[:len(preprocessed_data)]  # Extract rumor vectors (first N entries)
print(rumor_vectors.shape)  # Print the shape of the rumor vectors

(96, 1000)


# Calculate Similarities

In [51]:
# Calculate similarities
similarity_calculator = SimilarityCalculator()
evidence_similarities = similarity_calculator.calculate_evidence_similarity(preprocessed_data, extractor)
timeline_similarities = similarity_calculator.calculate_timeline_similarity(preprocessed_data)

# Save similarity results
with open('/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/similarity/similarity_tfidf/English_train_evidence_similarity_results.json', 'w') as f:
    json.dump(evidence_similarities, f, ensure_ascii=False, indent=4)
with open('/Users/alaaeddinalia/Desktop/Bachelor_Arbeit_2/Rumor_verification/data/similarity/similarity_tfidf/English_train_timeline_similarity_results.json', 'w') as f:
    json.dump(timeline_similarities, f, ensure_ascii=False, indent=4)


 # Calculate averages for classification thresholds

In [52]:
avg_total=similarity_calculator.calculate_average_similarity(evidence_similarities),
avg_refutes=similarity_calculator.calculate_average_similarity([sim for sim in evidence_similarities if sim['label'] == "REFUTES"]),
avg_supports=similarity_calculator.calculate_average_similarity([sim for sim in evidence_similarities if sim['label'] == "SUPPORTS"]) 


 # Calculate averages for classification thresholds
 
 
classifier = RumorClassifier(
    avg_total=similarity_calculator.calculate_average_similarity(evidence_similarities),
    avg_refutes=similarity_calculator.calculate_average_similarity([sim for sim in evidence_similarities if sim['label'] == "REFUTES"]),
    avg_supports=similarity_calculator.calculate_average_similarity([sim for sim in evidence_similarities if sim['label'] == "SUPPORTS"])
)

print(avg_total, avg_refutes , avg_supports)


(0.030973366730171797,) (0.03360906624163678,) 0.02440079959398695


# Classify the rumors

In [53]:
predictions = classifier.classify(timeline_similarities, preprocessed_data)

# Evaluate the Model

In [54]:
# Evaluate with precision, recall, and F1
ground_truth_labels = [item['label'] for item in preprocessed_data]
evaluator = Evaluation(ground_truth_labels, predictions)
metrics = evaluator.all_metrics()

print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1 Score: {metrics['f1_score']:.4f}")


Precision: 0.4353
Recall: 0.4062
F1 Score: 0.2624


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
