In [None]:
pip install psycopg2-binary scikit-learn numpy

In [None]:
pip install sentence-transformers

In [None]:
pip install hdbscan-0.8.29-cp313-cp313-win_amd64.whl

In [None]:
pip install bertopic

In [21]:
import os
import json
import numpy as np
from pathlib import Path
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Locate the repository root: the nearest directory named "TrueLens" at or
# above the current working directory.
REPO_ROOT = Path(os.getcwd()).resolve()
# The candidate tuple is built once, before the loop rebinds REPO_ROOT.
# If no candidate matches, the loop runs to completion and REPO_ROOT is left
# at the last candidate, i.e. the filesystem root.
for REPO_ROOT in (REPO_ROOT, *REPO_ROOT.parents):
    if REPO_ROOT.name == "TrueLens":
        break

# Input and output locations, expressed relative to the repository root
ARTICLES_DIR = REPO_ROOT / "news_filtered_data/news_source_data/data/articles"
OUTPUT_FILE = REPO_ROOT / "news_filtered_data/news_source_data/data/grouped_articles.json"

def is_ascii(s):
    """Return True if the string ``s`` contains only ASCII characters.

    An empty string is considered ASCII.
    """
    # str.isascii() is the built-in form of "every code point is below 128".
    return s.isascii()

def load_articles(directory):
    """Load articles from the JSON files found recursively under ``directory``.

    A file contributes an article only when all of the following hold:

    * it parses as a UTF-8 JSON object;
    * its ``body_paragraphs`` joins to a non-empty text;
    * it has ``url``, ``title`` and ``ut`` fields, and ``title`` is an
      ASCII-only string.

    Files that cannot be parsed or that lack required fields are skipped with
    a printed message instead of aborting the whole load. Articles with an
    empty body or a non-ASCII title are skipped silently.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``ut``,
        ``body_paragraphs`` (the paragraphs joined into one string),
        ``outlet`` (defaults to ``"unknown"``) and ``reporter`` (defaults to
        ``None``).
    """
    articles = []

    # Sorted so the article order does not depend on filesystem enumeration.
    for file in sorted(Path(directory).rglob("*.json")):
        try:
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            print(f"Skipping {file}: not valid UTF-8 JSON ({exc})")
            continue

        if not isinstance(data, dict):
            print(f"Skipping {file}: expected a JSON object")
            continue

        # Tolerate a null/missing body and a body stored as a single string
        # (joining a bare string would otherwise space out its characters).
        paragraphs = data.get("body_paragraphs")
        if isinstance(paragraphs, str):
            paragraphs = [paragraphs]
        elif not isinstance(paragraphs, list):
            paragraphs = []
        text = " ".join(p for p in paragraphs if isinstance(p, str)).strip()

        # Skip empty articles
        if not text:
            continue

        missing = [key for key in ("url", "title", "ut") if key not in data]
        if missing:
            print(f"Skipping {file}: missing required field(s) {missing}")
            continue

        title = data["title"]
        if not isinstance(title, str):
            print(f"Skipping {file}: title is not a string")
            continue

        # Skip non-ASCII titles
        if not is_ascii(title):
            continue

        articles.append({
            "url": data["url"],
            "title": title,
            "ut": data["ut"],
            "body_paragraphs": text,
            "outlet": data.get("outlet", "unknown"),
            "reporter": data.get("reporter", None)
        })

    return articles

def cluster_articles(articles, num_clusters=5):
    """Assign a BERTopic topic id to every article.

    Each article's ``body_paragraphs`` text is embedded with the
    ``all-MiniLM-L6-v2`` SentenceTransformer model, and a BERTopic model
    created with ``nr_topics=num_clusters`` is fitted on those texts and
    embeddings. The resulting topic id is written to ``article["cluster_id"]``
    as an ``int``.

    Args:
        articles: List of article dicts, each with a ``body_paragraphs`` key.
        num_clusters: Value passed to BERTopic as ``nr_topics``.

    Returns:
        The same ``articles`` list, modified in place.

    Raises:
        ValueError: If no article has a non-empty body (including when
            ``articles`` is empty).
    """
    texts = [entry["body_paragraphs"] for entry in articles]

    # Nothing to cluster when every body is empty (or there are no articles)
    if all(not body for body in texts):
        raise ValueError("No valid articles found for clustering.")

    # Embed the texts with a pre-trained sentence encoder
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = encoder.encode(texts, show_progress_bar=True)

    # Fit BERTopic on the texts, reusing the precomputed embeddings
    assignments, _ = BERTopic(nr_topics=num_clusters).fit_transform(texts, vectors)

    # fit_transform yields one topic id per input document, in input order
    for entry, topic in zip(articles, assignments):
        entry["cluster_id"] = int(topic)

    return articles

def save_grouped_articles(articles, output_file):
    """Group articles by ``cluster_id``, write the groups to JSON, return them.

    Args:
        articles: Iterable of article dicts, each with a ``cluster_id`` key.
        output_file: Destination path of the JSON file. Missing parent
            directories are created.

    Returns:
        A dict mapping each cluster id to the list of its articles, in the
        order the articles were given. The returned dict keeps the original
        key type; in the written JSON file the keys are strings, because JSON
        object keys are always strings.
    """
    grouped_articles = {}
    for article in articles:
        grouped_articles.setdefault(article["cluster_id"], []).append(article)

    # Create the destination directory so a fresh checkout does not fail
    # with FileNotFoundError on the first run.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(grouped_articles, f, indent=2)

    return grouped_articles  # Return the dictionary

def create_reporter_and_outlet(article):
    """Build the reporter/outlet record for one article.

    When the article has no reporter (``None`` or any other falsy value), a
    synthetic ``system-<outlet>`` reporter name is used and the record is
    flagged with ``is_system=True``.

    Args:
        article: Dict with ``reporter`` and ``outlet`` keys.

    Returns:
        A dict with the keys ``reporter``, ``is_system`` and ``outlet``.
    """
    byline = article["reporter"]
    outlet = article["outlet"]

    # A missing byline is attributed to a per-outlet system account
    is_system = not byline

    return {
        "reporter": f"system-{outlet}" if is_system else byline,
        "is_system": is_system,
        "outlet": outlet
    }

def process_articles(articles):
    """Merge reporter and outlet information into every article.

    Each article dict is updated in place with the record returned by
    ``create_reporter_and_outlet`` for that article.

    Returns:
        The same ``articles`` object that was passed in.
    """
    for entry in articles:
        entry.update(create_reporter_and_outlet(entry))

    return articles

def create_story_and_calculate_factuality(grouped_articles):
    """Create one story per cluster and attach a factuality value to its articles.

    This currently runs against an in-function mock client, so nothing is
    persisted: the mock ``Story.create`` always returns a story with id 1 and
    the mock ``Article.update`` does nothing.

    Args:
        grouped_articles: Dict mapping a cluster id to a list of article
            dicts, each with a ``url`` key.

    Returns:
        None.
    """

    class MockTRPCClient:
        """Stand-in for a tRPC client, for demonstration purposes."""

        class Story:
            @staticmethod
            def create(data):
                # Echo the payload back with a fixed story id
                return {"id": 1, "title": data["title"], "articles": data["articles"]}

        class Article:
            @staticmethod
            def update(data):
                # No-op in the mock
                pass

    trpcClient = MockTRPCClient()

    for cluster_id, members in grouped_articles.items():
        # One story per cluster, referencing its articles by URL
        member_urls = [member["url"] for member in members]
        story = trpcClient.Story.create({
            "title": f"Story for cluster {cluster_id}",
            "articles": member_urls
        })

        # A single factuality value is computed per cluster
        factuality = calculate_factuality(members)

        # Link every article to its story and record the factuality value
        for member in members:
            payload = {
                "url": member["url"],
                "storyId": story["id"],
                "factuality": factuality
            }
            trpcClient.Article.update(payload)

def calculate_factuality(articles):
    """Return a factuality value for a group of articles.

    Placeholder: ``articles`` is ignored and a fixed value is returned.
    Replace with an actual implementation.
    """
    placeholder_score = 75
    return placeholder_score

# Run the pipeline: load -> cluster -> annotate -> group and save -> stories
articles = load_articles(ARTICLES_DIR)

if not articles:
    print("No valid articles found. Process terminated.")
else:
    # Never request more clusters than there are articles
    clustered_articles = cluster_articles(
        articles,
        num_clusters=min(5, len(articles)),
    )
    print("Clustering Completed.")

    processed_articles = process_articles(clustered_articles)

    # Keep the returned dict: it is the input for story creation below
    grouped_articles = save_grouped_articles(processed_articles, OUTPUT_FILE)
    print(f"Grouped articles saved to {OUTPUT_FILE}.")

    create_story_and_calculate_factuality(grouped_articles)
    print("Stories created and factuality calculated.")


Batches: 100%|██████████| 60/60 [00:43<00:00,  1.38it/s]


Clustering Completed.
Grouped articles saved to D:\sdgpTruelens\TrueLens\news_filtered_data\news_source_data\data\grouped_articles.json.
Stories created and factuality calculated.
