In [1]:
pip install psycopg2-binary scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import json
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Locate the repository root: start at the resolved working directory and walk
# towards the filesystem root, stopping at the first directory named "TrueLens".
# If no such directory exists, REPO_ROOT ends up at the filesystem root.
REPO_ROOT = Path(os.getcwd()).resolve()
for _ancestor in (REPO_ROOT, *REPO_ROOT.parents):
    REPO_ROOT = _ancestor
    if _ancestor.name == "TrueLens":
        break

# Input and output locations, expressed relative to the repository root.
ARTICLES_DIR = REPO_ROOT.joinpath("news_filtered_data/news_source_data/data/articles")
OUTPUT_FILE = REPO_ROOT.joinpath("news_filtered_data/news_source_data/data/grouped_articles.json")

def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty string counts as ASCII.
    """
    for ch in s:
        if ord(ch) >= 128:
            # One non-ASCII character is enough to decide.
            return False
    return True

def load_articles(directory):
    """Load article JSON files under ``directory`` and keep the usable ones.

    Files are visited recursively in sorted path order, so repeated runs see
    the articles in the same sequence. An article is kept only when its joined
    body text is non-empty and its title is a pure-ASCII string.

    Files that cannot be read or parsed, whose top level is not a JSON object,
    or that have body text but lack ``url``/``title``/``ut`` are skipped with a
    printed notice instead of aborting the whole load.

    Args:
        directory: Root folder (str or Path) searched recursively for
            ``*.json`` files.

    Returns:
        list[dict]: One dict per kept article with the keys ``url``, ``title``,
        ``ut`` and ``body_paragraphs`` (the paragraphs joined by single spaces
        and stripped of surrounding whitespace).
    """
    articles = []

    # rglob yields entries in filesystem order; sorting makes the result reproducible.
    for file in sorted(Path(directory).rglob("*.json")):
        try:
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {file}: {exc}")
            continue

        if not isinstance(data, dict):
            print(f"Skipping {file}: expected a JSON object.")
            continue

        # Tolerate a null body and a body stored as one plain string.
        paragraphs = data.get("body_paragraphs") or []
        if isinstance(paragraphs, str):
            paragraphs = [paragraphs]
        text = " ".join(paragraphs).strip()  # Combine and clean text

        # Skip empty articles
        if not text:
            continue

        missing = [key for key in ("url", "title", "ut") if key not in data]
        if missing:
            print(f"Skipping {file}: missing field(s) {', '.join(missing)}.")
            continue

        # Skip non-ASCII titles; a non-string title is treated the same way.
        title = data["title"]
        if not isinstance(title, str) or not is_ascii(title):
            continue

        articles.append({
            "url": data["url"],
            "title": title,
            "ut": data["ut"],
            "body_paragraphs": text
        })

    return articles

def cluster_articles(articles, num_clusters=5):
    """Assign each article a ``cluster_id`` using TF-IDF features and KMeans.

    The article dicts are updated in place with an integer ``cluster_id`` and
    the same list is returned.

    Args:
        articles: List of dicts, each with a ``body_paragraphs`` text entry.
        num_clusters: Number of KMeans clusters to fit.

    Returns:
        The ``articles`` list, each entry now carrying ``cluster_id``.

    Raises:
        ValueError: If every body text is empty, or if there are fewer
            articles than requested clusters.
    """
    texts = []
    for entry in articles:
        texts.append(entry["body_paragraphs"])

    # Nothing to vectorise when every document is empty (or there are none).
    if all(not body for body in texts):
        raise ValueError("No valid articles found for clustering.")

    tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
    features = tfidf.fit_transform(texts)

    n_docs = features.shape[0]
    if n_docs < num_clusters:
        raise ValueError(f"Not enough valid articles ({n_docs}) for {num_clusters} clusters. Reduce the cluster count.")

    model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(features)

    # Labels come back in input order, so pair them with the articles directly.
    for entry, label in zip(articles, labels):
        entry["cluster_id"] = int(label)

    return articles

def save_grouped_articles(articles, output_file):
    """Group articles by ``cluster_id`` and write the groups to a JSON file.

    The file holds a single JSON object that maps each cluster id to the list
    of its articles, with clusters and articles in first-seen order. JSON
    object keys are strings, so an integer id such as ``0`` is written as
    ``"0"``.

    Args:
        articles: Iterable of article dicts, each with a ``cluster_id`` key.
        output_file: Destination path (str or Path). Missing parent
            directories are created.

    Raises:
        KeyError: If an article has no ``cluster_id``.
    """
    grouped_articles = {}
    for article in articles:
        grouped_articles.setdefault(article["cluster_id"], []).append(article)

    output_path = Path(output_file)
    # Without this, open() raises FileNotFoundError when the folder is absent.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(grouped_articles, f, indent=2)

# Run the pipeline: load the articles, cluster them, then write the groups to disk.
articles = load_articles(ARTICLES_DIR)

if not articles:
    print("No valid articles found. Process terminated.")
else:
    # Never request more clusters than there are articles.
    cluster_count = min(5, len(articles))
    clustered_articles = cluster_articles(articles, num_clusters=cluster_count)
    print("Clustering Completed.")
    save_grouped_articles(clustered_articles, OUTPUT_FILE)
    print(f"Grouped articles saved to {OUTPUT_FILE}.")

Clustering Completed.
Grouped articles saved to D:\sdgpTruelens\TrueLens\news_filtered_data\news_source_data\data\grouped_articles.json.
