In [None]:
# Install the notebook's third-party dependencies with the explicit %pip magic,
# so the cell does not depend on IPython's automagic resolving a bare `pip`.
# NOTE(review): psycopg2-binary is not imported in the cells visible here - confirm it is still needed.
%pip install psycopg2-binary scikit-learn numpy

In [1]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [8]:
# Path to the articles directory.
# Set the TRUELENS_ARTICLES_DIR environment variable to point the notebook at
# another location; the machine-specific path below is only the fallback.
ARTICLES_DIR = os.environ.get(
    "TRUELENS_ARTICLES_DIR",
    "D:/TrueLensSdgp/TrueLens/news_filtered_data/news_source_data/data/articles",
)

def load_articles(directory):
    """Load non-empty articles from the JSON files under ``directory``.

    Walks ``directory`` recursively and parses every file whose name ends in
    ``.json``. The entries of each file's ``"body_paragraphs"`` list are joined
    with single spaces and stripped; files whose resulting text is empty, or
    whose ``"body_paragraphs"`` is missing or null, are skipped.

    Args:
        directory: Root folder to search for article files.

    Returns:
        A list of dicts with the keys ``"url"``, ``"title"`` and ``"ut"``
        (copied from the file) and ``"body_paragraphs"`` (the joined text).
        Empty when ``directory`` does not exist or holds no usable article.

    Raises:
        KeyError: If an article with body text lacks ``"url"``, ``"title"``
            or ``"ut"``.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    import warnings

    if not os.path.isdir(directory):
        # os.walk() yields nothing for a missing path, so without this check a
        # mistyped directory is indistinguishable from an empty data set.
        warnings.warn(f"Articles directory not found: {directory!r}", stacklevel=2)
        return []

    articles = []

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".json"):
                continue

            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                data = json.load(f)

            # A missing or null "body_paragraphs" counts as an empty article
            # instead of aborting the whole load.
            text = " ".join(data.get("body_paragraphs") or []).strip()

            # Skip empty articles
            if text:
                articles.append({
                    "url": data["url"],
                    "title": data["title"],
                    "ut": data["ut"],
                    "body_paragraphs": text
                })

    return articles

# Smoke test: run the loader on ARTICLES_DIR and report how many articles came back
articles = load_articles(directory=ARTICLES_DIR)
print(f"Loaded {len(articles)} articles.")


Loaded 0 articles.


In [9]:
# Path to the articles directory.
# Set the TRUELENS_ARTICLES_DIR environment variable to point the notebook at
# another location; the machine-specific path below is only the fallback.
ARTICLES_DIR = os.environ.get(
    "TRUELENS_ARTICLES_DIR",
    "D:/TrueLensSdgp/TrueLens/news_filtered_data/news_source_data/data/articles",
)

def load_articles(directory):
    """Collect every article with body text from the JSON files below ``directory``.

    Each ``.json`` file found by a recursive walk is parsed; its
    ``"body_paragraphs"`` entries (an empty list when the key is absent) are
    joined with spaces and stripped. Files with no remaining text are ignored.

    Returns a list of dicts holding ``"url"``, ``"title"`` and ``"ut"`` from
    the file plus the joined text under ``"body_paragraphs"``. A kept article
    that lacks one of the three copied keys raises ``KeyError``.
    """
    loaded = []

    for folder, _subdirs, filenames in os.walk(directory):
        json_names = [name for name in filenames if name.endswith(".json")]

        for name in json_names:
            with open(os.path.join(folder, name), "r", encoding="utf-8") as handle:
                record = json.load(handle)

            paragraphs = record.get("body_paragraphs", [])
            body = " ".join(paragraphs).strip()
            if not body:
                # Nothing to keep for an article without text
                continue

            # Copy the metadata fields in a fixed order, then attach the text
            entry = {field: record[field] for field in ("url", "title", "ut")}
            entry["body_paragraphs"] = body
            loaded.append(entry)

    return loaded

# Smoke test: run the loader on ARTICLES_DIR and report how many articles came back
articles = load_articles(directory=ARTICLES_DIR)
print(f"Loaded {len(articles)} articles successfully.")


Loaded 0 articles successfully.


In [10]:
def load_articles(directory):
    """Load non-empty articles with ASCII-only titles from JSON files.

    Walks ``directory`` recursively and parses every file whose name ends in
    ``.json``. The entries of ``"body_paragraphs"`` are joined with single
    spaces and stripped. An article is kept only when that text is non-empty
    and its ``"title"`` is a string made up solely of ASCII characters.

    Args:
        directory: Root folder to search for article files.

    Returns:
        A list of dicts with the keys ``"url"``, ``"title"`` and ``"ut"``
        (copied from the file) and ``"body_paragraphs"`` (the joined text).

    Raises:
        KeyError: If an article with body text lacks ``"title"``, or a kept
            article lacks ``"url"`` or ``"ut"``.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    articles = []

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".json"):
                continue

            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                data = json.load(f)

            # A missing or null "body_paragraphs" counts as an empty article
            # instead of aborting the whole load.
            text = " ".join(data.get("body_paragraphs") or []).strip()
            if not text:
                continue

            title = data["title"]
            # The ASCII check is done inline with str.isascii(): the is_ascii()
            # helper previously called here is not defined in the cells shown,
            # so on a fresh kernel it would raise NameError for the first
            # article that has body text. Non-string titles are skipped.
            if isinstance(title, str) and title.isascii():
                articles.append({
                    "url": data["url"],
                    "title": title,
                    "ut": data["ut"],
                    "body_paragraphs": text
                })

    return articles

# Smoke test: run the loader on ARTICLES_DIR and report how many articles came back
articles = load_articles(directory=ARTICLES_DIR)
print(f"Loaded {len(articles)} articles successfully.")

Loaded 0 articles successfully.


In [12]:
def load_articles(directory):
    """Load non-empty articles with ASCII-only titles from JSON files.

    Walks ``directory`` recursively, in sorted order, and parses every file
    whose name ends in ``.json``. The entries of ``"body_paragraphs"`` are
    joined with single spaces and stripped. An article is kept only when that
    text is non-empty and its title is a string of ASCII characters; a missing
    or null title is treated as the empty string and therefore kept.

    Args:
        directory: Root folder to search for article files.

    Returns:
        A list of dicts with the keys ``"url"``, ``"title"``, ``"ut"``
        (``""`` when absent from the file) and ``"body_paragraphs"`` (the
        joined text). Empty when ``directory`` does not exist or holds no
        usable article.

    Raises:
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    import warnings

    if not os.path.isdir(directory):
        # os.walk() yields nothing for a missing path, so without this check a
        # mistyped directory is indistinguishable from an empty data set.
        warnings.warn(f"Articles directory not found: {directory!r}", stacklevel=2)
        return []

    articles = []

    for root, dirs, files in os.walk(directory):
        # Sorting the sub-directories in place and the file names makes the
        # result order independent of the filesystem, so downstream steps that
        # depend on row order see the same input on every run.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                data = json.load(f)

            text = " ".join(data.get("body_paragraphs") or []).strip()
            if not text:
                continue

            title = data.get("title") or ""
            # The ASCII check is done inline with str.isascii(): the is_ascii()
            # helper previously called here is not defined in the cells shown,
            # so on a fresh kernel it would raise NameError for the first
            # article that has body text. Non-string titles are skipped.
            if isinstance(title, str) and title.isascii():
                articles.append({
                    "url": data.get("url", ""),
                    "title": title,
                    "ut": data.get("ut", ""),
                    "body_paragraphs": text
                })

    return articles

# Smoke test: run the loader on ARTICLES_DIR and report how many articles came back
articles = load_articles(directory=ARTICLES_DIR)
print(f"Loaded {len(articles)} articles successfully.")


Loaded 0 articles successfully.


In [13]:
def cluster_articles(articles, num_clusters=5):
    """Group articles into ``num_clusters`` clusters by their body text.

    The ``"body_paragraphs"`` text of every article is vectorised with TF-IDF
    (English stop words removed, at most 5000 features) and the resulting
    matrix is partitioned with KMeans (``random_state=42``, ``n_init=10``).

    Args:
        articles: Sequence of dicts, each with a ``"body_paragraphs"`` entry.
        num_clusters: Number of clusters to form.

    Returns:
        The same ``articles`` object; each dict is updated in place with an
        integer ``"cluster_id"``.

    Raises:
        ValueError: If no article has any body text, or if there are fewer
            articles than ``num_clusters``.
    """
    corpus = []
    for entry in articles:
        corpus.append(entry["body_paragraphs"])

    # Refuse to vectorise when every document is empty
    if all(not document for document in corpus):
        raise ValueError("No valid articles found for clustering.")

    tfidf_matrix = TfidfVectorizer(stop_words="english", max_features=5000).fit_transform(corpus)

    # One matrix row per article; KMeans needs at least one row per cluster
    row_count = tfidf_matrix.shape[0]
    if row_count < num_clusters:
        raise ValueError(f"Not enough valid articles ({row_count}) for {num_clusters} clusters. Reduce the cluster count.")

    labels = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit_predict(tfidf_matrix)

    # Record each article's label as a plain int
    for entry, label in zip(articles, labels):
        entry["cluster_id"] = int(label)

    return articles

# Smoke test: cluster only when the loader produced at least one article,
# capping the cluster count at the number of articles available
if not articles:
    print("No valid articles to cluster.")
else:
    clustered_articles = cluster_articles(articles, num_clusters=min(5, len(articles)))
    print("Clustering completed successfully.")


No valid articles to cluster.
