In [1]:
import json
import os
from collections import defaultdict
import statistics

In [2]:
# File names used by this notebook (both are relative paths to single JSON
# files, not directories).
# INPUT_FILE is opened for reading and parsed as JSON by the loading cell;
# presumably it is produced by an upstream classification step — TODO confirm.
INPUT_FILE = "classified_articles.json"
# OUTPUT_FILE is overwritten with the aggregated per-country statistics.
OUTPUT_FILE = "aggregated_statistics.json"

In [3]:
# Load the articles: read the whole input file as UTF-8 text and parse it as JSON.
# The processing cell iterates over `articles` and calls `.get(...)` on every
# element, so the file is expected to hold a list of objects.
with open(INPUT_FILE, encoding="utf-8") as src:
    articles = json.loads(src.read())

In [4]:
# Aggregation structures
def _empty_country_stats():
    """Return a fresh, independent accumulator for one country.

    Keys:
        total_articles: running count of articles seen for the country.
        theme_counts:   per-theme counters (missing themes start at 0).
        sentiments:     list that collects one numeric score per article.
    """
    accumulator = {}
    accumulator["total_articles"] = 0
    accumulator["theme_counts"] = defaultdict(int)
    accumulator["sentiments"] = []
    return accumulator


# Maps country -> accumulator; an entry is created on first access.
country_stats = defaultdict(_empty_country_stats)

In [5]:
# Numeric scale for the sentiment labels: +1 positive, 0 neutral, -1 negative.
# The per-country average sentiment is the mean of these scores.
sentiment_scores = {"positive": 1, "neutral": 0, "negative": -1}

In [6]:
# Process the articles: accumulate, per country, the number of articles,
# one numeric sentiment score per article, and per-theme article counts.

# Start from empty accumulators so that re-running this cell does not add the
# same articles a second time (the defaultdict keeps its factory after clear()).
country_stats.clear()

for article in articles:
    # `or` covers keys that are present but null/empty as well as missing keys,
    # so such articles land in the same "unknown" bucket instead of a None key.
    country = article.get("country") or "unknown"
    sentiment = article.get("sentiment", "neutral")
    themes = article.get("themes") or []
    if isinstance(themes, str):
        # A bare string would otherwise be iterated character by character.
        themes = [themes]

    entry = country_stats[country]
    entry["total_articles"] += 1
    # Labels that are not in sentiment_scores are scored as 0 (neutral).
    entry["sentiments"].append(sentiment_scores.get(sentiment, 0))
    # Count every theme at most once per article: the counts are later divided
    # by the number of articles, so duplicates would push a share above 1.0.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for theme in dict.fromkeys(themes):
        entry["theme_counts"][theme] += 1

In [7]:
# Final per-country summary
def _summarise_country(entry):
    """Turn one country's raw accumulator into its summary record.

    Returns a dict with the article count, the mean sentiment score rounded
    to 3 decimals (0.0 when no scores were collected), and each theme's count
    divided by the article count, also rounded to 3 decimals.
    """
    n_articles = entry["total_articles"]
    scores = entry["sentiments"]

    if scores:
        mean_score = round(statistics.mean(scores), 3)
    else:
        mean_score = 0.0

    # Convert the theme counters into shares of the country's articles.
    shares = {}
    for theme, hits in entry["theme_counts"].items():
        shares[theme] = round(hits / n_articles, 3)

    return {
        "total_articles": n_articles,
        "average_sentiment": mean_score,
        "theme_distribution": shares,
    }


aggregated = {
    name: _summarise_country(entry)
    for name, entry in country_stats.items()
}

In [8]:
# Save the result: serialise the summary as indented JSON, keeping non-ASCII
# characters as-is instead of \u escapes, and overwrite the output file.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    out.write(json.dumps(aggregated, indent=4, ensure_ascii=False))

print("Агрегация завершена. Результаты сохранены в", OUTPUT_FILE)

Агрегация завершена. Результаты сохранены в aggregated_statistics.json
