In [1]:
import json
import os

import torch
from tqdm import tqdm
from transformers import pipeline

In [2]:
# === Zero-shot topic classifier ===
# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the notebook still runs on machines without a GPU.
topic_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)




In [3]:
# === Sentiment analyzer ===
# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the notebook still runs on machines without a GPU.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if torch.cuda.is_available() else -1,
)

In [4]:
# === Candidate topics for zero-shot classification ===
# Every article is scored against each of these labels.
candidate_labels = [
    "Politics",
    "Economy",
    "Military",
    "Health",
    "Technology",
    "Energy",
    "Diplomacy",
    "Environment",
    "Conflict",
    "Elections",
    "Crime",
    "Education",
    "Transport",
    "Culture",
]

In [5]:
# Directory the cleaned articles are read from, and directory the
# classified articles are written to.
input_folder, output_folder = "cleaned_articles", "classified_articles"

# Make sure the output directory exists; a no-op when it is already there.
os.makedirs(name=output_folder, exist_ok=True)

In [None]:
# Process every cleaned article: attach topic scores and sentence-level
# sentiment shares, then write the enriched article to the output folder.

cleaned_suffix = "_cleaned.json"
classified_suffix = "_classified.json"

# Select the article files up front so the progress bar total counts only real
# work, and sort them so the processing order is the same on every run
# (os.listdir returns entries in arbitrary order).
article_files = sorted(
    name for name in os.listdir(input_folder) if name.endswith(cleaned_suffix)
)

for filename in tqdm(article_files):
    filepath = os.path.join(input_folder, filename)
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            article = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Report and skip, so one bad file does not abort the whole run.
        print(f"Could not read {filename}: {e}")
        continue

    sentences = article.get("cleaned_sentences", [])

    if not sentences:
        print(f"Empty content in {filename}, skipping.")
        continue

    full_text = " ".join(sentences)

    # Topic classification: each candidate label is scored independently
    # (multi_label=True); the result is stored as a {label: score} mapping.
    try:
        topic_result = topic_classifier(full_text,
                                        candidate_labels,
                                        multi_label=True)
        article["topics"] = dict(zip(topic_result["labels"],
                                     topic_result["scores"]))
    except Exception as e:
        print(f"Topic classification failed for {filename}: {e}")
        article["topics"] = {}

    # Sentiment analysis: classify every sentence and store the share of each
    # label across the article.
    try:
        # truncation=True clips over-long sentences to the model's maximum
        # input length; without it a single over-long sentence can make the
        # call raise, which would blank the sentiment of the whole article.
        sentiments = sentiment_classifier(sentences,
                                          batch_size=32,
                                          truncation=True)
        total = len(sentiments)
        pos = sum(1 for s in sentiments if s["label"] == "POSITIVE")
        neg = sum(1 for s in sentiments if s["label"] == "NEGATIVE")
        # Any label other than POSITIVE/NEGATIVE is counted as neutral.
        # NOTE(review): the SST-2 checkpoint behind sentiment_classifier looks
        # binary, so "neutral" is presumably always 0 - confirm, or switch to
        # a three-class model. The key is kept for output compatibility.
        neu = total - pos - neg
        article["sentiment"] = {
            "positive": round(pos / total, 3),
            "negative": round(neg / total, 3),
            "neutral": round(neu / total, 3)
        }
    except Exception as e:
        print(f"Sentiment analysis failed for {filename}: {e}")
        article["sentiment"] = {}

    # Save the result. Only the trailing suffix is swapped, so a name that
    # happens to contain "_cleaned.json" elsewhere is left untouched there.
    output_name = filename[:-len(cleaned_suffix)] + classified_suffix
    output_path = os.path.join(output_folder, output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(article, f, ensure_ascii=False, indent=2)

  1%|▏         | 10/750 [00:14<18:36,  1.51s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 750/750 [17:56<00:00,  1.44s/it]
