In [None]:
import os
import json
from langdetect import detect
from deep_translator import GoogleTranslator
import trafilatura

In [3]:
# Destination folder for the per-article "*_parsed.json" files.
output_dir = "parsed_articles"
# Folder scanned for input "*.json" files; "./" is the current working directory.
input_dir = "./"

# Create the destination up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(output_dir, exist_ok=True)

In [5]:
def detect_language(text):
    """Return the language code that ``langdetect.detect`` reports for ``text``.

    Args:
        text: The text to classify.

    Returns:
        The value returned by ``detect(text)``, or the sentinel string
        ``"unknown"`` when ``text`` is empty, whitespace-only, not a string,
        or detection raises.
    """
    # Nothing to classify: answer directly instead of relying on detect()
    # raising for input that carries no signal.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort by design, but catch Exception rather than using a bare
        # except so KeyboardInterrupt / SystemExit still stop the notebook.
        return "unknown"

In [None]:
def _chunk_text(text, max_chars):
    """Split ``text`` into consecutive pieces of at most ``max_chars`` characters.

    Pieces break on line boundaries where possible; a single line longer than
    ``max_chars`` is cut at the limit. Concatenating the pieces yields ``text``.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        # Flush the running piece if this line would push it over the limit.
        if current and len(current) + len(line) > max_chars:
            chunks.append(current)
            current = ""
        # A single over-long line cannot be kept whole; hard-split it.
        while len(line) > max_chars:
            chunks.append(line[:max_chars])
            line = line[max_chars:]
        current += line
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, source_lang, max_chars=4500):
    """Translate ``text`` from ``source_lang`` into English.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``. ``"en"`` returns ``text``
            unchanged; ``"unknown"`` (the sentinel ``detect_language`` returns)
            asks the translator to auto-detect the source language.
        max_chars: Maximum number of characters sent per translation request.
            The translator rejects requests of 5000 or more characters, so
            longer texts are translated piece by piece and re-joined with
            newlines.

    Returns:
        The English translation, ``text`` itself when it is already English,
        or ``""`` when there is nothing to translate or translation fails.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if source_lang == "en":
        return text
    if not isinstance(text, str) or not text.strip():
        return ""

    # "unknown" is not a language code the translator accepts; fall back to
    # its automatic source-language detection instead of failing outright.
    source = "auto" if source_lang == "unknown" else source_lang
    chunks = _chunk_text(text, max_chars)

    try:
        translator = GoogleTranslator(source=source, target="en")
        translated_parts = [
            translator.translate(chunk) or ""
            for chunk in chunks
            if chunk.strip()  # whitespace-only pieces have nothing to translate
        ]
    except Exception as exc:
        # Stay best-effort (callers expect "" on failure) but report the
        # failure rather than hiding it behind an empty string.
        print(f"Translation failed ({source_lang} -> en): {exc}")
        return ""
    return "\n".join(translated_parts)

In [None]:
# For every "*.json" file in input_dir: load its list of article records,
# download and extract each article's text, detect its language, translate it
# to English, and write one "<file>_<index>_parsed.json" into output_dir.
# Sorted so repeated runs process files in the same order.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_dir, filename),
              "r",
              encoding="utf-8") as f:
        try:
            articles = json.load(f)
        except json.JSONDecodeError as exc:
            # input_dir may hold unrelated or broken JSON; skip rather than abort.
            print(f"Skipping {filename}: not valid JSON ({exc})")
            continue

    # Only a list of article dicts is meaningful here; any other top-level
    # JSON value would break on the .get() calls below.
    if not isinstance(articles, list):
        print(f"Skipping {filename}: expected a JSON list of articles")
        continue

    base = os.path.splitext(filename)[0]

    # enumerate() gives each record its true position. Looking the position up
    # with articles.index(article) is a linear scan per article and returns the
    # first *equal* record, so duplicate entries would share one output file.
    for index, article in enumerate(articles):
        if not isinstance(article, dict):
            continue

        url = article.get("url")
        if not url:
            continue

        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            continue

        full_text = trafilatura.extract(downloaded,
                                        include_comments=False,
                                        include_tables=False)
        if not full_text:
            continue

        lang = detect_language(full_text)
        translated = translate_text(full_text, lang)

        output_data = {
            "title": article.get("title"),
            # NOTE(review): the input key really is "published date" (with a
            # space) — presumably the upstream feed's schema; confirm.
            "published_date": article.get("published date"),
            "url": url,
            "original_language": lang,
            "original_text": full_text,
            "translated_text_en": translated
        }

        out_filename = f"{base}_{index}_parsed.json"
        with open(os.path.join(output_dir, out_filename),
                  "w",
                  encoding="utf-8") as f_out:
            json.dump(output_data, f_out, ensure_ascii=False, indent=2)

        print(f"Parsed and saved: {out_filename}")

Parsed and saved: au_articles_0_parsed.json
Parsed and saved: au_articles_1_parsed.json
Parsed and saved: au_articles_2_parsed.json
Parsed and saved: au_articles_3_parsed.json
Parsed and saved: au_articles_4_parsed.json
Parsed and saved: au_articles_5_parsed.json
Parsed and saved: au_articles_6_parsed.json
Parsed and saved: au_articles_7_parsed.json
Parsed and saved: au_articles_8_parsed.json
Parsed and saved: au_articles_9_parsed.json
Parsed and saved: au_articles_10_parsed.json
Parsed and saved: au_articles_11_parsed.json
Parsed and saved: au_articles_12_parsed.json
Parsed and saved: au_articles_13_parsed.json
Parsed and saved: au_articles_14_parsed.json
Parsed and saved: au_articles_15_parsed.json
Parsed and saved: au_articles_16_parsed.json
Parsed and saved: au_articles_17_parsed.json
Parsed and saved: au_articles_18_parsed.json
Parsed and saved: au_articles_19_parsed.json
Parsed and saved: au_articles_20_parsed.json
Parsed and saved: au_articles_21_parsed.json
Parsed and saved: au