In [2]:
import os
import json
import pandas as pd

# Directory holding the per-article JSON files. The default is the original
# local path; set the LANGSCRAPE_DATA_DIR environment variable to run the
# notebook against a different location without editing this cell.
DATA_DIR = os.environ.get(
    "LANGSCRAPE_DATA_DIR",
    "/Users/delmedigo/Dev/langtest/langscrape/data/production_21_10_2025/jsons",
)

# Maps each output column name (key) to the dotted path (value) that
# get_value() resolves inside a loaded JSON document. Insertion order is the
# column order of the exported table, so keep the entries in this order.
mapping_dict = {
    "Number": "meta_data.id",
    "Link": "meta_data.url",
    "Title": "summary.title",
    "Description*": "summary.summary",
    "Published Date": "summary.publication_date",
    "Event date from": "summary.event_start_date",
    "Event date to": "summary.event_end_date",
    "Platform": "summary.platform",
    "Author": "summary.author",
    "Source": "summary.source",
    # NOTE(review): reads the same path as "Platform" — confirm this is intended.
    "Reference": "summary.platform",
    "Language": "summary.language",
    # NOTE(review): reads the same path as "Locations (tag)" below — confirm.
    "Location": "summary.location_tags",
    "Type": "summary.type",
    "Media": "summary.media",
    "Theme (tag)": "summary.theme_tags",
    "Places & Organizations (tag)": "summary.countries_and_organizations_tags",
    "Locations (tag)": "summary.location_tags",
    "Figures (tag)": "summary.figures_tags",
}

def get_value(obj, path):
    """Resolve a dotted path such as ``'a.b'``, ``'a.b[0]'`` or ``'a[1][0]'``.

    Args:
        obj: Nested structure of dicts and lists (for example parsed JSON).
        path: Dot-separated keys; a key may be followed by one or more
            ``[index]`` suffixes to index into a list.

    Returns:
        The value found at ``path``. A list result is flattened into a single
        ``", "``-joined string. An empty string is returned when ``path`` is
        empty, a key is missing, an index is out of range, or an intermediate
        value cannot be traversed. A value stored as ``None`` is returned
        unchanged.
    """
    if not path:
        return ""
    current = obj
    try:
        for part in path.replace("]", "").split("."):
            # "key[1[0" (closing brackets already stripped) -> "key", ["1", "0"]
            key, *indices = part.split("[")
            if indices:
                current = current.get(key, [])
                for idx in indices:
                    current = current[int(idx)]
            else:
                current = current.get(key, "")
        if isinstance(current, list):
            return ", ".join(map(str, current))
        return current
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # Only lookup failures mean "value not present"; anything else is a
        # genuine error and is allowed to propagate.
        return ""

rows = []
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the exported table identical from run to run. The ".json" suffix check
# already excludes non-JSON files such as links.csv.
files = sorted(
    f for f in os.listdir(DATA_DIR)
    if f.endswith(".json")
)

# Build one table row per JSON file; files that cannot be read or parsed are
# reported and skipped so the remaining files still load.
for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        row = {
            col: (get_value(data, path_expr) if path_expr else "")
            for col, path_expr in mapping_dict.items()
        }
        rows.append(row)
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

# Collect the rows into a DataFrame and write it next to the input files.
df = pd.DataFrame(rows)
print(f"✅ Loaded {len(df)} records into DataFrame.")

results_path = os.path.join(DATA_DIR, "results_table.xlsx")
df.to_excel(results_path, index=False)
print(f"✅ Saved {len(df)} records in results_table.xlsx")

❌ Failed to load palestinian-prisoners-released-gaza.html.json: Expecting value: line 41 column 17 (char 2660)
❌ Failed to load israel-gaza-cease-fire-talks.html.json: Expecting value: line 40 column 17 (char 2850)
❌ Failed to load israel-hostages-killed-strike.html.json: Expecting value: line 37 column 17 (char 2231)
❌ Failed to load babies-dying-preventable-causes-besieged-gaza-oxfam.json: Expecting value: line 23 column 17 (char 5663)
❌ Failed to load biden-netanyahu-iran-israel-us-wont-support.json: Expecting value: line 47 column 17 (char 2942)
❌ Failed to load israel-hamas-war-gaza-news#missing-people-under-gazas-rubble-make-for-a-shadow-death-toll.json: Expecting value: line 34 column 17 (char 32343)
❌ Failed to load stop-playing-games-over-israeli-hostages-families-tell-netanyahu-hqjctj0mp.json: Expecting value: line 27 column 17 (char 775)
❌ Failed to load oct-7-attacks-hamas-israel-sexual-violence.html.json: Expecting value: line 33 column 17 (char 17029)
❌ Failed to load isr

In [None]:
# Per-file token counts, one entry per successfully read file. The four lists
# are kept the same length so they can be summed element-wise afterwards.
extractor_inputs, extractor_outputs, summarizer_inputs, summarizer_outputs = [], [], [], []
total_tokens = []
for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        token_usage = data["meta_data"]['token_usage']
        # Read all four counts before appending any of them: if a later key is
        # missing, nothing is appended and the lists stay aligned.
        extractor_in = token_usage["extractor"]["input_tokens"]
        extractor_out = token_usage["extractor"]["output_tokens"]
        summarizer_in = token_usage["summarizer"]["input_tokens"]
        summarizer_out = token_usage["summarizer"]["output_tokens"]
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")
        continue
    extractor_inputs.append(extractor_in)
    extractor_outputs.append(extractor_out)
    summarizer_inputs.append(summarizer_in)
    summarizer_outputs.append(summarizer_out)

In [None]:
from statistics import mean
import numpy
def get_metrics(l):
    """Return ``(mean, max, min)`` of the non-zero values in ``l``.

    Args:
        l: A list, numpy array, or single numeric scalar. Zero entries are
            ignored.

    Returns:
        A 3-tuple ``(mean, max, min)`` of built-in Python numbers.

    Raises:
        statistics.StatisticsError: If no non-zero values remain.
    """
    # tolist() turns numpy scalars into built-in int/float values. This keeps
    # the results plain Python numbers and avoids statistics.mean() trying to
    # convert its result back to a numpy integer type.
    values = [v for v in numpy.asarray(l).ravel().tolist() if v != 0]
    return mean(values), max(values), min(values)

# Total tokens per file: extractor and summarizer, input plus output.
tokens_per_file = (
    numpy.array(extractor_inputs)
    + numpy.array(extractor_outputs)
    + numpy.array(summarizer_inputs)
    + numpy.array(summarizer_outputs)
)
a, b, c = get_metrics(tokens_per_file)

In [None]:
# Report the mean, maximum and minimum returned by get_metrics().
print("mean:", a, "max:", b, "min:", c)

In [None]:
!pip install 

In [None]:
# Run the newspaper command line on a single CNN article URL, with the language
# set to English, and write the result as JSON to article.json in the current
# working directory.
# NOTE(review): `!python` may not be the kernel's interpreter — confirm it is
# the environment where newspaper is installed.
!python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json


In [None]:
import newspaper

# Article to fetch through newspaper's article() shortcut.
article_url = 'https://www.haaretz.co.il/magazine/2025-10-23/ty-article-magazine/.highlight/0000019a-0a73-dfc6-a3bf-fb77ff1e0000'
article = newspaper.article(article_url)

In [None]:
# Display the summary attribute of the fetched article.
# NOTE(review): in newspaper the summary is presumably only filled in after
# article.nlp() has been called; if this shows an empty string, confirm whether
# an nlp() step is missing.
article.summary