In [None]:
JSON_SCHEME = {
    "title_in_english": "The title of the content in english",
    "title_in_hebrew": "The title of the content in hebrew",
    "author_in_english": "The content's author name (string, empty if unknown) in english",
    "author_in_hebrew": "The content's author name (string, empty if unknown) in hebrew",
    "publication_date": "YYYY-MM-DD",
    "language": "The primary language of the content",
    "type": "One of the content types listed above",
    "media": "One or more of the media types listed above",
    "platform": "The website or platform where the content is published",
    "source": "for example, if a media post is reposting, or showing a Guardian's interview - the source = 'Guardian'",
    "reference": "a link to the primary document or official source cited (string, empty if none)",
    "summary_in_enlgish": "One clear, informative sentence summarizing the main content, in english",
    "summary_in_hebrew": "One clear, informative sentence summarizing the main content, in hebrew",
    "event_start_date": "YYYY-MM-DD",
    "event_end_date": "YYYY-MM-DD",
    "theme_tags": ["relevant", "theme", "tags"],
    "countries_and_organizations_tags": ["relevant", "countries_and_organizations", "tags"],
    "free_location_tags": ["relevant", "specific", "location", "tags", "not nessearily from the list of tags allowed."],
    "location_tags": ["relevant", "location", "tags"],
    "figures_tags": ["relevant", "figures", "tags"]
}

In [10]:
import os
import json
import pandas as pd

DATA_DIR = "/Users/delmedigo/Dev/langtest/langscrape/data/experiment_08_10_2025 copy/jsons"

mapping_dict = {
    "Number": "meta_data.id",
    "Link": "meta_data.url",
    "English Title": "summary.title_in_english",
    "Hebrew Title": "summary.title_in_hebrew",
    "English Description": ["summary.summary_in_enlgish", "summary.summary_in_english"],
    "Hebrew Description": "summary.summary_in_hebrew",
    "Published Date": "summary.publication_date",
    "Event date from": "summary.event_start_date",
    "Event date to": "summary.event_end_date",
    "Platform": "summary.platform",
    "Author (English)": "summary.author_in_english",
    "Author (Hebrew)": "summary.author_in_hebrew",
    "Source": "summary.source",
    "Reference": "summary.platform",
    "Language": "summary.language",
    "Location": "summary.free_location_tags",
    "Type": "summary.type",
    "Media": "summary.media",
    "Theme": "summary.theme_tags",
    "Countries & Organizations": "summary.countries_and_organizations_tags",
    "Location (tag)": "summary.location_tags",
    "Figures": "summary.figures_tags",
}

def get_value(obj, path):
    """Resolve dotted paths like 'a.b[0]' safely."""
    if not path:
        return ""
    parts = path.replace("]", "").split(".")
    current = obj
    try:
        for part in parts:
            if "[" in part:
                key, idx = part.split("[")
                current = current.get(key, [])[int(idx)]
            else:
                current = current.get(part, "")
        if isinstance(current, list):
            return ", ".join(map(str, current))
        return current
    except Exception:
        return ""

rows = []
files = [
    f for f in os.listdir(DATA_DIR)
    if f.endswith(".json") and f != "links.csv"
]

for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        row = {}
        for col, path_expr in mapping_dict.items():
            if isinstance(path_expr, list):
                for p in path_expr:
                    value = get_value(data, p) if p else ""
                    if value != "":
                        row[col] = value
            else:
                row[col] = get_value(data, path_expr) if path_expr else ""
        rows.append(row)
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

df = pd.DataFrame(rows)
print(f"✅ Loaded {len(df)} records into DataFrame.")
df.to_excel(os.path.join(DATA_DIR, "results_table.xlsx"), index=False)
print(f"✅ Saved {len(df)} records in results_table.xlsx")

✅ Loaded 29 records into DataFrame.
✅ Saved 29 records in results_table.xlsx


In [3]:
extractor_inputs, extractor_outputs, summarizer_inputs, summarizer_outputs = [], [], [], []
total_tokens = []
for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        extractor_inputs.append(data["meta_data"]['token_usage']["extractor"]["input_tokens"])
        extractor_outputs.append(data["meta_data"]['token_usage']["extractor"]["output_tokens"])
        summarizer_inputs.append(data["meta_data"]['token_usage']["summarizer"]["input_tokens"])
        summarizer_outputs.append(data["meta_data"]['token_usage']["summarizer"]["output_tokens"])
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

❌ Failed to load logging.json: 'meta_data'


In [None]:
from statistics import mean
import numpy
def get_metrics(l):
    l = [e for e in l if e != 0]
    if isinstance(l, list):
        return mean(l), max(l), min(l)
    elif isinstance(l, numpy.int64):
        return mean(l).item(), max(l).item(), min(l).item()
    else:
        return mean(l).item(), max(l).item(), min(l).item()

a, b, c = get_metrics(numpy.array(extractor_inputs) + numpy.array(extractor_outputs) + numpy.array(summarizer_inputs) + numpy.array(summarizer_outputs))

In [None]:
print("mean:",a,"max:",b,"min:",c)

In [None]:
!pip install 

In [None]:
!python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json


In [None]:
import newspaper

article = newspaper.article('https://www.haaretz.co.il/magazine/2025-10-23/ty-article-magazine/.highlight/0000019a-0a73-dfc6-a3bf-fb77ff1e0000')

In [None]:
article.summary