In [1]:
import os
import json
import pandas as pd

DATA_DIR = "/Users/delmedigo/Dev/langtest/langscrape/data/jsons"

mapping_dict = {
    "Number": "meta_data.id",
    "Link": "meta_data.url",
    "Title": "summary.title",
    "Description*": "summary.summary",
    "Published Date": "summary.publication_date",
    "Event date from": "summary.event_start_date",
    "Event date to": "summary.event_end_date",
    "Platform": "summary.platform",
    "Author": "summary.author",
    "Source": None,
    "Reference": None,
    "Language": "summary.language",
    "Location": "summary.location_tags",
    "Type": "summary.type",
    "Media": "summary.media",
    "Theme (tag)": "summary.theme_tags",
    "Places & Organizations (tag)": "summary.countries_and_organizations_tags",
    "Locations (tag)": "summary.location_tags",
    "Figures (tag)": "summary.figures_tags",
}

def get_value(obj, path):
    """Resolve dotted paths like 'a.b[0]' safely."""
    if not path:
        return ""
    parts = path.replace("]", "").split(".")
    current = obj
    try:
        for part in parts:
            if "[" in part:
                key, idx = part.split("[")
                current = current.get(key, [])[int(idx)]
            else:
                current = current.get(part, "")
        if isinstance(current, list):
            return ", ".join(map(str, current))
        return current
    except Exception:
        return ""

rows = []
files = [
    f for f in os.listdir(DATA_DIR)
    if f.endswith(".json") and f != "links.csv"
]

for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        row = {}
        for col, path_expr in mapping_dict.items():
            row[col] = get_value(data, path_expr) if path_expr else ""
        rows.append(row)
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

df = pd.DataFrame(rows)
print(f"✅ Loaded {len(df)} records into DataFrame.")
df.to_excel(os.path.join(DATA_DIR, "results_table.xlsx"), index=False)
print(f"✅ Saved {len(df)} records in results_table.xlsx")

✅ Loaded 27 records into DataFrame.
✅ Saved 27 records in results_table.xlsx


In [2]:
extractor_inputs, extractor_outputs, summarizer_inputs, summarizer_outputs = [], [], [], []
total_tokens = []
for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        extractor_inputs.append(data["meta_data"]['token_usage']["extractor"]["input_tokens"])
        extractor_outputs.append(data["meta_data"]['token_usage']["extractor"]["output_tokens"])
        summarizer_inputs.append(data["meta_data"]['token_usage']["summarizer"]["input_tokens"])
        summarizer_outputs.append(data["meta_data"]['token_usage']["summarizer"]["output_tokens"])
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

❌ Failed to load logging.json: 'meta_data'


In [3]:
from statistics import mean
import numpy
def get_metrics(l):
    l = [e for e in l if e != 0]
    if isinstance(l, list):
        return mean(l), max(l), min(l)
    elif isinstance(l, numpy.int64):
        return mean(l).item(), max(l).item(), min(l).item()
    else:
        return mean(l).item(), max(l).item(), min(l).item()

a, b, c = get_metrics(numpy.array(extractor_inputs) + numpy.array(extractor_outputs) + numpy.array(summarizer_inputs) + numpy.array(summarizer_outputs))

In [4]:
print("mean:",a,"max:",b,"min:",c)

mean: 32271 max: 54881 min: 10704


In [None]:
!pip install 

Collecting newspaper4k
  Downloading newspaper4k-0.9.3.1-py3-none-any.whl.metadata (14 kB)
Collecting Pillow>=4.0.0 (from newspaper4k)
  Using cached pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl.metadata (8.8 kB)
Collecting feedparser>=6.0.0 (from newspaper4k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting nltk>=3.6.6 (from newspaper4k)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting tldextract>=2.0.1 (from newspaper4k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser>=6.0.0->newspaper4k)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting click (from nltk>=3.6.6->newspaper4k)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting joblib (from nltk>=3.6.6->newspaper4k)
  Using cached jo

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Downloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.3


In [8]:
!python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/delmedigo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:
import newspaper

article = newspaper.article('https://www.haaretz.co.il/magazine/2025-10-23/ty-article-magazine/.highlight/0000019a-0a73-dfc6-a3bf-fb77ff1e0000')

In [17]:
article.summary

''