In [1]:
import os
import json
import pandas as pd

DATA_DIR = "/Users/delmedigo/Dev/langtest/langscrape/data/jsons"

mapping_dict = {
    "Number": "meta_data.id",
    "Link": "meta_data.url",
    "Title": "summary.title",
    "Description*": "summary.summary",
    "Published Date": "summary.publication_date",
    "Event date from": "summary.event_start_date",
    "Event date to": "summary.event_end_date",
    "Platform": "summary.platform",
    "Author": "summary.author",
    "Source": None,
    "Reference": None,
    "Language": "summary.language",
    "Location": "summary.location_tags",
    "Type": "summary.type",
    "Media": "summary.media",
    "Theme (tag)": "summary.theme_tags",
    "Places & Organizations (tag)": "summary.countries_and_organizations_tags",
    "Locations (tag)": "summary.location_tags",
    "Figures (tag)": "summary.figures_tags",
}

def get_value(obj, path):
    """Resolve dotted paths like 'a.b[0]' safely."""
    if not path:
        return ""
    parts = path.replace("]", "").split(".")
    current = obj
    try:
        for part in parts:
            if "[" in part:
                key, idx = part.split("[")
                current = current.get(key, [])[int(idx)]
            else:
                current = current.get(part, "")
        if isinstance(current, list):
            return ", ".join(map(str, current))
        return current
    except Exception:
        return ""

rows = []
files = [
    f for f in os.listdir(DATA_DIR)
    if f.endswith(".json") and f != "links.csv"
]

for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        row = {}
        for col, path_expr in mapping_dict.items():
            row[col] = get_value(data, path_expr) if path_expr else ""
        rows.append(row)
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

df = pd.DataFrame(rows)
print(f"✅ Loaded {len(df)} records into DataFrame.")
df.to_excel(os.path.join(DATA_DIR, "results_table.xlsx"), index=False)
print(f"✅ Saved {len(df)} records in results_table.xlsx")

✅ Loaded 48 records into DataFrame.
✅ Saved 48 records in results_table.xlsx


In [None]:
extractor_inputs, extractor_outputs, summarizer_inputs, summarizer_outputs = [], [], [], []
total_tokens = []
for file in files:
    path = os.path.join(DATA_DIR, file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        extractor_inputs.append(data["meta_data"]['token_usage']["extractor"]["input_tokens"])
        extractor_outputs.append(data["meta_data"]['token_usage']["extractor"]["output_tokens"])
        summarizer_inputs.append(data["meta_data"]['token_usage']["summarizer"]["input_tokens"])
        summarizer_outputs.append(data["meta_data"]['token_usage']["summarizer"]["output_tokens"])
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")

❌ Failed to load logging.json: 'meta_data'


In [20]:
from statistics import mean
import numpy
def get_metrics(l):
    l = [e for e in l if e != 0]
    if isinstance(l, list):
        return mean(l), max(l), min(l)
    elif isinstance(l, numpy.int64):
        return mean(l).item(), max(l).item(), min(l).item()
    else:
        return mean(l).item(), max(l).item(), min(l).item()

a, b, c = get_metrics(numpy.array(extractor_inputs) + numpy.array(extractor_outputs) + numpy.array(summarizer_inputs) + numpy.array(summarizer_outputs))

In [21]:
print("mean:",a,"max:",b,"min:",c)

mean: 28192 max: 79676 min: 4556
