In [None]:
%load_ext autoreload
%autoreload 2


from src import functions_claude2

In [None]:
# Batch "zero_shot_nonanno". This is the first batch loaded, so `results` is
# assigned outright here rather than extended.
# NOTE(review): presumably `results_from_file` takes the batch description as
# its first argument and the finished-results path as its second — confirm in
# src/functions_claude2.
input_file2, output_file2 = (
    'batches/claude/batch_2025-05-10--011952.json',
    'batches/claude-finished/zero_shot_nonanno.jsonl',
)
results = functions_claude2.results_from_file(input_file2, output_file2)

In [None]:
# Batch "zero_shot_last-four": its entries are appended to the existing `results`.
# Caution: `results` is rebuilt from its own previous value, so running this
# cell twice appends the same entries twice.
input_file3, output_file3 = (
    'batches/claude/batch_2025-05-10--172326.json',
    'batches/claude-finished/zero_shot_last-four.jsonl',
)
results = results + functions_claude2.results_from_file(input_file3, output_file3)

In [None]:
from sklearn.metrics import classification_report

# Batch "zero_shot_explained": appended to `results` (re-running this cell
# appends the same entries again).
input_file, output_file = (
    'batches/claude/batch_2025-05-06--001511.json',
    'batches/claude-finished/zero_shot_explained.jsonl',
)
results = results + functions_claude2.results_from_file(input_file, output_file)

# Reference labels; an entry without a 'gold_label' key counts as '3_kein_thema'.
gold_labels = [item.get('gold_label', '3_kein_thema') for item in results]

# Predicted labels, read by direct indexing: an entry whose output lacks this
# exact path raises instead of being skipped.
predictions = [
    item['output']['result']['message']['content'][0]['input']['topic_ai']
    for item in results
]

# Per-class precision/recall/F1 with four decimals; classes without predictions
# score 0 instead of emitting a warning.
report = classification_report(gold_labels, predictions, zero_division=0, digits=4)
print(report)

In [None]:
# Number of result entries accumulated in `results` by the batch-loading cells.
len(results)

In [None]:
def get_io_tokens(output):
    """Return the token counts recorded for one batch output.

    Args:
        output: Mapping that holds the usage counters under
            ``output['result']['message']['usage']``.

    Returns:
        A 2-tuple ``(input_tokens, output_tokens)`` taken unchanged from the
        usage mapping.

    Raises:
        KeyError: If one of the nested keys is missing.
    """
    token_usage = output['result']['message']['usage']
    return token_usage['input_tokens'], token_usage['output_tokens']


# One (input_tokens, output_tokens) pair per result entry.
tokens = [get_io_tokens(item['output']) for item in results]

# Batch rates come from the pricing object; total cost = tokens * rate, summed
# over input and output.
# NOTE(review): the message below prints "EUR" — confirm that the
# PricingSonnet rates are actually expressed in EUR.
cost = functions_claude2.PricingSonnet
input_cost = sum(n_in for n_in, _ in tokens) * cost.input_tokens_batch
output_cost = sum(n_out for _, n_out in tokens) * cost.output_tokens_batch
total_cost = input_cost + output_cost
print(f"Total cost: {total_cost:.2f} EUR")


In [None]:
# Number of result entries currently held in `results`.
len(results)

In [None]:
import json
from pathlib import Path
from collections import defaultdict

# URL -> parsed corpus document (first file seen for that URL); filled by the
# corpus scan further down in this cell.
corpus_dict = dict()

# URL -> list of every corpus file path that carries this URL.
url_duplicates = defaultdict(list)

# URL -> result entry. When several results share a URL, the one that comes
# last in `results` wins.
results_dict = dict((entry['url'], entry) for entry in results)

def get_label(output):
    """Extract the predicted ``topic_ai`` label from one batch output.

    The label is read from
    ``output['result']['message']['content'][0]['input']['topic_ai']``.

    Args:
        output: The ``output`` mapping of a result entry.

    Returns:
        The ``topic_ai`` value, or ``None`` when the expected structure is not
        present: a missing key, an empty ``content`` list, or a
        non-subscriptable value (e.g. ``None``) somewhere along the path.
    """
    try:
        return output['result']['message']['content'][0]['input']['topic_ai']
    except (KeyError, IndexError, TypeError):
        # KeyError alone is not enough: an empty `content` list raises
        # IndexError and a None along the path raises TypeError. One malformed
        # output must not abort labelling of all remaining entries.
        return None

# NOTE(review): absolute, machine-specific path — must be adjusted before
# running this notebook on another machine.
CORPUSDIR = '/home/brunobrocai/Data/MoWiKo/Paper-themKorp-2/final_corpus'

# First pass: index every corpus file by its URL.
for file in Path(CORPUSDIR).glob('*.json'):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    url = data['url']
    # Every file is recorded here, so a URL with more than one path is a duplicate.
    url_duplicates[url].append(str(file))

    # Only the first document seen per URL is kept in corpus_dict; the path it
    # was loaded from is stored alongside under 'filepath'.
    if url not in corpus_dict:
        corpus_dict[url] = data
        corpus_dict[url]['filepath'] = str(file)

# Second pass: write the Claude label into every corpus file of each result URL.
# `url_duplicates[url]` also contains the single path of non-duplicated URLs,
# so one loop covers both the duplicate and the ordinary case.
unmatched_urls = []
for url, result in results_dict.items():
    label = get_label(result['output'])

    # `.get` avoids creating empty entries in the defaultdict for unknown URLs.
    filepaths = url_duplicates.get(url, [])
    if not filepaths:
        # No corpus file carries this URL: remember it instead of dropping it silently.
        unmatched_urls.append(url)
        continue

    if len(filepaths) > 1:
        print(f"Found {len(filepaths)} duplicates for URL: {url}")

    for filepath in filepaths:
        # Re-read from disk so that only the label key is added to the stored
        # document (the in-memory copy also carries the helper key 'filepath').
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        data['Claude_0shot'] = label
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

if unmatched_urls:
    print(f"{len(unmatched_urls)} result URLs have no matching corpus file; no label written for them")

In [None]:
# Write the Claude label into the corpus file recorded in `corpus_dict` for
# each result URL (one file per URL).
# NOTE(review): the write loop in the preceding cell already stores the same
# label under the same key in every file of a URL, so this cell looks
# redundant — confirm before removing it.
for url, result in results_dict.items():
    if url not in corpus_dict:
        # No corpus document is known for this URL; nothing to update.
        continue

    target_path = corpus_dict[url]['filepath']
    with open(target_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    data['Claude_0shot'] = get_label(result['output'])

    with open(target_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)