# Copy scripts w/ input to destination

In [None]:
from glob import glob
from pathlib import Path
import shutil, os
from tqdm import tqdm

# For every notebook found in notebooks/, look for a file with the same name
# under scripts_out_all/ and copy it into notebooks_w_output/.
src_list = glob("/home/b27jin/CodeModernization/notebooks/*.ipynb")
dst_dir = Path("/home/b27jin/CodeModernization/notebooks_w_output")
dst_dir.mkdir(parents=True, exist_ok=True)

# Directory that is searched for the same-named counterpart of each notebook.
out_root = Path("/home/b27jin/mle-bench-internal/docker-test/scripts_out_all")

copied, missing = 0, []
for nb_path in tqdm(src_list):
    nb_name = f"{Path(nb_path).stem}.ipynb"
    candidate = out_root / nb_name

    # Guard clause: remember counterparts that do not exist and move on.
    if not candidate.exists():
        missing.append(str(candidate))
        continue

    # copy2 also carries over file metadata such as timestamps.
    shutil.copy2(candidate, dst_dir / nb_name)
    copied += 1

# Summary: how many were copied, followed by every path that was not found.
print(f"\nDone. Copied {copied}/{len(src_list)} files.")
if missing:
    print("Missing sources:")
    print("\n".join(f"  {path}" for path in missing))

# Create a JSON file including file info

In [29]:
import json
from pathlib import Path
from tqdm import tqdm
from glob import glob
import datetime, re

# Build one summary record per notebook in notebooks/ and write them all to
# sampled_notebook_info.json, keyed by "<stem>.ipynb".
src_list = glob("/home/b27jin/CodeModernization/notebooks/*.ipynb")

# Read the JSON inputs as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
with open("/home/b27jin/mle-bench-internal/docker-test/mlebench_score.json", "r", encoding="utf-8") as f:
    score_content = json.load(f)

with open("/home/b27jin/mle-bench-internal/fetch/kernal.json", "r", encoding="utf-8") as f:
    kernel_content = json.load(f)

# Compiled once, outside the loop: locates the start of the score JSON object
# inside the decoded text, which may be preceded by other text/logs.
score_json_start = re.compile(r'{\s*"competition_id":')


def _is_number(value):
    """Return True for int/float values; bool is excluded on purpose."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


info = {}
for file in tqdm(src_list):
    stem = Path(file).stem
    key = stem + ".ipynb"
    # The part before the first "_" selects the competition; the remainder
    # (which may itself contain "_") plus ".html" selects the submission.
    compt = key.split("_")[0]
    submission_id = "_".join(stem.split("_")[1:]) + ".html"

    # A notebook counts as passed exactly when its score entry has a "status" key.
    passed = "status" in score_content[key]
    is_buggy = not passed

    measured_score = None
    if passed:
        with open(f"/home/b27jin/mle-bench-internal/docker-test/scripts_scores/{stem}.json", "r", encoding="utf-8") as f:
            # The file holds a single JSON string literal; decode it to get the raw text.
            inner_content = json.loads(f.read())
        match = score_json_start.search(inner_content)
        if not match:
            raise ValueError("Could not find JSON object in file content")
        # Everything from the match onward is parsed as the score JSON object.
        score_data = json.loads(inner_content[match.start():])
        measured_score = score_data['score']

    kernel_meta = kernel_content[compt][submission_id]
    reported_score = float(kernel_meta['ps']) if "ps" in kernel_meta else None

    # Relative deviation of the measured score from the reported one.
    # Integer scores decoded from JSON (e.g. 1) are accepted as numbers too;
    # a reported score of 0 is skipped to avoid dividing by zero.
    if _is_number(reported_score) and _is_number(measured_score) and reported_score != 0.0:
        thrus = abs((measured_score - reported_score) / reported_score)
    else:
        thrus = None
    # Compare against None explicitly: a deviation of exactly 0.0 is falsy,
    # but it is a perfect reproduction and must count as replicable.
    replicable = thrus is not None and thrus <= 0.5

    creation = datetime.datetime(
        kernel_meta['year'], kernel_meta['month'], kernel_meta['date']
    ).strftime('%m/%d/%Y')

    info[key] = {
        "is_buggy": is_buggy,
        "passed": passed,
        "measured_score": measured_score,
        "reported_score": reported_score,
        "replicable": replicable,
        "thrus": thrus,
        "creation": creation
    }

with open("sampled_notebook_info.json", "w", encoding="utf-8") as f:
    json.dump(info, f, ensure_ascii=False, indent=2)


100%|██████████| 219/219 [00:00<00:00, 24798.94it/s]
