# Copy scripts w/ input to destination

In [None]:
from glob import glob
from pathlib import Path
import shutil, os
from tqdm import tqdm

# For every sampled notebook, look up the notebook of the same name in
# scripts_out_all and copy it into notebooks_w_output.
src_list = glob("/home/b27jin/CodeModernization/notebooks/*.ipynb")
dst_dir = Path("/home/b27jin/CodeModernization/notebooks_w_output")
dst_dir.mkdir(parents=True, exist_ok=True)

copied = 0
missing = []
for file in tqdm(src_list):
    base = Path(file).stem
    src = Path(f"/home/b27jin/mle-bench-internal/docker-test/scripts_out_all/{base}.ipynb")

    # Guard clause: remember sources that do not exist and move on.
    if not src.exists():
        missing.append(str(src))
        continue

    # copy2 also carries over file metadata such as timestamps.
    shutil.copy2(src, dst_dir / f"{base}.ipynb")
    copied += 1

print(f"\nDone. Copied {copied}/{len(src_list)} files.")
if missing:
    print("Missing sources:")
    print("\n".join(f"  {m}" for m in missing))

# Create a json file including file info

In [None]:
import json
from pathlib import Path
from tqdm import tqdm
from glob import glob
import datetime, re

# Build sampled_notebook_info.json: one record per sampled notebook holding
# its pass/fail state, measured vs. reported score and creation date.
src_list = glob("/home/b27jin/CodeModernization/notebooks/*.ipynb")

with open("/home/b27jin/mle-bench-internal/docker-test/mlebench_score.json", "r") as f:
    score_content = json.load(f)

with open("/home/b27jin/CodeModernization/kernel.json", "r") as f:
    kernel_content = json.load(f)

info = {}
for file in tqdm(src_list):
    stem = Path(file).stem
    key = stem + ".ipynb"
    # Notebook names look like "<competition>_<submission id>.ipynb".
    compt = key.split("_")[0]
    submission_id = "_".join(stem.split("_")[1:]) + ".html"
    kernel_meta = kernel_content[compt][submission_id]

    # A notebook counts as passed when its score entry carries a "status" key.
    passed = "status" in score_content[key]
    is_buggy = not passed

    measured_score = None
    if passed:
        with open(f"/home/b27jin/mle-bench-internal/docker-test/scripts_scores/{stem}.json", "r", encoding="utf-8") as f:
            # The file holds a single JSON string literal; decode it first.
            inner_content = json.loads(f.read())
        # The decoded text may start with logs; locate the JSON object itself.
        match = re.search(r'{\s*"competition_id":', inner_content)
        if not match:
            raise ValueError("Could not find JSON object in file content")
        score_data = json.loads(inner_content[match.start():])
        measured_score = score_data['score']

    reported_score = float(kernel_meta['ps']) if "ps" in kernel_meta else None

    # Relative deviation of the measured score from the reported one.
    if isinstance(reported_score, float) and isinstance(measured_score, float) and reported_score != 0.0:
        thrus = abs((measured_score - reported_score) / reported_score)
    else:
        thrus = None
    # Compare against None explicitly: a deviation of exactly 0.0 (perfect
    # match) is falsy and must still count as replicable.
    replicable = thrus is not None and thrus <= 0.5

    creation = datetime.datetime(
        kernel_meta['year'], kernel_meta['month'], kernel_meta['date']
    ).strftime('%m/%d/%Y')
    info[key] = {
        "is_buggy": is_buggy,
        "passed": passed,
        "measured_score": measured_score,
        "reported_score": reported_score,
        "replicable": replicable,
        "thrus": thrus,
        "creation": creation
    }

with open("sampled_notebook_info.json", "w", encoding="utf-8") as f:
    json.dump(info, f, ensure_ascii=False, indent=2)


# nb stats

In [3]:
import json
import os
import re
from pathlib import Path
from tqdm import tqdm

# Read the JSON file
# Locations used by the statistics below: the per-notebook execution record
# and the directory holding the executed notebooks.
scripts_dir = "/home/b27jin/mle-bench-internal/docker-test/scripts_out_all"
json_path = "/home/b27jin/mle-bench-internal/docker-test/executable_files_w_timer_parrallel_full.json"

# Score summary keyed by notebook file name.
with open("/home/b27jin/mle-bench-internal/docker-test/mlebench_score.json", "r") as score_file:
    score_content = json.load(score_file)

# Kernel metadata, indexed as kernel_content[competition][submission html name].
with open("/home/b27jin/CodeModernization/kernel.json", "r") as kernel_file:
    kernel_content = json.load(kernel_file)
    
def get_score(file):
    """Return the measured score stored for the notebook *file*.

    The notebook is looked up in ``score_content`` under ``<stem>.ipynb``.
    When that entry has no ``"status"`` key, ``None`` is returned. Otherwise
    the matching file in ``scripts_scores`` is read and its ``"score"`` field
    returned.

    Raises:
        ValueError: if no ``{"competition_id": ...`` object is found in the
            decoded score file.
    """
    stem = Path(file).stem
    if "status" not in score_content[stem + ".ipynb"]:
        return None

    with open(f"/home/b27jin/mle-bench-internal/docker-test/scripts_scores/{stem}.json", "r", encoding="utf-8") as handle:
        # The whole file is one JSON string literal; decode it to plain text.
        decoded = json.loads(handle.read())

    # Logs may precede the report, so search for where the object begins.
    start = re.search(r'{\s*"competition_id":', decoded)
    if not start:
        raise ValueError("Could not find JSON object in file content")

    report = json.loads(decoded[start.start():])
    return report['score']

def is_replicable(entity, measured_score):
    """Tell whether *measured_score* reproduces the score reported for *entity*.

    Args:
        entity: notebook file name of the form
            ``<competition>_<submission id>.ipynb``.
        measured_score: score obtained when re-running the notebook.

    Returns:
        ``True`` when the relative deviation
        ``abs((measured - reported) / reported)`` is at most 0.5. ``False``
        when it is larger, when no reported score (``"ps"``) exists, when
        the reported score is 0.0, or when *measured_score* is not a float.
    """
    compt = entity.split("_")[0]
    submission_id = "_".join(Path(entity).stem.split("_")[1:]) + ".html"
    kernel_meta = kernel_content[compt][submission_id]

    if "ps" not in kernel_meta:
        return False
    reported_score = float(kernel_meta['ps'])

    # The relative deviation is undefined for a zero reference score.
    if not isinstance(measured_score, float) or reported_score == 0.0:
        return False

    thrus = abs((measured_score - reported_score) / reported_score)
    # Compare the value directly: a deviation of exactly 0.0 (perfect match)
    # is falsy, so a truthiness check would wrongly report "not replicable".
    return thrus <= 0.5

# Buckets for notebooks whose execution produced at least one error output.
error_notebooks = []
error_notebooks_no_csv = []
error_notebooks_csv_overThr = []
error_notebooks_csv_belowThr = []

# Buckets for notebooks that ran without any error output.
no_error_notebooks = []
no_error_notebooks_no_csv = []
no_error_notebooks_csv_overThr = []
no_error_notebooks_csv_belowThr = []

timeout_notebooks = []

with open(json_path, 'r') as f:
    data = json.load(f)

print(f"Found {len(data)} entities in the JSON file")

for entity, info in tqdm(data.items()):
    # No status but an execution time of at least 600 is counted as a timeout.
    if 'status' not in info and 'execution_time' in info and info['execution_time'] >= 600:
        timeout_notebooks.append(entity)
        continue

    script_path = os.path.join(scripts_dir, entity)
    if not os.path.exists(script_path):
        print(f"Script NOT FOUND: {script_path}")
        continue

    try:
        nb = json.loads(Path(script_path).read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # script_path is a plain string, so take its base name with os.path
        # (str has no .name attribute). ValueError covers JSON and decoding
        # errors.
        print(f"Failed to load {os.path.basename(script_path)}: {e}")
        continue

    # A notebook has an error as soon as any code cell holds an error output.
    has_error = any(
        out.get("output_type") == "error"
        for cell in nb.get("cells", [])
        if cell.get("cell_type") == "code"
        for out in cell.get("outputs", [])
    )

    if has_error:
        everything, no_csv, below_thr, over_thr = (
            error_notebooks,
            error_notebooks_no_csv,
            error_notebooks_csv_belowThr,
            error_notebooks_csv_overThr,
        )
    else:
        everything, no_csv, below_thr, over_thr = (
            no_error_notebooks,
            no_error_notebooks_no_csv,
            no_error_notebooks_csv_belowThr,
            no_error_notebooks_csv_overThr,
        )

    everything.append(entity)
    measured_score = get_score(script_path)
    if measured_score is None:
        no_csv.append(entity)
    elif is_replicable(entity, measured_score):
        below_thr.append(entity)
    else:
        over_thr.append(entity)


print(f"Timeout Notebooks: {len(timeout_notebooks)}")
print(f"Notebooks with errors: {len(error_notebooks)}")
print(f"    - w/o csv: {len(error_notebooks_no_csv)}")
print(f"    - w/ csv (replicable): {len(error_notebooks_csv_belowThr)}")
print(f"    - w/ csv (non-replicable): {len(error_notebooks_csv_overThr)}")
print(f"Notebooks without errors: {len(no_error_notebooks)}")
print(f"    - w/o csv: {len(no_error_notebooks_no_csv)}")
print(f"    - w/ csv (replicable): {len(no_error_notebooks_csv_belowThr)}")
print(f"    - w/ csv (non-replicable): {len(no_error_notebooks_csv_overThr)}")

Found 12036 entities in the JSON file


100%|██████████| 12036/12036 [03:31<00:00, 56.93it/s] 

Timeout Notebooks: 456
Notebooks with errors: 8778
    - w/o csv: 6113
    - w/ csv (replicable): 2035
    - w/ csv (non-replicable): 630
Notebooks without errors: 2802
    - w/o csv: 183
    - w/ csv (replicable): 2374
    - w/ csv (non-replicable): 245





# Stats of all collected scripts

In [1]:
import json
import os
import re
import html
import random
import inspect
from tqdm import tqdm
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell

# Keep only the keys of the executable-files stats mapping.
with open("/home/b27jin/mle-bench-internal/docker-test/executable_files_stats.json", "r", encoding="utf-8") as stats_file:
    stats = json.load(stats_file)
    data = list(stats.keys())

def read_html_content(file_path, nb=None):
    """Read an HTML file and optionally append its code blocks to a notebook.

    Args:
        file_path: path of the HTML file, read as UTF-8.
        nb: optional notebook object. When given, a seed import cell is
            appended first, followed by one code cell for every
            ``highlight hl-ipython3`` block that contains a ``<pre>`` element.

    Returns:
        The concatenated inner HTML of all ``input_area`` divs, with spaces
        directly before ``<pre>`` removed (empty string when there are none).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Find all code blocks with highlight hl-ipython3 class
    highlight_blocks = re.findall(r'<div class="highlight hl-ipython3">(.*?)</div>', content, re.DOTALL)

    # Process all input areas as before for backward compatibility
    input_areas = "".join(re.findall(r'<div class="input_area">(.*?)</div>', content, re.DOTALL))
    input_areas = re.sub(r' *<pre>', '<pre>', input_areas)

    # If notebook object provided, add each code block as a cell
    if nb is not None:
        nb.cells.append(new_code_cell('import pandas as pd\nfrom pathlib import Path'))
        for block in highlight_blocks:
            code_match = re.search(r'<pre>(.*?)</pre>', block, re.DOTALL)
            if code_match:
                # Hand over the raw markup: assemble_code_regex decodes the
                # entities itself, and decoding here as well would turn an
                # escaped "&amp;lt;" into "<" instead of "&lt;".
                raw_code = assemble_code_regex(code_match.group(1))
                nb.cells.append(new_code_cell(raw_code))

    return input_areas
    
def assemble_code_regex(html_snippet: str) -> str:
    """
    Uses a regular expression to remove HTML tags from the snippet and unescapes HTML entities.

    Tags are removed before entities are decoded. Decoding first would turn
    escaped source text such as ``a &lt;b and c&gt; d`` into something that
    looks like a tag, and the tag regex would then delete part of the code.
    """
    # Remove all real HTML tags while code characters are still escaped
    code = re.sub(r'</?[a-zA-Z][^>]*>', '', html_snippet)
    # Decode the entities that represent the actual source characters
    return html.unescape(code)

# parent = 'C:\\Users\\b27jin\\Documents\\mle-bench-internal\\fetch\\competitions'
parent = "/home/b27jin/mle-bench-internal/fetch/competitions"

# Load the kernel metadata: {competition: {html file name: info}}.
with open("/home/b27jin/CodeModernization/kernel.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Total entries: {sum(len(files) for files in data.values())}")

# Keep submissions from 2019 on that have a private score, ran for at most
# 600 seconds and list at most one dataset.
filtered = [
    (comp, fname)
    for comp, files in data.items()
    for fname, info in files.items()
    if info['year'] >= 2019 and "ps" in info and info['runtime'] <= 600 and len(info['datasets']) <= 1
]

print(f"Total filtered entries: {len(filtered)}")

r_count = 0
valid_scripts = []
for comp, fname in tqdm(filtered):
    nb = new_notebook()
    # read_html_content appends the seed cell plus one cell per code block.
    read_html_content(os.path.join(parent, comp, 'html', fname), nb)

    # Exactly one cell means only the seed cell exists, i.e. no code block
    # was extracted from the page; such pages are tallied in r_count.
    if len(nb.cells) == 1:
        r_count += 1
        continue
    valid_scripts.append(f"{comp}_{fname.split('.html')[0]}.ipynb")


print(f'{len(valid_scripts)} valid scripts')
print(f'{r_count=}')


Total entries: 151236
Total filtered entries: 12197


100%|██████████| 12197/12197 [01:59<00:00, 102.33it/s]

12043 valid scripts
r_count=154





In [2]:
json_path = "/home/b27jin/mle-bench-internal/docker-test/executable_files_w_timer_parrallel_full.json"
with open(json_path, 'r') as f:
    data = json.load(f)

# Only the entity names (the mapping's keys) are needed here.
files_only = [entity for entity, _ in tqdm(data.items())]

print(f'{len(files_only)} files in scripts_full')

valid_set = set(valid_scripts)
print(f'{len(valid_set)} valid scripts')
files_set = set(files_only)

# Names present on one side only, sorted for stable output.
in_files_not_valid = sorted(files_set.difference(valid_set))
in_valid_not_files = sorted(valid_set.difference(files_set))

print(f"In files_only but not in valid_scripts: {len(in_files_not_valid)}")
for name in in_files_not_valid[:25]:
    print("  ", name)

print(f"In valid_scripts but not in files_only: {len(in_valid_not_files)}")
for name in in_valid_not_files:
    print("  ", name)


100%|██████████| 12036/12036 [00:00<00:00, 4799186.51it/s]

12036 files in scripts_full
12043 valid scripts
In files_only but not in valid_scripts: 4
   osic-pulmonary-fibrosis-progression_fflorio_osic-starter-in-r-preprocessing-with-recipes_v7_C1.ipynb
   spaceship-titanic_draganpinsent98_spaceship-titanic-r-glm-rf-dt-nb_v29_C1.ipynb
   spaceship-titanic_draganpinsent98_spaceship-titanic-r-glm-rf-dt-nb_v30_C1.ipynb
   spaceship-titanic_yiukitcheung_spaceship-titanic-r-0-804-accuracy_v21_C1.ipynb
In valid_scripts but not in files_only: 11
   imet-2020-fgvc7_ashkhagan_imet2020_v2_C1.ipynb
   kuzushiji-recognition_seriousran_try-to-break-0_v1_C1.ipynb
   rsna-breast-cancer-detection_abdulkadirguner_27subat-sub2-sites1and2_v1_C1.ipynb
   rsna-breast-cancer-detection_cafelatte1_rsna-baseline-with-logistic-regression_v22_C1.ipynb
   rsna-breast-cancer-detection_dschettler8845_rsna-bcd-simple-age-baseline-submission_v11_C1.ipynb
   rsna-breast-cancer-detection_jitshil143_rsna-breast-cancer-inference_v11_C1.ipynb
   rsna-breast-cancer-detection_meerat




# Create Kernel

In [1]:
from tqdm import tqdm
import re,os,glob
import datetime
from collections import Counter
import ast
import html
import json
import warnings
import tempfile
import nbformat
import subprocess
from pathlib import Path
from nbformat.v4 import new_notebook, new_code_cell
warnings.filterwarnings("ignore", category=SyntaxWarning)

def get_folders(base_path):
    """Return the names of the directories located directly inside *base_path*."""
    def is_folder(name):
        return os.path.isdir(os.path.join(base_path, name))

    return list(filter(is_folder, os.listdir(base_path)))

def read_html_content(file_path, nb=None):
    """Read an HTML file and append its code blocks to a notebook if provided.

    Args:
        file_path: path of the HTML file, read as UTF-8.
        nb: optional notebook object. When given, a seed import cell is
            appended first, followed by one code cell for every
            ``highlight hl-ipython3`` block that contains a ``<pre>`` element.

    Returns:
        The *nb* argument itself (``None`` when no notebook was passed).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # If notebook object provided, add each code block as a cell
    if nb is not None:
        # Find all code blocks with highlight hl-ipython3 class
        highlight_blocks = re.findall(r'<div class="highlight hl-ipython3">(.*?)</div>', content, re.DOTALL)

        nb.cells.append(new_code_cell('import pandas as pd\nfrom pathlib import Path'))
        for block in highlight_blocks:
            code_match = re.search(r'<pre>(.*?)</pre>', block, re.DOTALL)
            if code_match:
                # Hand over the raw markup: assemble_code_regex decodes the
                # entities itself, and decoding here as well would turn an
                # escaped "&amp;lt;" into "<" instead of "&lt;".
                raw_code = assemble_code_regex(code_match.group(1))
                nb.cells.append(new_code_cell(raw_code))

    return nb
    
def assemble_code_regex(html_snippet: str) -> str:
    """
    Uses a regular expression to remove HTML tags from the snippet and unescapes HTML entities.

    Tags are removed before entities are decoded. Decoding first would turn
    escaped source text such as ``a &lt;b and c&gt; d`` into something that
    looks like a tag, and the tag regex would then delete part of the code.
    """
    # Remove all real HTML tags while code characters are still escaped
    code = re.sub(r'</?[a-zA-Z][^>]*>', '', html_snippet)
    # Decode the entities that represent the actual source characters
    return html.unescape(code)


def time_to_seconds(text):
    """
    Extracts hours, minutes, and seconds from a text and converts them to seconds.
    Expected formats are, for example, "4s", "1m 54s", or "1h 59m 59s".

    A fractional seconds part ("12.5s") is accepted and truncated, so the
    result is always an int. Returns None when no duration is found.
    """
    # Optional hours and minutes, mandatory seconds. The optional "\.\d+"
    # consumes a fractional part; without it "12.5s" only matched at "5s"
    # and was reported as 5 seconds.
    time_pattern = r'(?:(?P<h>\d+)\s*h)?\s*(?:(?P<m>\d+)\s*m)?\s*(?P<s>\d+)(?:\.\d+)?\s*s'
    match = re.search(time_pattern, text)
    if match is None:
        return None
    hours = int(match.group('h') or 0)
    minutes = int(match.group('m') or 0)
    seconds = int(match.group('s'))
    return hours * 3600 + minutes * 60 + seconds

def get_imports_from_file(notebook):
    """Use pigar to detect the packages a notebook depends on.

    The notebook is written to ``script.ipynb`` inside a temporary directory,
    ``pigar generate --auto-select`` is run there, and the package names are
    read from the ``requirements.txt`` found in that directory afterwards.

    Args:
        notebook: notebook object accepted by ``nbformat.write``.

    Returns:
        A set of package names. Empty when pigar could not be run or no
        ``requirements.txt`` exists after the run (best effort; failures are
        printed, not raised).
    """
    deps = set()
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        with open(workdir / "script.ipynb", "w", encoding="utf-8") as file:
            nbformat.write(notebook, file)

        try:
            result = subprocess.run(
                ["pigar", "generate", "--auto-select", tmpdir],
                cwd=tmpdir,
                capture_output=True,
                text=True,
                # Pre-answer any interactive prompt with "y".
                input="y\n" * 20,
            )
            if result.returncode != 0:
                print(f"pigar failed: exit code {result.returncode}")

            req_file = workdir / "requirements.txt"
            if req_file.exists():
                for line in req_file.read_text().splitlines():
                    line = line.strip()
                    # Skip blanks, comments and pip option lines (e.g. "-e").
                    if not line or line.startswith(('#', '-')):
                        continue
                    # Cut at the first version specifier, extras bracket,
                    # environment marker or whitespace to keep the bare name.
                    pkg = re.split(r'[=<>!~;\[\s]', line, maxsplit=1)[0]
                    if pkg:
                        deps.add(pkg)
        except (OSError, subprocess.SubprocessError, UnicodeError) as e:
            # E.g. pigar is not installed or its output cannot be read.
            print(f"pigar failed: {e}")

    return deps


# parent = 'C:\\Users\\b27jin\\Documents\\mle-bench-internal\\fetch\\competitions'
parent = '/home/b27jin/mle-bench-internal/fetch/competitions'

# entity[competition][html file name] -> metadata scraped from the page in
# meta_html plus the dependencies detected in the matching page in html.
entity = {}
for competi in tqdm(get_folders(parent), desc="Competitions"):
# for competi in dev:
    entity[competi] = {}
    for file in tqdm(glob.glob(os.path.join(parent, competi, 'meta_html','*_C1.html')), desc=competi, leave=False):
        # os.path.basename copes with both "/" and "\\" separators, unlike
        # splitting the path on "/".
        fname = os.path.basename(file)
        record = entity[competi][fname] = {}

        with open(file, "r", encoding="utf-8") as fp:
            content = fp.read()

        # Submission Year
        pattern = r'<span [^>]*title="([A-Z][a-z]{2} [A-Z][a-z]{2} \d{2} \d{4} \d{2}:\d{2}:\d{2} GMT[+-]\d{4} \([^"]+\))"'
        dates = re.findall(pattern, content, re.DOTALL)
        if not dates:
            raise ValueError(f"No submission timestamp found in {file}")
        # Drop the "(zone name)" suffix and the "GMT" prefix so that the
        # offset can be parsed with %z.
        date_str = dates[0].split(" (")[0].replace("GMT", "")
        dt = datetime.datetime.strptime(date_str, "%a %b %d %Y %H:%M:%S %z")

        # year/month/date are taken in the page's own UTC offset, while
        # 'datetime' is converted to UTC.
        record['year'] = dt.year
        record['month'] = dt.month
        record['date'] = dt.day
        record['datetime'] = dt.astimezone(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')

        # Find Private Score
        pattern = r'<p class="sc-gQaihK[^"]*">\s*([-\d.]+)\s*</p>'
        matches = re.findall(pattern, content, re.DOTALL)
        # has P.Score
        if matches:
            record['ps'] = matches[0]

        # Extract all time blocks (non-greedy match with DOTALL); the last
        # block that parses as a duration wins.
        pattern = r'<p\s+class="sc-gQaihK\s+(?:sc-bHbnRu|sc-hKjFaw)\s+bwaGMg\s+(?:hAkjhA|jGRPCU)">(.*?)</p>'
        for tag in re.findall(pattern, content, re.DOTALL):
            seconds = time_to_seconds(tag)
            if seconds is not None:
                record['runtime'] = seconds

        # Extract dependencies
        path = os.path.join(parent, competi, 'html', fname)
        # read_html_content appends a seed cell plus one cell per code block.
        nb = read_html_content(path, new_notebook())

        # More than the seed cell means code was extracted; otherwise the
        # entry is flagged with 'R'.
        if len(nb.cells) != 1:
            file_deps = get_imports_from_file(nb)
            record['api'] = list(file_deps)
        else:
            record['R'] = 1

        pattern = r'<p\s+class="sc-gQaihK sc-dyfHgC bwaGMg igmQhu">\s*(.*?)\s*</p>'
        matches = re.findall(pattern, content, flags=re.DOTALL)
        record['datasets'] = list(set(matches))

        # Debug limiter: stop after the first file of the competition.
        break
    # Debug limiter: stop after the first competition.
    break
entity
# Save the entity dictionary into a JSON file.
# with open("kernel.json", "w", encoding="utf-8") as json_file:
#     json.dump(entity, json_file, indent=4, ensure_ascii=False)

Competitions:   0%|          | 0/79 [00:23<?, ?it/s]

[]
True





{'cassava-leaf-disease-classification': {'aaroswings_ensemble-inference-notebook_v34_C1.html': {'year': 2021,
   'month': 2,
   'date': 6,
   'datetime': '2021-02-07T04:00:45.000000Z',
   'runtime': 32,
   'api': ['numpy',
    'albumentations',
    'opencv-python-headless',
    'opencv-python',
    'torch',
    'pillow',
    'pandas'],
   'datasets': ['Cassava Leaf Disease Classification', 'private-dataset']}}}

In [6]:
# Count every meta page and, separately, the ones whose name ends in _C1.
tot = 0
tot_c1 = 0
for competi in tqdm(get_folders(parent)):
    meta_dir = os.path.join(parent, competi, 'meta_html')
    all_pages = glob.glob(os.path.join(meta_dir, '*.html'))
    c1_pages = glob.glob(os.path.join(meta_dir, '*_C1.html'))
    tot += len(all_pages)
    tot_c1 += len(c1_pages)
print(f'Total files in all competitions: {tot}')
print(f'Total runnable _C1 files in all competitions: {tot_c1}')

  0%|          | 0/79 [00:00<?, ?it/s]

100%|██████████| 79/79 [00:00<00:00, 171.83it/s]

Total files in all competitions: 171075
Total runnable _C1 files in all competitions: 151236





In [56]:
# Print the number of *_C1.html meta pages for each competition folder.
for competi in tqdm(get_folders(parent)):
    c1_pattern = os.path.join(parent, competi, 'meta_html', '*_C1.html')
    c1_pages = glob.glob(c1_pattern)
    print(competi, len(c1_pages))

100%|██████████| 79/79 [00:00<00:00, 386.29it/s]

cassava-leaf-disease-classification 9376
tabular-playground-series-may-2022 1350
tweet-sentiment-extraction 5473
freesound-audio-tagging-2019 837
cdiscount-image-classification-challenge 379
text-normalization-challenge-english-language 261
inaturalist-2019-fgvc6 90
herbarium-2021-fgvc8 143
random-acts-of-pizza 48
aerial-cactus-identification 2679
leaf-classification 1482
movie-review-sentiment-analysis-kernels-only 1311
dog-breed-identification 2418
plant-pathology-2021-fgvc8 3509
histopathologic-cancer-detection 2658
champs-scalar-coupling 1863
tabular-playground-series-dec-2021 2081
osic-pulmonary-fibrosis-progression 5546
ventilator-pressure-prediction 2254
jigsaw-unintended-bias-in-toxicity-classification 3439
chaii-hindi-and-tamil-question-answering 1599
denoising-dirty-documents 441
playground-series-s3e18 1321
imet-2020-fgvc7 195
bms-molecular-translation 1105
smartphone-decimeter-2022 239
facebook-recruiting-iii-keyword-extraction 74
iwildcam-2019-fgvc6 226
tensorflow-speech-r




In [None]:
import requests
import json
import datetime
from pathlib import Path
import time

# For every dependency of a script, find the newest release published on or
# before the script's submission date, using the libraries.io API.
with open("/home/b27jin/config.json", "r") as f:
    config = json.load(f)
pypi_key = config["pypi"]

with open("/home/b27jin/CodeModernization/kernel.json", "r", encoding="utf-8") as f:
    kernel_content = json.load(f)

results = []
session = requests.Session()
base_url = "https://libraries.io/api/pypi/{pkg}"

for script_name in valid_scripts:
    # Script names look like "<competition>_<submission id>.ipynb".
    compt = script_name.split("_")[0]
    submission_id = "_".join(Path(script_name).stem.split("_")[1:]) + ".html"

    script_meta = kernel_content[compt][submission_id]
    # NOTE(review): this is a naive datetime built from the stored
    # year/month/date fields and compared with naive publication times
    # below - confirm both refer to the same time zone.
    submission_date = datetime.datetime(script_meta["year"], script_meta["month"], script_meta["date"])

    apis = script_meta.get("api") or []
    print(apis)
    for api in apis:
        url = base_url.format(pkg=api)
        try:
            resp = session.get(url, params={"api_key": pypi_key, "per_page": 100}, timeout=15)
            resp.raise_for_status()
            api_meta = resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Fetch fail {api}: {e}")
            # Keep pacing requests after a failure as well.
            time.sleep(0.2)
            continue

        closest_v = None
        closest_dt = None
        if api_meta and api_meta.get("status") != "Removed":
            for version in api_meta.get("versions", []):
                published_at = version.get('published_at')
                if not published_at:
                    # Release without a publication time: cannot be ranked.
                    continue
                try:
                    pub_date = datetime.datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%S.%fZ') #'2008-04-25T16:22:32.000Z',
                except ValueError:
                    # Timestamp in an unexpected format; skip this release.
                    continue

                # Keep the latest release that is not newer than the submission.
                if pub_date <= submission_date and (closest_dt is None or pub_date > closest_dt):
                    closest_dt = pub_date
                    closest_v = version['number']

            if closest_dt:
                results.append((script_name, api, closest_v, closest_dt.strftime("%Y-%m-%d"), submission_date.strftime("%Y-%m-%d")))
                print(f"{script_name} | {api} | {closest_v} | {closest_dt.date()} <= {submission_date.date()}")
            else:
                print(f"{script_name} | {api} | no version <= submission_date")

        time.sleep(0.2)
    # Debug limiter: only the first script is processed.
    break