In [1]:
import argparse
import csv
import difflib
import logging
import itertools
import json
import re
import statistics
import tqdm

from dataclasses import dataclass
from pathlib import Path
import numpy as np
from typing import Callable, Iterator, Mapping, Sequence, Tuple

In [2]:
FLOAT_RE = '(([0-9]+[.])?[0-9]+)'

In [3]:
@dataclass
class SubmissionVerdict:
    source_code: str
    verdict_file: Path
    status: str
    cpu_time: float
    memory: float

    @staticmethod
    def stricly_better_time(a: "SubmissionVerdict", b: "SubmissionVerdict", min_improvement: float=1.2) -> float:
        is_better = a.cpu_time*min_improvement < b.cpu_time and a.memory <= b.memory
        if is_better:
            return b.cpu_time / a.cpu_time
        return 0

    @staticmethod
    def stricly_better_memory(a: "SubmissionVerdict", b: "SubmissionVerdict", min_improvement: float=1.2) -> float:
        is_better = a.memory*min_improvement < b.memory and a.cpu_time <= b.cpu_time
        if is_better:
            return b.memory / a.memory
        return 0

@dataclass
class AllSubmissionVerdict:
    original: SubmissionVerdict
    improved: SubmissionVerdict
    predicted: Sequence[SubmissionVerdict]

In [4]:
def safe_div(numerator, denominator):
    if denominator != 0:
        return numerator/denominator
    return 0

def read_source_code(verdict_file: Path):
    source_code_filename = verdict_file.name[:-13]
    
    glob_result = list(verdict_file.parent.glob(source_code_filename+'.*c*'))
    if glob_result:
        assert len(glob_result) == 1
        with glob_result[0].open() as f:
            return f.read()
    else:
        raise FileNotFoundError(f'{verdict_file} does not have a corresponding source code file')


def read_verdict_file(verdict_file: Path, partition: str):
    source_code = read_source_code(verdict_file)

    if not verdict_file.exists():
        raise ValueError(f'Verdict file {verdict_file} does not exists')

    with verdict_file.open() as f:
        verdict = json.load(f)

    if 'cpu_time' in verdict and verdict['cpu_time'] is not None:
        if type(verdict['cpu_time']) == int or type(verdict['cpu_time']) == float:
            cpu_time = verdict['cpu_time']
        else:
            cpu_time_re = re.search(FLOAT_RE, verdict['cpu_time'])
            if cpu_time_re is None:
                raise ValueError(f'CPU time for {verdict_file} is None')
            else:
                cpu_time = float(cpu_time_re.groups()[0])
    elif 'exec_time_ms' in verdict and verdict['exec_time_ms'] is not None:
        cpu_time = verdict['exec_time_ms']
    else:
        raise ValueError(f'CPU time for {verdict_file} is None')

    if 'memory' in verdict and verdict['memory'] is not None:
        if type(verdict['memory']) == int or type(verdict['memory']) == float:
            memory = verdict['memory']
        else:
            memory_re = re.search(FLOAT_RE, verdict['memory'])
            if memory_re is None:
                raise ValueError(f'Memory consumption for {verdict_file} is None')
            else:
                memory = float(memory_re.groups()[0])
    elif 'memory_usage_kb' in verdict and verdict['memory_usage_kb'] is not None:
        memory = verdict['memory_usage_kb']
    else:
        raise ValueError(f'Memory consumption for {verdict_file} is None')
    
    if partition == 'codeforces':
        # Codeforces reports time in a interval, meaning that 0-15 ms are all in the same intervall.
        cpu_time = cpu_time // 16 + 1
    
    # Set both values to 1 if they are zero to avoid division by 0.
    if cpu_time == 0:
        cpu_time = 1
    if memory == 0:
        memory = 1
    

    return SubmissionVerdict(source_code, verdict_file.resolve(), verdict['status'], cpu_time, memory)


def get_verdicts(submission_dir: Path, partition: str):
    original_verdict_file = submission_dir / 'original_verdict.json'
    improved_verdict_file = submission_dir / 'improved_verdict.json'

    original_verdict = read_verdict_file(original_verdict_file, partition)
    improved_verdict = read_verdict_file(improved_verdict_file, partition)

    predicted_verdicts = []

    for predicted_verdict_file in submission_dir.glob('*_verdict.json'):
        if predicted_verdict_file == original_verdict_file:
            continue

        if predicted_verdict_file == improved_verdict_file:
            continue

        predicted_verdicts.append(read_verdict_file(predicted_verdict_file, partition))

    return AllSubmissionVerdict(
        original_verdict,
        improved_verdict,
        predicted_verdicts
    )

def get_max_improvement(
    submission_verdicts: AllSubmissionVerdict,
    compare_fn: Callable[[SubmissionVerdict, SubmissionVerdict], Tuple[float, float]],
    string_sim_threshold: float=0.0):
    max_performance_improvement = 0
    max_improvement_string_sim = -1
    for predicted_submission in submission_verdicts.predicted:
        if predicted_submission.status not in ('Accepted', 'STATE_ACCEPTED'):
            continue

        if ''.join(submission_verdicts.original.source_code.split()) == ''.join(predicted_submission.source_code.split()):
            continue
        performance_improvement = compare_fn(predicted_submission, submission_verdicts.original)
        string_sim = difflib.SequenceMatcher(None, submission_verdicts.original.source_code, predicted_submission.source_code).ratio()
        if performance_improvement > max_performance_improvement and string_sim > string_sim_threshold:
            max_performance_improvement = performance_improvement
            max_improvement_string_sim = string_sim
    return max_performance_improvement, max_improvement_string_sim


def string_sim_stats_str(string_sims: Sequence[float]):
    if string_sims:
        stats_str = (f'min={min(string_sims):.4f}, '
                     f'max={max(string_sims):.4f}, '
                     f'mean={statistics.mean(string_sims):.4f}, '
                     f'median={statistics.median(string_sims):.4f}, '
                     f'std={statistics.stdev(string_sims):.4f}, '
                     f'var={statistics.variance(string_sims):.4f}')
    else:
        stats_str = 'No string sims'
    return stats_str


def read_codenet_csv(codenet_problem_csv: Path) -> Mapping[str, str]:
    problem_name_to_dataset = {}
    with codenet_problem_csv.open() as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader)
        for line in csv_reader:
            problem_name_to_dataset[line[0]] = line[2]
    return problem_name_to_dataset


def get_submission_dir_generator(
        root_dirs: Sequence[Path], partitions: Sequence[str], problem_name_to_dataset: Mapping[str, str]
    ) -> Iterator[Path]:
    generators = []
    for root_dir, partition in zip(root_dirs, partitions):
        if partition == 'codeforces':
            codeforces_dir = root_dir / 'Codeforces'
            generators.append((
                (v.parent, partition)
                for v in codeforces_dir.rglob('original_verdict.json')
            ))
        elif partition == 'atcoder':
            codenet_dir = root_dir / 'Codenet'
            generators.append((
                (v.parent, partition)
                for v in codenet_dir.rglob('original_verdict.json')
                if problem_name_to_dataset[v.parent.parent.name] == 'AtCoder'
            ))
        else:
            codenet_dir = root_dir / 'Codenet'
            generators.append((
                (v.parent, partition)
                for v in codenet_dir.rglob('original_verdict.json')
                if problem_name_to_dataset[v.parent.parent.name] == 'AIZU'
            ))  
    return itertools.chain(*generators)


def read_all_verdicts(all_submission_dir_generator: Iterator[Tuple[Path, str]]):
    all_submission_dir = list(all_submission_dir_generator)
    all_predicted_submission_verdicts = []
    num_invalid_verdict_dir = 0
    for submission_dir, partition in tqdm.tqdm(all_submission_dir, desc='Process all submissions'):
        try:
            all_predicted_submission_verdicts.append(get_verdicts(submission_dir, partition))
        except UnicodeDecodeError as e:
            logging.warning(f'Caught UnicodeDecodeError when processing {submission_dir}: {e}')
            num_invalid_verdict_dir += 1
            continue
        except ValueError as e:
            logging.warning(f'Caught ValueError when processing {submission_dir}: {e}')
            num_invalid_verdict_dir += 1
            continue
    return all_predicted_submission_verdicts, num_invalid_verdict_dir


def compute_performance_statistics(all_submission_dir_generator: Iterator[Tuple[Path, str]], partition: str, print_to_console: bool=True):
    strictly_time_improvements = []
    strictly_time_improvements_string_sims = []
    strictly_memory_improvements = []
    strictly_memory_improvements_string_sims = []

    num_predicted_submissions = 0
    num_not_accepted_original = 0
    
    all_predicted_submission_verdicts, num_invalid_verdict_dir = read_all_verdicts(all_submission_dir_generator)
    for submission_verdicts in all_predicted_submission_verdicts:
        
        num_predicted_submissions += len(submission_verdicts.predicted)

        if submission_verdicts.original.status not in ('Accepted', 'STATE_ACCEPTED'):
            if submission_verdicts.original.status.startswith(('STATE_TIMELIMIT', 'STATE_MEMORYLIMIT', 'Time limit exceeded', 'Memory limit exceeded')):
                logging.warning(f'Verdict for the original submission of {submission_verdicts.original.verdict_file.parent} is exceed limits.')
                strictly_time_improvements.append(0)
                strictly_time_improvements_string_sims.append(-1)
                strictly_memory_improvements.append(0)
                strictly_memory_improvements_string_sims.append(-1)
            else:
                logging.warning(f'Verdict for the original submission of {submission_verdicts.original.verdict_file.parent} is not accept.')
            num_not_accepted_original += 1
            continue

        if not submission_verdicts.predicted:
            logging.info(f'No predictions for {submission_verdicts.original.verdict_file.parent}.')
            strictly_time_improvements.append(0)
            strictly_time_improvements_string_sims.append(-1)
            strictly_memory_improvements.append(0)
            strictly_memory_improvements_string_sims.append(-1)
            continue

        max_strictly_time_improvement, string_sim = get_max_improvement(
            submission_verdicts, SubmissionVerdict.stricly_better_time)
        strictly_time_improvements.append(max_strictly_time_improvement)
        strictly_time_improvements_string_sims.append(string_sim)

        max_strictly_memory_improvement, string_sim = get_max_improvement(
            submission_verdicts, SubmissionVerdict.stricly_better_memory)
        strictly_memory_improvements.append(max_strictly_memory_improvement)
        strictly_memory_improvements_string_sims.append(string_sim)
        
    if partition == 'codeforces':
        missing_values = 300 - len(strictly_time_improvements)
        for _ in range(missing_values):
            logging.info(f'For codeforces, we add dummy value to compensate for the fact that we have all kinds of errors, but we force the denominator to be of size 300.')
            strictly_time_improvements.append(0)
            strictly_time_improvements_string_sims.append(-1)
            strictly_memory_improvements.append(0)
            strictly_memory_improvements_string_sims.append(-1)
    elif partition == 'aizu':
        missing_values = 259 - len(strictly_time_improvements)
        for _ in range(missing_values):
            logging.info(f'For AIZU, we add dummy value to compensate for the fact that we have all kinds of errors, but we force the denominator to be of size 259.')
            strictly_time_improvements.append(0)
            strictly_time_improvements_string_sims.append(-1)
            strictly_memory_improvements.append(0)
    elif partition == 'all':
        missing_values = 559 - len(strictly_time_improvements)
        for _ in range(missing_values):
            logging.info(f'For all samples, we add dummy value to compensate for the fact that we have all kinds of errors, but we force the denominator to be of size 559.')
            strictly_time_improvements.append(0)
            strictly_time_improvements_string_sims.append(-1)
            strictly_memory_improvements.append(0)
            strictly_memory_improvements_string_sims.append(-1)

    if print_to_console:
        print(f'{num_not_accepted_original} of the original submissions are not accepted.')
        print(f'Caught {num_invalid_verdict_dir} exceptions while processing the result')

        try:
            optimized_strictly_time_improvements, optimized_strictly_time_improvement_string_sims = zip(*filter(
                lambda x: x[0] > 0 and x[1] >=0.8, zip(strictly_time_improvements, strictly_time_improvements_string_sims)))
        except ValueError:
            optimized_strictly_time_improvements, optimized_strictly_time_improvement_string_sims = [], []
        try:
            optimized_strictly_memory_improvements, optimized_strictly_memory_improvement_string_sims = zip(*filter(
                lambda x: x[0] > 0 and x[1] >=0.8, zip(strictly_memory_improvements, strictly_memory_improvements_string_sims)))
        except ValueError:
            optimized_strictly_memory_improvements, optimized_strictly_memory_improvement_string_sims = [], []
            
        print(f'num strictly_time_improvements: {len(strictly_time_improvements)}')
        print(f'num optimized_strictly_time_improvements: {len(optimized_strictly_time_improvements)}')
        print(f'Strictly time improvement: %OPT={safe_div(len(optimized_strictly_time_improvements)*100, len(strictly_time_improvements))}, '
              f'PI={safe_div(sum(optimized_strictly_time_improvements), len(optimized_strictly_time_improvements))}, '
              f'UR={safe_div(sum([1 - 1/i for i in optimized_strictly_time_improvements])*100, len(optimized_strictly_time_improvements))}')
        print(f'num strictly_memory_improvements: {len(strictly_memory_improvements)}')
        print(f'num optimized_strictly_memory_improvements: {len(optimized_strictly_memory_improvements)}')
        print(f'Strictly memory improvement: %OPT={safe_div(len(optimized_strictly_memory_improvements)*100, len(strictly_memory_improvements))}, '
              f'PI={safe_div(sum(optimized_strictly_memory_improvements), len(optimized_strictly_memory_improvements))}, '
              f'UR={safe_div(sum([1 - 1/i for i in optimized_strictly_memory_improvements])*100, len(optimized_strictly_memory_improvements))}')
        print()

        print(f'Strictly time improvement, string similarity: {string_sim_stats_str(optimized_strictly_time_improvement_string_sims)}')
        print(f'Strictly memory improvement, string similarity: {string_sim_stats_str(optimized_strictly_memory_improvement_string_sims)}')
        print()

    return (
        strictly_time_improvements, strictly_time_improvements_string_sims,
        strictly_memory_improvements, strictly_memory_improvements_string_sims
    )
        
        
def compute_verdict_statistics(all_submission_dir_generator: Iterator[Tuple[Path, str]]):
    num_dir = 0
    num_total_valid_diffs = 0
    num_invalid_of_exception = 0
    num_discard_of_orig_not_accept = 0
    num_compile_errors = 0
    num_wrong_answer = 0
    num_time_limit_exceeded = 0
    num_memory_limit_exceeded = 0
    num_runtime_error = 0
    num_accepted = 0
    num_other = 0
    num_improved = 0
    for submission_dir, partition in tqdm.tqdm(all_submission_dir_generator):
        num_dir += 1
        num_valid_diffs = len(list(submission_dir.glob('*verdict.json'))) - 2
        num_total_valid_diffs += num_valid_diffs
        
        try:
            submission_verdicts = get_verdicts(submission_dir, partition)
        except UnicodeDecodeError as e:
            num_invalid_of_exception += num_valid_diffs
            continue
        except ValueError as e:
            num_invalid_of_exception += num_valid_diffs
            continue
                                                                      
        if submission_verdicts.original.status not in ('Accepted', 'STATE_ACCEPTED'):
            num_discard_of_orig_not_accept += num_valid_diffs                                             
            continue
                                                                      
        assert num_valid_diffs == len(submission_verdicts.predicted)
                              
        for predicted_submission in submission_verdicts.predicted:
            if predicted_submission.status in ('STATE_COMPILEERROR', 'Compilation error'):
                num_compile_errors += 1
            elif predicted_submission.status.startswith('Wrong answer') or predicted_submission.status == 'STATE_WRONGANSWER':
                num_wrong_answer += 1
            elif predicted_submission.status.startswith('Time limit') or predicted_submission.status == 'STATE_TIMELIMIT':
                num_time_limit_exceeded += 1
            elif predicted_submission.status.startswith('Memory limit') or predicted_submission.status == 'STATE_MEMORYLIMIT':
                num_memory_limit_exceeded += 1
            elif predicted_submission.status.startswith('Runtime error') or predicted_submission.status == 'STATE_RUNTIMEERROR':
                num_runtime_error += 1
            elif predicted_submission.status in ('Accepted', 'STATE_ACCEPTED'):
                num_accepted += 1
                time_improved = SubmissionVerdict.stricly_better_time(predicted_submission, submission_verdicts.original) > 0
                memory_improved = SubmissionVerdict.stricly_better_memory(predicted_submission, submission_verdicts.original) > 0
                if time_improved or memory_improved:
                    num_improved += 1
            else:
                num_other += 1
                              
    num_predicted_predictions = num_dir*10
    print(f'num_predicted_predictions: {num_predicted_predictions}')
    print(f'num_malformed_diffs: {num_predicted_predictions-num_total_valid_diffs}')

    print(f'num_invalid_of_exception: {num_invalid_of_exception}')
    print(f'num_discard_of_orig_not_accept: {num_discard_of_orig_not_accept}')
    print(f'num_compile_errors: {num_compile_errors}')
    print(f'num_wrong_answer: {num_wrong_answer}')
    print(f'num_time_limit_exceeded: {num_time_limit_exceeded}')
    print(f'num_memory_limit_exceeded: {num_memory_limit_exceeded}')
    print(f'num_runtime_error: {num_runtime_error}')
    print(f'num_other: {num_other}')
    print(f'num_accepted: {num_accepted}')
    print(f'num_improved: {num_improved}')
    
    
def compute_all_string_sims(all_submission_dir_generator: Iterator[Tuple[Path, str]]):
    all_predicted_submission_verdicts, _ = read_all_verdicts(all_submission_dir_generator)
    all_string_sims = []
    for submission_verdicts in all_predicted_submission_verdicts:
        for predicted_submission in submission_verdicts.predicted:
            all_string_sims.append(difflib.SequenceMatcher(None, submission_verdicts.original.source_code, predicted_submission.source_code).ratio())
            
    return all_string_sims

In [45]:
# ChatGPT codeforces performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/submissions-cf-gpt35/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions: 100%|█| 300/300 [00:02<00:00


Evaluated predictions for 299 samples.
8 of the original submissions are not accepted.
Caught 1 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 36
Strictly time improvement: %OPT=12.0, PI=3.7994376794800195, UR=50.79273147604937
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 11
Strictly memory improvement: %OPT=3.6666666666666665, PI=11.573488983870737, UR=53.86716559991667

Strictly time improvement, string similarity: min=0.8055, max=0.9968, mean=0.9212, median=0.9255, std=0.0582, var=0.0034
Strictly memory improvement, string similarity: min=0.8095, max=0.9780, mean=0.9012, median=0.8863, std=0.0576, var=0.0033



In [46]:
# ChatGPT AIZU performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/submissions-aizu-gpt35/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions: 100%|█| 259/259 [00:01<00:00


Evaluated predictions for 257 samples.
41 of the original submissions are not accepted.
Caught 2 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 12
Strictly time improvement: %OPT=4.633204633204633, PI=6.871372549019607, UR=69.38061266286498
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 5
Strictly memory improvement: %OPT=1.9305019305019304, PI=3.694206542531486, UR=49.523204876270356

Strictly time improvement, string similarity: min=0.8293, max=0.9711, mean=0.8973, median=0.8896, std=0.0515, var=0.0026
Strictly memory improvement, string similarity: min=0.9095, max=0.9786, mean=0.9498, median=0.9536, std=0.0289, var=0.0008



In [5]:
# ChatGPT-CoT codeforces performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/submissions-cf-cot-gpt35/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions: 100%|██████████| 300/300 [00:00<00:00, 403.85it/s]


8 of the original submissions are not accepted.
Caught 3 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 2
Strictly time improvement: %OPT=0.6666666666666666, PI=2.8583333333333334, UR=59.4562647754137
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 0
Strictly memory improvement: %OPT=0.0, PI=0, UR=0

Strictly time improvement, string similarity: min=0.8000, max=0.8135, mean=0.8068, median=0.8068, std=0.0096, var=0.0001
Strictly memory improvement, string similarity: No string sims



In [6]:
# ChatGPT-CoT AIZU performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/submissions-aizu-cot-gpt35/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions: 100%|██████████| 259/259 [00:00<00:00, 446.56it/s]


43 of the original submissions are not accepted.
Caught 0 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 2
Strictly time improvement: %OPT=0.7722007722007722, PI=3.108974358974359, UR=52.26377952755905
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 2
Strictly memory improvement: %OPT=0.7722007722007722, PI=1.427175527175527, UR=28.154178534317133

Strictly time improvement, string similarity: min=0.8093, max=0.9833, mean=0.8963, median=0.8963, std=0.1230, var=0.0151
Strictly memory improvement, string similarity: min=0.8184, max=0.8757, mean=0.8471, median=0.8471, std=0.0405, var=0.0016



In [6]:
# GPT4-CoT codeforces performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/submissions-cf-cot-gpt4/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions: 100%|██████████| 300/300 [00:00<00:00, 412.34it/s]


8 of the original submissions are not accepted.
Caught 1 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 23
Strictly time improvement: %OPT=7.666666666666667, PI=4.458739470145331, UR=50.52558958841513
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 16
Strictly memory improvement: %OPT=5.333333333333333, PI=83.06844672543382, UR=52.91748015604514

Strictly time improvement, string similarity: min=0.8006, max=0.9891, mean=0.8915, median=0.8834, std=0.0547, var=0.0030
Strictly memory improvement, string similarity: min=0.8006, max=0.9891, mean=0.9061, median=0.9029, std=0.0546, var=0.0030



In [7]:
# GPT4-CoT AIZU performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/submissions-aizu-cot-gpt4/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions:   0%|          | 0/259 [00:00<?, ?it/s]

Process all submissions: 100%|██████████| 259/259 [00:00<00:00, 412.36it/s]


43 of the original submissions are not accepted.
Caught 0 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 8
Strictly time improvement: %OPT=3.088803088803089, PI=3.1746482683982684, UR=55.142915466544636
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 2
Strictly memory improvement: %OPT=0.7722007722007722, PI=3.787010170618549, UR=61.93262247495686

Strictly time improvement, string similarity: min=0.8016, max=0.9775, mean=0.8945, median=0.8928, std=0.0638, var=0.0041
Strictly memory improvement, string similarity: min=0.9290, max=0.9612, mean=0.9451, median=0.9451, std=0.0228, var=0.0005



In [6]:
# GPT4 Codeforces performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/submissions-cf-gpt4/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions:   0%|          | 0/300 [00:00<?, ?it/s]

Process all submissions: 100%|██████████| 300/300 [00:00<00:00, 410.30it/s]


3 of the original submissions are not accepted.
Caught 2 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 12
Strictly time improvement: %OPT=4.0, PI=2.7324486378655357, UR=52.58290748595377
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 4
Strictly memory improvement: %OPT=1.3333333333333333, PI=1.5269846995570397, UR=31.30867586746482

Strictly time improvement, string similarity: min=0.8085, max=0.9703, mean=0.8941, median=0.9110, std=0.0670, var=0.0045
Strictly memory improvement, string similarity: min=0.8123, max=0.9242, mean=0.8535, median=0.8387, std=0.0510, var=0.0026



In [50]:
# GPT4 AIZU performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/submissions-aizu-gpt4/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions: 100%|█| 259/259 [00:01<00:00


Evaluated predictions for 256 samples.
41 of the original submissions are not accepted.
Caught 3 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 4
Strictly time improvement: %OPT=1.5444015444015444, PI=4.875, UR=73.88888888888889
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 2
Strictly memory improvement: %OPT=0.7722007722007722, PI=2.4464504652892547, UR=44.92388001864061

Strictly time improvement, string similarity: min=0.8670, max=0.9756, mean=0.9066, median=0.8918, std=0.0514, var=0.0026
Strictly memory improvement, string similarity: min=0.8400, max=0.8696, mean=0.8548, median=0.8548, std=0.0209, var=0.0004



In [51]:
# Supersonic codeforces performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/supersonic_predicted_submissions/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions: 100%|█| 300/300 [00:20<00:00


Evaluated predictions for 299 samples.
8 of the original submissions are not accepted.
Caught 1 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 78
Strictly time improvement: %OPT=26.0, PI=2.652921006778977, UR=50.5671526382961
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 24
Strictly memory improvement: %OPT=8.0, PI=1.8105613804472274, UR=39.958871251369324

Strictly time improvement, string similarity: min=0.8862, max=0.9995, mean=0.9757, median=0.9875, std=0.0282, var=0.0008
Strictly memory improvement, string similarity: min=0.9497, max=0.9997, mean=0.9903, median=0.9926, std=0.0109, var=0.0001



In [52]:
# Supersonic AIZU performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/supersonic_predicted_submissions/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions: 100%|█| 259/259 [00:00<00:00


Evaluated predictions for 257 samples.
41 of the original submissions are not accepted.
Caught 2 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 9
Strictly time improvement: %OPT=3.474903474903475, PI=2.820871913580247, UR=48.641498705940215
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 3
Strictly memory improvement: %OPT=1.1583011583011582, PI=1.2277160929071986, UR=18.538494239034016

Strictly time improvement, string similarity: min=0.9307, max=0.9992, mean=0.9751, median=0.9854, std=0.0236, var=0.0006
Strictly memory improvement, string similarity: min=0.9450, max=0.9993, mean=0.9805, median=0.9972, std=0.0308, var=0.0009



In [10]:
# ChatGPT verdict statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces', 'aizu']
    root_dirs = root_dirs = [Path('../../predictions/submissions-cf-gpt35/'), Path('../../predictions/submissions-aizu-gpt35/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_verdict_statistics(all_submission_dir_generator)

0it [00:00, ?it/s]

num_predicted_predictions: 0
num_malformed_diffs: 0
num_invalid_of_exception: 0
num_discard_of_orig_not_accept: 0
num_compile_errors: 0
num_wrong_answer: 0
num_time_limit_exceeded: 0
num_memory_limit_exceeded: 0
num_runtime_error: 0
num_other: 0
num_accepted: 0
num_improved: 0





In [11]:
# Supersonic verdict statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces', 'aizu']
    root_dirs = [Path('../../predictions/supersonic_predicted_submissions/'), Path('../../predictions/supersonic_predicted_submissions/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_verdict_statistics(all_submission_dir_generator)

559it [00:02, 272.65it/s]

num_predicted_predictions: 5590
num_malformed_diffs: 2045
num_invalid_of_exception: 6
num_discard_of_orig_not_accept: 358
num_compile_errors: 634
num_wrong_answer: 669
num_time_limit_exceeded: 66
num_memory_limit_exceeded: 7
num_runtime_error: 109
num_other: 10
num_accepted: 1686
num_improved: 356





In [55]:
# Supersonic codeforces full output performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['codeforces']
    root_dirs = [Path('../../predictions/full_output_predicted_submissions/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'codeforces')

Process all submissions: 100%|█| 3


9 of the original submissions are not accepted.
Caught 1 exceptions while processing the result
num strictly_time_improvements: 300
num optimized_strictly_time_improvements: 33
Strictly time improvement: %OPT=11.0, PI=3.2791147031342187, UR=51.740950432675334
num strictly_memory_improvements: 300
num optimized_strictly_memory_improvements: 8
Strictly memory improvement: %OPT=2.6666666666666665, PI=1.5681203066885363, UR=34.234150988509384

Strictly time improvement, string similarity: min=0.8598, max=0.9989, mean=0.9751, median=0.9720, std=0.0267, var=0.0007
Strictly memory improvement, string similarity: min=0.9202, max=0.9992, mean=0.9853, median=0.9959, std=0.0268, var=0.0007



In [56]:
# Supersonic AIZU full output performance statistics
if __name__ == '__main__' and '__file__' not in globals():
    partitions = ['aizu']
    root_dirs = [Path('../../predictions/full_output_predicted_submissions/')]
    codenet_problem_csv = Path('../../data/problem_list.csv')

    problem_name_to_dataset = read_codenet_csv(codenet_problem_csv)

    all_submission_dir_generator = get_submission_dir_generator(root_dirs, partitions, problem_name_to_dataset)
    compute_performance_statistics(all_submission_dir_generator, 'aizu')

Process all submissions: 100%|█| 2


41 of the original submissions are not accepted.
Caught 2 exceptions while processing the result
num strictly_time_improvements: 259
num optimized_strictly_time_improvements: 4
Strictly time improvement: %OPT=1.5444015444015444, PI=3.2410714285714284, UR=40.36877394636015
num strictly_memory_improvements: 259
num optimized_strictly_memory_improvements: 0
Strictly memory improvement: %OPT=0.0, PI=0, UR=0

Strictly time improvement, string similarity: min=0.9644, max=0.9989, mean=0.9901, median=0.9986, std=0.0171, var=0.0003
Strictly memory improvement, string similarity: No string sims

