In [2]:
import gzip
import json
import os
import sys
from os import PathLike, path
from paramiko.client import SSHClient, AutoAddPolicy
from typing import List, Tuple, TypeAlias, Generator, Iterable
from huggingface_hub import list_datasets
from datasets import load_dataset, load_dataset_builder
from itertools import islice
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from functools import wraps, reduce
import shutil
from util import ssh_alvis, get_edit_distance_distribution_star, load_jsonl, get_samples, get_samples_greedy

Tokens = List[int]

In [6]:
from util import DATA_DIR, calculate_ratio, tokenize_code


# --- Experiment configuration ------------------------------------------------
# Compares two beam-search candidate files (their names differ only in the
# `temperature=0.8` setting) against a single greedy candidate file.
dataset = "gitbug-java"
model = "starcoder"
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")
filename1 = "candidates_GitBugJava_fill-in-the-middle_starcoder_generation_strategy=beam_search_num_beams=10_num_return_sequences=10.jsonl"
filename2 = "candidates_GitBugJava_fill-in-the-middle_starcoder_temperature=0.8_generation_strategy=beam_search_num_beams=10_num_return_sequences=10.jsonl"
greedyfilename = "candidates_GitBugJava_fill-in-the-middle_starcoder_temperature=0.0_n_samples=1_num_return_sequences=1.jsonl"

# --- Load the three result files ---------------------------------------------
# All files live under DATA_DIR/<model>/<dataset>/; the greedy run sits in
# "greedy" and both beam-search runs sit in "multiple".
_results_dir = os.path.join(DATA_DIR, model, dataset)
greedydata = load_jsonl(os.path.join(_results_dir, "greedy", greedyfilename))
data1 = load_jsonl(os.path.join(_results_dir, "multiple", filename1))
data2 = load_jsonl(os.path.join(_results_dir, "multiple", filename2))

# Extract the generated sample(s) from every loaded record.
greedycandidate = [get_samples_greedy(record) for record in greedydata]
candidates1 = [get_samples(record) for record in data1]
candidates2 = [get_samples(record) for record in data2]


def _peak_ratio(samples, reference_tokens):
    """Return the ratio for one set of candidates against the greedy reference.

    Each sample is tokenized with the notebook-level ``tokenizer`` (third
    argument 100), the tokenized samples and ``reference_tokens`` are handed to
    ``get_edit_distance_distribution_star``, and the resulting distribution is
    passed to ``calculate_ratio`` together with 5% of the second value that
    ``get_edit_distance_distribution_star`` returned.
    """
    sample_tokens = [tokenize_code(sample, tokenizer, 100) for sample in samples]
    distribution, ml = get_edit_distance_distribution_star(
        sample_tokens, reference_tokens
    )
    return calculate_ratio(distribution, 0.05 * ml)


# --- Per-entry comparison ----------------------------------------------------
# Entries are paired purely by position; zip() stops at the shortest list.
# NOTE(review): this presumes all three files list the same bugs in the same
# order — confirm against the data.
vals = []
for greedy_sample, beam_a, beam_b in zip(greedycandidate, candidates1, candidates2):
    # Skip entries where any of the three runs produced nothing.
    if not (beam_a and beam_b and greedy_sample):
        continue

    # NOTE(review): the greedy sample is tokenized with the same call as each
    # individual candidate, so this is presumably a single token sequence
    # rather than a list of sequences — confirm against tokenize_code.
    reference_tokens = tokenize_code(greedy_sample, tokenizer, 100)

    ratio_a = _peak_ratio(beam_a, reference_tokens)
    ratio_b = _peak_ratio(beam_b, reference_tokens)

    # Absolute gap between the two runs' ratios for this entry.
    vals.append(abs(ratio_a - ratio_b))

vals



[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]