In [None]:
import gzip
import hashlib
import json
import os
import shutil
import subprocess
import sys
from functools import wraps, reduce
from itertools import islice
from os import PathLike, path
from typing import List, Tuple, TypeAlias, Generator, Iterable

import pandas as pd
from datasets import load_dataset, load_dataset_builder
from huggingface_hub import list_datasets
from paramiko.client import SSHClient, AutoAddPolicy
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

from util import get_jobfiles_info, BASE_DIR

# Prompting-method identifiers. The one selected for a job is the METHOD field
# of its joblist entry.
FILL_IN_THE_MIDDLE = "fill-in-the-middle"
ZERO_SHOT_CLOZE = "zero-shot-cloze"
SIGNATURE_ONLY_INSTRUCT = "sigonly-instruct"

# Each job is a 5-tuple:
#   (dataset, method, patch strategy, candidate model name, temperature)
# The temperature is kept as a string.
joblist = [
    # 7b model data sigonly
    (
        benchmark,
        SIGNATURE_ONLY_INSTRUCT,
        "codellama-instruct",
        "meta-llama/CodeLlama-7b-Instruct-hf",
        "0.8",
    )
    for benchmark in ("HumanEvalJava", "GitBugJava", "Defects4J")
]
# 13b model data sigonly (disabled; append these tuples to joblist to enable)
# ("HumanEvalJava", SIGNATURE_ONLY_INSTRUCT, "codellama-infilling", "meta-llama/CodeLlama-13b-Instruct-hf", "0.8"),
# ("GitBugJava", SIGNATURE_ONLY_INSTRUCT, "codellama-infilling", "meta-llama/CodeLlama-13b-Instruct-hf", "0.8"),
# ("Defects4J", SIGNATURE_ONLY_INSTRUCT, "codellama-infilling", "meta-llama/CodeLlama-13b-Instruct-hf", "0.8"),

# Overview table: for every job, which of its data files already exist locally.
# The column labels must match, one-to-one, the fields appended per job below;
# pandas raises ValueError when the number of labels differs from the row width.
labels = [
    "Title",
    "Samples",
    "Samples Exist Locally",
    "Candidates Greedy Exist Locally",
    "Candidates Multiple Exist Locally",
]

tableitems = []
for job in joblist:
    DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE = job
    info = get_jobfiles_info(*job)
    tableitems.append(
        (
            f"{DATASET} {CANDIDATE_MODEL_NAME}",
            info.samples_file,
            info.samples_exists,
            info.candidates_greedy_exists,
            info.candidates_multiple_exists,
        )
    )

files_df = pd.DataFrame(tableitems, columns=labels)
files_df

Unnamed: 0,Title,samples,Samples Exist Locally,Candidates Greedy Exist Locally,Candidates Multiple Exist Locally,Samples Exist on Alvis,Candidates Greedy Exist on Alvis,Candidates Multiple Exist on Alvis
0,HumanEvalJava meta-llama/CodeLlama-7b-Instruct-hf,samples_HumanEvalJava_sigonly-instruct_.jsonl,True,False,False,True,True,False
1,GitBugJava meta-llama/CodeLlama-7b-Instruct-hf,samples_GitBugJava_sigonly-instruct_.jsonl,True,False,False,True,True,False
2,Defects4J meta-llama/CodeLlama-7b-Instruct-hf,samples_Defects4J_sigonly-instruct_.jsonl,True,False,False,True,False,False


In [None]:
# Generate the samples file for every job that does not have one locally yet.
for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
    info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
    if info.samples_exists:
        print(f"Samples file {info.samples_file} exists locally. Skipping generation.")
        continue

    # Pass the arguments as a list instead of a shell string, so dataset/method
    # values are never re-parsed by a shell (no word splitting or injection).
    cmd = ["python", "generate_samples.py", DATASET, METHOD]
    # Run cmd in BASE_DIR
    res = subprocess.run(cmd, cwd=BASE_DIR)
    if res.returncode != 0:
        # Report the failure but keep going, so the remaining jobs still run.
        print(f"Sample generation for {DATASET} {METHOD} failed with exit code {res.returncode}.")
    # shutil.move(path(), info.DATA_DATASET_DIR)
    # print(f"Generated samples for {DATASET} {METHOD} {SAMPLE_MODEL_NAME}. Moved to {info.DATA_DATASET_DIR}.")

INFO:root:Initializing GitBug-Java benchmark...
INFO:root:Found 199 bugs
Loading GitBug-Java: 100%|██████████| 199/199 [05:10<00:00,  1.56s/it]
INFO:root:Building the prompts...
  0%|          | 0/199 [00:00<?, ?it/s]ERROR:root:Error while generating sample for bug ezylang-EvalEx-7c39c5478a39: Traceback (most recent call last):
  File "/mnt/data/adahen/elle-elle-aime/generate_samples.py", line 65, in entry_point
    results.append(future.result())
  File "/usr/lib/python3.10/concurrent/futures/_base.py", line 451, in result
    return self.__get_result()
  File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
  File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/mnt/data/adahen/elle-elle-aime/generate_samples.py", line 23, in generate_sample
    return prompt_strategy_obj.prompt(bug)
  File "/mnt/data/adahen/elle-elle-aime/elleelleaime/sample/strategies/si

KeyboardInterrupt: 

  1%|          | 2/199 [00:54<1:38:07, 29.89s/it]

In [None]:
# NOTE(review): this entire cell is commented out, so executing it does nothing.
# The disabled code gzip-compresses each job's samples file and uploads it with
# `write_alvis`, a helper that is not defined or imported in the visible part of
# this notebook -- restore that helper before re-enabling the cell.
# # Move the samples to the server
# for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
#     info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
#     samples_file = info.samples_file
#     samples_exists_on_alvis = info.samples_exists_on_alvis
#     DATA_DATASET_DIR = info.DATA_DATASET_DIR

#     if samples_exists_on_alvis:
#         print(f"File '{samples_file}' already exists on alvis. Skipping.")
#         continue
    
#     samples_file_data_path = os.path.join(DATA_DATASET_DIR, samples_file)
#     with open(samples_file_data_path, "rb") as f:
#         # convert data to gzip
#         samples_data_gz = gzip.compress(f.read())

#     # send to alvis
#     replaced = write_alvis(f"{samples_file}.gz", samples_data_gz)
#     print(f"File '{samples_file}' was {'replaced' if replaced else 'created'}.")


File 'samples_HumanEvalJava_sigonly-instruct_.jsonl' was created.
File 'samples_GitBugJava_sigonly-instruct_.jsonl' was created.
File 'samples_Defects4J_sigonly-instruct_.jsonl' was created.


In [None]:
# Render the job script used to generate GREEDY patches for each job.
# NOTE: writing and submitting the script is currently disabled (see the
# "TODO: OLD" block at the end of the loop), so this cell only renders the
# script text and computes its file name.
BASH_TEMPLATE_GREEDY = "bash_template_greedy.sh"
JOBFILES_DIR = os.path.join(BASE_DIR, "jobfiles")

# NOTE(review): only the first three jobs are processed here, while the MULTIPLE
# cell iterates over the whole joblist -- presumably intentional; confirm before
# adding more entries to joblist.
for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist[:3]:
    info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
    template_values = {
        "dataset": DATASET,
        "method": METHOD,
        "patch_strategy": PATCH_STRATEGY,
        "candidate_model_name": CANDIDATE_MODEL_NAME,
    }

    if info.candidates_greedy_exists:
        print("File greedy already exists. Skipping.")
        continue
    if not info.samples_exists:
        print("Required file does not exist. Skipping.")
        continue

    with open(BASH_TEMPLATE_GREEDY, "r") as f:
        bash_template = f.read()

    # Substitute every <<key>> placeholder of the template with its value.
    bash_script_data = bash_template
    for key, value in template_values.items():
        bash_script_data = bash_script_data.replace(f"<<{key}>>", value)

    # Stable identifier for this run. The built-in hash() of strings is salted
    # per interpreter process, so it would yield a different file name after
    # every kernel restart; a SHA-256 digest of the values does not.
    run_key = json.dumps(template_values, sort_keys=True).encode("utf-8")
    hash_run = int.from_bytes(hashlib.sha256(run_key).digest()[:8], "big")
    script_greedy_file_name = f"job_greedy_{hash_run:x}.sh"

    # TODO: OLD
    # script_greedy_exists_on_alvis = bool(ssh_alvis([f"ls {script_greedy_file_name}"], base_path=JOBFILES_DIR))
    # # Write to alvis1
    # replaced = write_alvis(script_greedy_file_name, bash_script_data.encode(), base_path=ALVIS_JOBFILES_DIR, replace=True)
    # print(f"File {script_greedy_file_name} {'replaced' if replaced else 'created'}.")
    # # Execute the script on the remote server
    # ssh_alvis([f"sbatch {os.path.join('jobfiles', script_greedy_file_name)}"])
    # print(f"Job {script_greedy_file_name} submitted.")


File greedy already exists on alvis. Skipping.
File greedy already exists on alvis. Skipping.
File job_greedy_6c80107c353538c9.sh replaced.
Job job_greedy_6c80107c353538c9.sh submitted.


In [None]:
# Render the job script used to generate MULTIPLE patches for each job.
# NOTE: writing and submitting the script is currently disabled (see the
# "TODO: OLD" block at the end of the loop), so this cell only renders the
# script text and computes its file name.
BASH_TEMPLATE_MULTIPLE = "bash_template_multiple.sh"
ALVIS_JOBFILES_DIR = os.path.join(BASE_DIR, "jobfiles")

for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
    info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
    template_values = {
        "dataset": DATASET,
        "method": METHOD,
        "patch_strategy": PATCH_STRATEGY,
        "candidate_model_name": CANDIDATE_MODEL_NAME,
        "temperature": TEMPERATURE,
    }

    if info.candidates_multiple_exists:
        print("File multiple already exists. Skipping.")
        continue
    if not info.samples_exists:
        print("Required file does not exist. Skipping.")
        continue

    with open(BASH_TEMPLATE_MULTIPLE, "r") as f:
        bash_template = f.read()

    # Substitute every <<key>> placeholder of the template with its value.
    bash_script_data = bash_template
    for key, value in template_values.items():
        bash_script_data = bash_script_data.replace(f"<<{key}>>", value)

    # Stable identifier for this run. The built-in hash() of strings is salted
    # per interpreter process, so it would yield a different file name after
    # every kernel restart; a SHA-256 digest of the values does not.
    run_key = json.dumps(template_values, sort_keys=True).encode("utf-8")
    hash_run = int.from_bytes(hashlib.sha256(run_key).digest()[:8], "big")
    script_multiple_file_name = f"job_multiple_{hash_run:x}.sh"

    # TODO: OLD
    # script_multiple_exists_on_alvis = bool(ssh_alvis([f"ls {script_multiple_file_name}"], base_path=ALVIS_JOBFILES_DIR))
    # if not script_multiple_exists_on_alvis:
    #     # Write to alvis1
    #     replaced = write_alvis(script_multiple_file_name, bash_script_data.encode(), base_path=ALVIS_JOBFILES_DIR)
    #     print(f"File {script_multiple_file_name} {'replaced' if replaced else 'created'}.")
    # else:
    #     print(f"File {script_multiple_file_name} already exists on alvis. Skipping.")
    # # Execute the script on the remote server
    # ssh_alvis([f"sbatch {os.path.join('jobfiles', script_multiple_file_name)}"])
    # print(f"Job {script_multiple_file_name} submitted.")

File job_multiple_43f9cf4776f9e633.sh created.
Job job_multiple_43f9cf4776f9e633.sh submitted.
File job_multiple_40e00a519689f392.sh created.
Job job_multiple_40e00a519689f392.sh submitted.
File job_multiple_36c8dbd7ee6c049f.sh created.
Job job_multiple_36c8dbd7ee6c049f.sh submitted.


In [None]:
# NOTE(review): this entire cell is commented out, so executing it does nothing.
# The disabled code fetches "<samples_file>.gz" with `read_alvis` (a helper that
# is not defined or imported in the visible part of this notebook), decompresses
# it and writes the bytes to info.samples_data_dir_path. Despite the
# "Create the directory" comment, no directory is created before the write.
# # Read the samples from the server
# for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
#     info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
#     if not info.samples_exists_on_alvis:
#         print(f"File '{info.samples_file}' does not exist on alvis. Skipping.")
#         continue
#     if info.samples_exists:
#         print(f"File '{info.samples_file}' already exists in the data folder. Skipping.")
#         continue

#     # Read from alvis
#     data_gz = read_alvis(f"{info.samples_file}.gz")
#     data = gzip.decompress(data_gz)
#     # Create the directory if it does not exist
#     print("Writing to: ", info.samples_data_dir_path)
#     with open(info.samples_data_dir_path, "wb") as f:
#         # Write bytes to file
#         f.write(data)


File 'samples_HumanEvalJava_sigonly-instruct_.jsonl' already exists in the data folder. Skipping.
File 'samples_GitBugJava_sigonly-instruct_.jsonl' already exists in the data folder. Skipping.
File 'samples_Defects4J_sigonly-instruct_.jsonl' already exists in the data folder. Skipping.


In [None]:
# NOTE(review): this entire cell is commented out, so executing it does nothing.
# The disabled code fetches "<candidates_greedy_file>.gz" with `read_alvis` (a
# helper that is not defined or imported in the visible part of this notebook),
# decompresses it and writes the bytes to info.candidates_greedy_data_dir_path.
# Despite the "Create the directory" comment, no directory is created before
# the write.
# # Read the greedy candidates from the server
# for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
#     info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
#     if info.candidates_greedy_exists:
#         print(f"File already exists in the data folder. Skipping.")
#         continue
#     if not info.candidates_greedy_exists_on_alvis:
#         print(f"Required file does not exist on alvis. Skipping.")
#         continue

#     # Read from alvis
#     data_gz = read_alvis(f"{info.candidates_greedy_file}.gz")
#     data = gzip.decompress(data_gz)
#     # Create the directory if it does not exist
#     print("Writing to: ", info.candidates_greedy_data_dir_path)
#     with open(info.candidates_greedy_data_dir_path, "wb") as f:
#         # Write bytes to file
#         f.write(data)


Required file does not exist on alvis. Skipping.
Required file does not exist on alvis. Skipping.
Required file does not exist on alvis. Skipping.


In [None]:
# NOTE(review): this entire cell is commented out, so executing it does nothing.
# The disabled code fetches "<candidates_multiple_file>.gz" with `read_alvis` (a
# helper that is not defined or imported in the visible part of this notebook),
# decompresses it and writes the bytes to info.candidates_multiple_data_dir_path.
# Despite the "Create the directory" comment, no directory is created before
# the write.
# # Read the multiple candidates from the server
# for DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE in joblist:
#     info = get_jobfiles_info(DATASET, METHOD, PATCH_STRATEGY, CANDIDATE_MODEL_NAME, TEMPERATURE)
#     if info.candidates_multiple_exists:
#         print(f"File already exists in the data folder. Skipping.")
#         continue
#     if not info.candidates_multiple_exists_on_alvis:
#         print(f"Required file does not exist on alvis. Skipping.")
#         continue

#     # Read from alvis
#     data_gz = read_alvis(f"{info.candidates_multiple_file}.gz")
#     data = gzip.decompress(data_gz)
#     # Create the directory if it does not exist
#     print("Writing to: ", info.candidates_multiple_data_dir_path)
#     with open(info.candidates_multiple_data_dir_path, "wb") as f:
#         # Write bytes to file
#         f.write(data)


Required file does not exist on alvis. Skipping.
Required file does not exist on alvis. Skipping.
Required file does not exist on alvis. Skipping.
