## Dataset Preprocessing 

In [1]:
import pandas as pd
import json

def csv_to_jsonl_split(input_path: str, 
                       output_human_path: str, 
                       output_llm_path: str):
    '''
    
    '''
    ds = pd.read_csv(input_path)

    # colonna da tenere
    CODE_CONLUMN = "cleared_code"       # <-- la feature che vuoi salvare
    SPLIT = "LLM"        # <-- la colonna che decide dove va il sample
    target = "Human"              # <-- valore discriminante

    ds = pd.read_csv(input_path)

    # split dataset
    dataset_match = ds[ds[SPLIT] == target]
    dataset_other = ds[ds[SPLIT] != target]

    # salva human.jsonl
    with open(output_human_path, "w", encoding="utf-8") as f:
        for _, row in dataset_match.iterrows():
            f.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")

    # salva llm.jsonl
    with open(output_llm_path, "w", encoding="utf-8") as f:
        for _, row in dataset_other.iterrows():
            f.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")


In [2]:
INPUT_PATH = './Dataset/AIGCodeSet.csv'
HUMAN_PATH = "./Methods/Code_detection/results/AIGCodeSethuman.json"
LLM_PATH = "./Methods/Code_detection/results/AIGCodeSetllm.json"

In [3]:
csv_to_jsonl_split(input_path = INPUT_PATH, 
                    output_human_path = HUMAN_PATH, 
                    output_llm_path = LLM_PATH)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, json
import random, os
from Methods.Code_detection.utils_batch import InfillingModel
import tqdm

def run_fill_in_the_middle(
    input_path,                # es: "gpt4_python_codecontest.jsonl"
    output_path="./Methods/Code_detection/results/fim.josnl", 
    batch_size=20,
    mask_lines=1,
    model_name="facebook/incoder-6B",
    code_lable = 'cleared_code'
):
    """
    input_path: must be a .jsonl
    Esegue la perturbazione FIM su un dataset JSONL con campo 'code_lable'.
    Salva un nuovo JSONL con campo 'fill_in_middle_gold'.
    """
    
    # Config GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # half precision solo per incoder-6B
    half = True if model_name == "facebook/incoder-6B" else False
    
    infilling_model = InfillingModel(model_name=model_name, cuda=True, half=half, device=device, quantization = "nf4")

    # upload dataset
    with open(input_path, 'r', encoding="utf-8") as f:
        dataset = [json.loads(line) for line in f.readlines()]

    # support functions
    def find_all(substring, string):
        start = 0
        while True:
            start = string.find(substring, start)
            if start == -1: return
            yield start
            start += len(substring)

    def mask_code(parsed_code, mask_lines=mask_lines):
        for _ in range(mask_lines):
            positions = list(find_all(substring='\n', string=parsed_code))
            if positions == []:
                positions = list(find_all(substring=':', string=parsed_code))
            if len(positions) < 2:
                continue
            mask_start = random.choice(range(len(positions)-1))
            mask_start_position = positions[mask_start]
            mask_end_position = positions[mask_start+1]
            parsed_code = parsed_code[:mask_start_position] + '<insert>' + parsed_code[mask_end_position:]
        return parsed_code

    def norm_inserts_num(parsed_code_norm):
        max_num = 0
        for i, x in enumerate(parsed_code_norm):
            if len(list(find_all('<insert>', x))) > max_num:
                max_num = len(list(find_all('<insert>', x)))
                id = i

        new_res = []
        for x in parsed_code_norm:
            if len(list(find_all('<insert>', x))) < max_num:
                new_res.append(parsed_code_norm[id])
            else:
                new_res.append(x)
        return new_res
    ## end support function
    

    # output check
    if os.path.exists(output_path):
        with open(output_path, 'r') as f:
            finished = [json.loads(line) for line in f.readlines()]
        dataset = dataset[len(finished):]


    # MAIN CYCLE
    for idx, ins in tqdm.tqdm(enumerate(dataset), total=len(dataset)):
        code_lable_all = []
        if len(ins[code_lable]) < 2500:
            for _ in range(batch_size):
                gold_codes_masked = mask_code(ins[code_lable], mask_lines=mask_lines)
                code_lable_all.append(gold_codes_masked[:2500])

            code_lable_all = norm_inserts_num(code_lable_all)
            parts_batch = [example.split("<insert>") for example in code_lable_all]
            fill_in_middle_gold = infilling_model.batched_infill(
                parts_batch, max_to_generate=16*mask_lines, temperature=0.7
            )
            ins['fill_in_middle_gold'] = fill_in_middle_gold
        else:
            ins['fill_in_middle_gold'] = ['token exceeds 2500']

        with open(output_path, 'a') as f:
            f.write(json.dumps(ins) + '\n')
    

    return output_path


In [None]:
run_fill_in_the_middle(
    input_path = HUMAN_PATH,
    output_path="./Methods/Code_detection/results/HUMANfim.jsonl", 
    batch_size=20,
    mask_lines=1,
    model_name="facebook/incoder-6B",
    code_lable = 'cleared_code'
)

loading model


model.safetensors:   0%|          | 0.00/26.6G [00:00<?, ?B/s]

loading complete


0it [00:00, ?it/s]


'./Methods/Code_detection/results/HUMANfim.josnl'

In [None]:
run_fill_in_the_middle(
    input_path = LLM_PATH,
    output_path="./Methods/Code_detection/results/LLMfim.jsonl", 
    batch_size=2,
    mask_lines=1,
    model_name="facebook/incoder-6B",
    code_lable = 'cleared_code'
)