In [15]:
from datasets import load_dataset, concatenate_datasets

In [16]:
from Code_preprocessing.code_cleaner import comment_remover

**CodeMirage:** is the test to evaluate:
- Language generalization
- LLMs Generalization
- pharafrazed

**AIGCodeSet:** is the test to evaluate:
- competitive code evaluation
- correct code vs wrong code evaluation

**Sun et al:** is the test to evaluate:
- competitive code evaluation
- correct code vs wrong code evaluation

# CodeMirage

In [17]:
codemirage_split = load_dataset("HanxiGuo/CodeMirage")


codemirage = concatenate_datasets([codemirage_split["train"],codemirage_split["test"]])
print(codemirage)

      
assert (len(codemirage_split["train"])+len(codemirage_split["test"])) == len (codemirage)

codemirage = codemirage.shuffle(seed = 30)

#codemirage = codemirage.select(range(1000))
#print(codemirage)

Dataset({
    features: ['code', 'language', 'source', 'variant'],
    num_rows: 209988
})


In [18]:
human = codemirage.filter(lambda x: x["source"] == "Human" )
not_human = codemirage.filter(lambda x: x["source"] != "Human" )

print(human)
print(not_human)


Dataset({
    features: ['code', 'language', 'source', 'variant'],
    num_rows: 10000
})
Dataset({
    features: ['code', 'language', 'source', 'variant'],
    num_rows: 199988
})


In [19]:
from datasets import Dataset
from collections import defaultdict
import random

def balanced_sample_multi_cols(
    ds: Dataset,
    cols=("language", "source", "variant"),
    desired_n: int = 1000,
    seed: int = 30,
):
    """
    Restituisce un sotto-dataset bilanciato su TUTTE le combinazioni presenti di `cols`,
    con lo stesso numero di esempi per ciascuna tripletta. Se 1000 non è raggiungibile
    mantenendo l’eguaglianza perfetta, restituisce il massimo bilanciato <= desired_n.

    Nota: si considerano solo le combinazioni EFFETTIVAMENTE presenti nel dataset.
    """
    rng = random.Random(seed)

    # 1) indicizza gli indici per gruppo (tripletta)
    idx_by_group = defaultdict(list)
    for i, ex in enumerate(ds):
        key = tuple(ex[c] for c in cols)
        idx_by_group[key].append(i)

    groups = list(idx_by_group.keys())
    G = len(groups)
    if G == 0:
        raise ValueError("Nessun gruppo trovato: controlla i nomi delle colonne.")

    # 2) quanti per gruppo possiamo prendere mantenendo l’uguaglianza?
    #    (a) il target teorico è floor(desired_n / G)
    #    (b) non possiamo superare il minimo disponibile tra i gruppi
    target_per_group = desired_n // G
    cap_min = min(len(idxs) for idxs in idx_by_group.values())
    take = min(target_per_group, cap_min)

    if take == 0:
        raise ValueError(
            f"Impossibile estrarre campione bilanciato: desired_n={desired_n} < numero gruppi={G} "
            f"oppure alcuni gruppi hanno 0 elementi. Aumenta desired_n o riduci i gruppi."
        )

    # 3) campiona esattamente 'take' per ogni gruppo
    chosen = []
    for g in groups:
        idxs = idx_by_group[g]
        chosen.extend(rng.sample(idxs, k=take))

    chosen.sort()
    sub = ds.select(chosen)

    return sub


In [20]:
from collections import Counter

In [21]:
sub_not_human = balanced_sample_multi_cols(not_human,
                                cols=("language","source","variant"),
                                desired_n=500,
                                seed=30)


print("Len subset:", len(sub_not_human))   # sarà take * num_groups

# Verify balance
cnt = Counter((ex["language"], ex["source"], ex["variant"]) for ex in sub_not_human)
print(cnt)

Len subset: 400
Counter({('Python', 'qwen2.5-coder', 'Normal'): 2, ('HTML', 'deepseek-v3', 'Normal'): 2, ('Ruby', 'gpt-4o-mini', 'Paraphrased'): 2, ('C', 'qwen2.5-coder', 'Normal'): 2, ('JavaScript', 'claude-3.5-haiku', 'Normal'): 2, ('JavaScript', 'gemini-2.0-flash-thinking-exp', 'Paraphrased'): 2, ('JavaScript', 'gemini-2.0-flash', 'Paraphrased'): 2, ('Ruby', 'llama3.3-70b', 'Normal'): 2, ('JavaScript', 'deepseek-r1', 'Normal'): 2, ('Python', 'deepseek-v3', 'Paraphrased'): 2, ('Python', 'o3-mini', 'Paraphrased'): 2, ('Ruby', 'deepseek-r1', 'Normal'): 2, ('CSharp', 'gemini-2.0-pro-exp', 'Paraphrased'): 2, ('JavaScript', 'gpt-4o-mini', 'Paraphrased'): 2, ('CPP', 'gemini-2.0-flash', 'Paraphrased'): 2, ('HTML', 'claude-3.5-haiku', 'Paraphrased'): 2, ('HTML', 'deepseek-r1', 'Paraphrased'): 2, ('HTML', 'gemini-2.0-flash', 'Normal'): 2, ('CPP', 'claude-3.5-haiku', 'Paraphrased'): 2, ('Ruby', 'o3-mini', 'Normal'): 2, ('C', 'gemini-2.0-flash-thinking-exp', 'Normal'): 2, ('Go', 'deepseek-v3', 

In [22]:
sub_human = balanced_sample_multi_cols(human,
                                cols=("language",),
                                desired_n=len(sub_not_human),
                                seed=30)


assert len(sub_human) == len(sub_not_human)

# Verify balance
cnt = Counter((ex["language"]) for ex in sub_human)
print(cnt)

Counter({'Java': 40, 'Ruby': 40, 'C': 40, 'Go': 40, 'HTML': 40, 'PHP': 40, 'JavaScript': 40, 'CSharp': 40, 'CPP': 40, 'Python': 40})


In [23]:
codemirage = concatenate_datasets([sub_human, sub_not_human])
codemirage = codemirage.shuffle(seed=30)

In [24]:
all_codes = [comment_remover(x, language) for x, language in zip(codemirage["code"], codemirage["language"])]
newCodeMirage = codemirage.add_column("cleared_code", all_codes)
print(newCodeMirage)

Dataset({
    features: ['code', 'language', 'source', 'variant', 'cleared_code'],
    num_rows: 800
})


In [25]:
newCodeMirage_notNone = newCodeMirage.filter(lambda x: x["code"] is not None and len(x["code"]) >= 10)
print(newCodeMirage_notNone)

Dataset({
    features: ['code', 'language', 'source', 'variant', 'cleared_code'],
    num_rows: 800
})


In [26]:
ds_with_idx = newCodeMirage_notNone.map(lambda example, idx: {"idx": idx}, with_indices=True)
ds_with_idx.to_csv("./Dataset/CodeMirage.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

4651019

In [29]:
# saving in a local path
newCodeMirage_notNone.save_to_disk("./Dataset/CodeMirage")

# To upload: #
# from datasets import load_from_disk
# ds2 = load_from_disk("/path/to/my_dataset")


Saving the dataset (0/1 shards):   0%|          | 0/800 [00:00<?, ? examples/s]

# AIGCodeSet

In [30]:
AIGCodeSet_split = load_dataset("basakdemirok/AIGCodeSet")


AIGCodeSet = concatenate_datasets([AIGCodeSet_split["train"],AIGCodeSet_split["test"]])
print(AIGCodeSet)

assert (len(AIGCodeSet_split["train"])+len(AIGCodeSet_split["test"])) == len (AIGCodeSet)

AIGCodeSet = AIGCodeSet.shuffle(seed = 30)

Dataset({
    features: ['problem_id', 'submission_id', 'status_in_folder', 'LLM', 'code', 'ada_embedding', 'label', 'lines', 'code_lines', 'comments', 'functions', 'blank_lines'],
    num_rows: 15166
})


In [31]:
AIGCodeSet_sub = balanced_sample_multi_cols(AIGCodeSet,
                                cols=("LLM","status_in_folder","label"),
                                desired_n=1000,
                                seed=30)


print("Len subset:", len(AIGCodeSet_sub))   # sarà take * num_groups

# Verify balance
cnt = Counter((ex["LLM"], ex["status_in_folder"], ex["label"]) for ex in AIGCodeSet_sub)
print(cnt)

Len subset: 996
Counter({('LLAMA', 'Generate', 1): 83, ('CODESTRAL', 'Wrong', 1): 83, ('Human', 'Accepted', 0): 83, ('Human', 'Runtime', 0): 83, ('LLAMA', 'Runtime', 1): 83, ('GEMINI', 'Wrong', 1): 83, ('GEMINI', 'Generate', 1): 83, ('LLAMA', 'Wrong', 1): 83, ('CODESTRAL', 'Generate', 1): 83, ('Human', 'Wrong', 0): 83, ('CODESTRAL', 'Runtime', 1): 83, ('GEMINI', 'Runtime', 1): 83})


In [32]:
all_codes = [comment_remover(x, "Python") for x in AIGCodeSet_sub["code"]]
newAIGCodeSet = AIGCodeSet_sub.add_column("cleared_code", all_codes)
print(newAIGCodeSet)

Dataset({
    features: ['problem_id', 'submission_id', 'status_in_folder', 'LLM', 'code', 'ada_embedding', 'label', 'lines', 'code_lines', 'comments', 'functions', 'blank_lines', 'cleared_code'],
    num_rows: 996
})


In [33]:
newAIGCodeSet_notNone = newAIGCodeSet.filter(lambda x: x["code"] is not None and len(x["code"]) >= 10)
print(newAIGCodeSet_notNone)

Dataset({
    features: ['problem_id', 'submission_id', 'status_in_folder', 'LLM', 'code', 'ada_embedding', 'label', 'lines', 'code_lines', 'comments', 'functions', 'blank_lines', 'cleared_code'],
    num_rows: 996
})


In [47]:
print(newAIGCodeSet_notNone[0]["label"])
AIGC_with_idx = newAIGCodeSet_notNone.map(lambda example, idx: {"idx": idx}, with_indices=True)
AIGC_with_idx.to_csv("./Dataset/AIGCodeSet.csv")

1


Map:   0%|          | 0/996 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

35198267

In [48]:
from datasets import DatasetDict

newAIGCodeSet_notNone_wrongcode = AIGC_with_idx.filter(lambda x: x["status_in_folder"] == "Runtime" or x["status_in_folder"] == "Wrong")
newAIGCodeSet_notNone_rightcode = AIGC_with_idx.filter(lambda x: x["status_in_folder"] == "Generate")



AIGCodeSet_Dict = DatasetDict({
    "wrongcode": newAIGCodeSet_notNone_wrongcode,
    "rightcode": newAIGCodeSet_notNone_rightcode,
})

Filter:   0%|          | 0/996 [00:00<?, ? examples/s]

Filter:   0%|          | 0/996 [00:00<?, ? examples/s]

In [50]:
AIGCodeSet_Dict.save_to_disk("./Dataset/AIGCodeSet")

Saving the dataset (0/1 shards):   0%|          | 0/664 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/249 [00:00<?, ? examples/s]

# Sun et al

In [39]:
Sun_et_al = load_dataset('json', data_files="./Dataset/SunEtAl/SunEtAl.jsonl")


Sun_et_al = Sun_et_al["train"]

Sun_et_al = Sun_et_al.flatten()
Sun_et_al = Sun_et_al.remove_columns(["metadata.index", "metadata.local index", "metadata.variant",
                                      "test_code","file_source", "test_result.test_reliability",
                                      "test_result.errors", "solution_code",
                                      'test_result.failed'])

Sun_et_al = Sun_et_al.rename_column("metadata.GPT Answer", "code")
Sun_et_al = Sun_et_al.rename_column("metadata.Source Name", "Source Name")

print(Sun_et_al)

Sun_et_al = Sun_et_al.shuffle(seed = 30)

Dataset({
    features: ['Source Name', 'code', 'instruction', 'test_result.passed'],
    num_rows: 272
})


In [40]:
all_codes = [comment_remover(x, "Python") for x in Sun_et_al["code"]]
newSun_et_al = Sun_et_al.add_column("cleared_code", all_codes)
print(newSun_et_al)

Dataset({
    features: ['Source Name', 'code', 'instruction', 'test_result.passed', 'cleared_code'],
    num_rows: 272
})


In [41]:
newSun_et_al_notNone = newSun_et_al.filter(lambda x: x["code"] is not None and len(x["code"]) >= 10)
print(newSun_et_al_notNone)

Dataset({
    features: ['Source Name', 'code', 'instruction', 'test_result.passed', 'cleared_code'],
    num_rows: 272
})


In [42]:
ds_with_idx = newSun_et_al_notNone.map(lambda example, idx: {"idx": idx}, with_indices=True)
ds_with_idx.to_csv("./Dataset/SunEtAl.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

311055

In [43]:
print()




In [44]:
from datasets import DatasetDict

newSun_et_al_notNone_wrongcode = newSun_et_al_notNone.filter(lambda x: x["test_result.passed"] > 0 )
newSun_et_al_notNone_rightcode = newSun_et_al_notNone.filter(lambda x: x["test_result.passed"] == 0 )

print(newSun_et_al_notNone_wrongcode)
print(newSun_et_al_notNone_rightcode)

newSun_et_al_notNone_Dict = DatasetDict({
    "wrongcode": newSun_et_al_notNone_wrongcode,
    "rightcode": newSun_et_al_notNone_rightcode,
})

Dataset({
    features: ['Source Name', 'code', 'instruction', 'test_result.passed', 'cleared_code'],
    num_rows: 166
})
Dataset({
    features: ['Source Name', 'code', 'instruction', 'test_result.passed', 'cleared_code'],
    num_rows: 106
})


In [46]:
newSun_et_al_notNone_Dict.save_to_disk("./Dataset/SunEtAl/SunEtAlNew")

Saving the dataset (0/1 shards):   0%|          | 0/166 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/106 [00:00<?, ? examples/s]