In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from tqdm.auto import tqdm
from google.colab import drive
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

In [None]:
# Load the stratified sample from disk.
path = '../data/stratified_sample.csv'
data = pd.read_csv(path)

# Raw class ids present in the sample (0, 1, 7, 10) are collapsed onto 0..3;
# the inverse table maps the compact ids back to the raw ones.
label_mapping = {0: 0, 1: 1, 7: 2, 10: 3}
inverse_label_mapping = {compact: raw for raw, compact in label_mapping.items()}

data['label'] = data['label'].map(label_mapping)

# Binary target: 1 when the remapped label is greater than 0, otherwise 0.
# NOTE(review): a raw label missing from `label_mapping` becomes NaN above and
# therefore gets binary_label 0 here - confirm the CSV only holds 0/1/7/10.
data['binary_label'] = data['label'].apply(lambda value: int(value > 0))

#data = data[data['label']!=0]

Mounted at /content/drive


In [None]:
# Binary task: keep every row and drop the multi-class `label` column so that
# `binary_label` is the only target column left in the frame.
# `DataFrame.drop` already returns a new frame, so the previous `data.copy()`
# (whose result was immediately overwritten) is not needed.
data_binary = data.drop(columns=['label'])

# Source-attribution task: keep only the rows whose remapped label is not 0.
# `.copy()` gives an independent frame so the column assignment below does not
# write into a view of `data`.
data_multi = data[data['label'] != 0].copy()

# Shift the remaining labels 1..3 down to consecutive ids 0..2, matching the
# 3-label classification head created for this task.
multi_map_consecutive = {
    1: 0,  # DeepSeek
    2: 1,  # Meta-Llama
    3: 2,  # OpenAI
}

data_multi['labels'] = data_multi['label'].map(multi_map_consecutive)
data_multi = data_multi.drop(columns=['label'])

In [None]:
# Tokenization / batching settings shared by both classification tasks.
max_length = 512
batch_size = 16

# Checkpoint that provides the tokenizer and both model backbones.
model_name = 'huggingface/CodeBERTa-small-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two independently loaded classifiers built on the same checkpoint:
# a 2-label head for the binary task and a 3-label head for the multi-source task.
model_binary = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model_multi_source = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

def tokenize(batch):
    """Tokenize the ``"code"`` field of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping with a ``"code"`` entry holding the texts to encode
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the tokenizer returns for those texts, truncated to the
        notebook-level ``max_length``.
    """
    encoded = tokenizer(
        batch["code"],
        max_length=max_length,
        truncation=True,
        # No padding here: batches are padded dynamically by the data collator.
        padding=False,
        # Keep plain Python lists; avoids tensor-conversion errors on ragged lengths.
        return_tensors=None,
    )
    return encoded



In [None]:

# Build Hugging Face datasets from the two pandas frames
hf_dataset_binary = Dataset.from_pandas(data_binary)
hf_dataset_multi = Dataset.from_pandas(data_multi)

# Tokenize in batches
tokenized_binary = hf_dataset_binary.map(tokenize, batched=True)
tokenized_multi = hf_dataset_multi.map(tokenize, batched=True)


# Rename the binary target column to 'labels'
# (the multi-source frame already carries a 'labels' column).
tokenized_binary = tokenized_binary.rename_column('binary_label', 'labels')

# Keep only the columns the model consumes
cols_to_keep = ['input_ids', 'attention_mask', 'labels']

cols_to_remove_bin = [c for c in tokenized_binary.column_names if c not in cols_to_keep]
tokenized_binary = tokenized_binary.remove_columns(cols_to_remove_bin)

# Fix: this list must be computed from tokenized_multi's own columns. It was
# previously derived from tokenized_binary *after* that dataset had been pruned,
# which always produced an empty list and left every raw column in tokenized_multi.
cols_to_remove_multi = [c for c in tokenized_multi.column_names if c not in cols_to_keep]
tokenized_multi = tokenized_multi.remove_columns(cols_to_remove_multi)

# Return torch tensors when the datasets are indexed
tokenized_binary.set_format("torch")
tokenized_multi.set_format("torch")

Map:   0%|          | 0/29698 [00:00<?, ? examples/s]

Map:   0%|          | 0/14849 [00:00<?, ? examples/s]

In [None]:
# Collator that pads the tokenized examples of a batch at collate time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Both loaders use identical shuffling, batch size and collate function;
# only the underlying dataset differs.
_loader_options = dict(
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

dataloader_binary = DataLoader(tokenized_binary, **_loader_options)
dataloader_multi = DataLoader(tokenized_multi, **_loader_options)


In [None]:
# Hold out 10% of each dataset for evaluation. The split is seeded so the same
# rows land in the held-out set on every run; an unseeded split changes each time,
# which makes later evaluation of the saved models non-reproducible.
split_seed = 42
binary_splits = tokenized_binary.train_test_split(test_size=0.1, seed=split_seed)
multi_splits = tokenized_multi.train_test_split(test_size=0.1, seed=split_seed)


def _build_training_args(output_dir):
    """Return the TrainingArguments shared by both tasks.

    Args:
        output_dir: Directory where the Trainer writes its checkpoints.

    Returns:
        TrainingArguments for 3 epochs at learning rate 2e-5, using the
        notebook-level ``batch_size`` and with experiment reporting disabled.
    """
    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        report_to="none",
    )


training_args_binary = _build_training_args("./resultados_binario")
training_args_multi = _build_training_args("./resultados_multi")

# NOTE(review): recent transformers releases deprecate Trainer's `tokenizer=`
# argument in favour of `processing_class=`; it is kept here for compatibility
# with older installs - switch once the minimum supported version is settled.
trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=binary_splits["train"],
    eval_dataset=binary_splits["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer_multi = Trainer(
    model=model_multi_source,
    args=training_args_multi,
    train_dataset=multi_splits["train"],
    eval_dataset=multi_splits["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Fine-tune the binary classifier first, then the multi-source classifier.
trainer_binary.train()
trainer_multi.train()

# These folders were reorganized afterwards, so when reproducing this notebook
# change the paths below to match the layout of the GitHub repository.
trainer_binary.save_model("./modelo_binario_final")
trainer_multi.save_model("./modelo_multi_final")

  trainer_binary = Trainer(
  trainer_multi = Trainer(


Step,Training Loss
500,0.2179
1000,0.1414
1500,0.1248
2000,0.1036
2500,0.0827
3000,0.082
3500,0.0723
4000,0.043
4500,0.047
5000,0.0441


Step,Training Loss
500,0.6188
1000,0.4366
1500,0.3564
2000,0.2693
2500,0.2485


In [None]:
# Compress the saved binary model folder into a ZIP archive
!zip -r modelo_binario_final.zip modelo_binario_final/

# Compress the saved multi-source model folder into a ZIP archive
!zip -r modelo_multi_final.zip modelo_multi_final/

  adding: modelo_binario_final/ (stored 0%)
  adding: modelo_binario_final/model.safetensors (deflated 7%)
  adding: modelo_binario_final/training_args.bin (deflated 54%)
  adding: modelo_binario_final/tokenizer_config.json (deflated 76%)
  adding: modelo_binario_final/special_tokens_map.json (deflated 84%)
  adding: modelo_binario_final/tokenizer.json (deflated 81%)
  adding: modelo_binario_final/config.json (deflated 50%)
  adding: modelo_binario_final/merges.txt (deflated 52%)
  adding: modelo_binario_final/vocab.json (deflated 57%)
  adding: modelo_multi_final/ (stored 0%)
  adding: modelo_multi_final/model.safetensors (deflated 7%)
  adding: modelo_multi_final/training_args.bin (deflated 53%)
  adding: modelo_multi_final/tokenizer_config.json (deflated 76%)
  adding: modelo_multi_final/special_tokens_map.json (deflated 84%)
  adding: modelo_multi_final/tokenizer.json (deflated 81%)
  adding: modelo_multi_final/config.json (deflated 52%)
  adding: modelo_multi_final/merges.txt (def