# **Instalar y cargar librerías necesarias**

In [10]:
# Instalamos la librería transformers si no está instalada
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [11]:
pip show transformers datasets

Name: transformers
Version: 4.47.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers
---
Name: datasets
Version: 3.2.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests,

In [12]:
# Importamos las librerías necesarias
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch
from google.colab import files
import json

In [24]:
# Cargar archivo desde la computadora
uploaded = files.upload()

# Identificar el nombre del archivo cargado
dataset_file = list(uploaded.keys())[0]

# Cargar el contenido del archivo JSON
with open(dataset_file, "r") as f:
    dataset = json.load(f)

print(f"Dataset cargado exitosamente desde {dataset_file}. Contiene {len(dataset)} entradas.")


Saving dataset.json to dataset (2).json
Dataset cargado exitosamente desde dataset (2).json. Contiene 70 entradas.


In [25]:
# Especificamos el modelo preentrenado que utilizaremos
model_name = "microsoft/DialoGPT-medium"

# Cargamos el modelo y el tokenizador asociados
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Modelo y tokenizador '{model_name}' cargados exitosamente.")

Modelo y tokenizador 'microsoft/DialoGPT-medium' cargados exitosamente.


In [27]:
# Configurar el token de padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [28]:
# Preparar los datos para el entrenamiento
train_data = []
for d in dataset:
    inputs = tokenizer(d["pregunta"], truncation=True, padding="max_length", max_length=32, return_tensors="pt")
    labels = tokenizer(d["respuesta"], truncation=True, padding="max_length", max_length=32, return_tensors="pt")

    train_data.append({
        "input_ids": inputs["input_ids"].squeeze(0),
        "attention_mask": inputs["attention_mask"].squeeze(0),
        "labels": labels["input_ids"].squeeze(0)
    })

In [30]:
# Crear una clase personalizada para el dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {key: val for key, val in self.data[idx].items()}

In [31]:
# Crear el dataset
train_dataset = CustomDataset(train_data)

# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs",
    save_steps=10,
    save_total_limit=2,
    report_to="none"
)

In [32]:
# Crear el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Iniciar el entrenamiento
print("Iniciando el entrenamiento...")
trainer.train()
print("Entrenamiento completado.")

Iniciando el entrenamiento...


Step,Training Loss


Entrenamiento completado.


In [33]:
# Guardamos el modelo y el tokenizador entrenados en una carpeta local
output_dir = "./mi-chatbot-tecnologico"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizador guardados en {output_dir}.")

Modelo y tokenizador guardados en ./mi-chatbot-tecnologico.
