In [33]:
#Entrenamiento del modelo Gpt2
import json
import re
import time
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel,  Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from crear_dataset import ChatDataset

#Configuraciones principales
RANDOM_STATE = 42
LEARNING_RATE = 5e-5
torch.manual_seed(123)

torch.backends.cudnn.deterministic = True
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#Abrir json y cargar
with open("./results/astronomia_corpus.json", "r", encoding="utf-8") as f: #Asegurarse estar en el directorio correcto
    docs = json.load(f)

#Obtener solo el texto
texts = [d["text"] for d in docs]

In [34]:

#Dividr los datos 80% train y 20% val
train_texts, val_texts = train_test_split(texts, test_size=0.2, random_state=RANDOM_STATE)

print(len(train_texts), len(val_texts))

#Limpiar los datos/texto 
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  #quitar URLs
    text = re.sub(r"[^a-z0-9\s.,;:!?()\-']", " ", text)  #quitar rarezas
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_texts = [clean_text(t) for t in train_texts] #textos limpios y preparados para crear datsets de estos
eval_texts = [clean_text(t) for t in val_texts]

228 58


In [35]:
#Comporbar funcionamiento del tokenizer y definir el transformador
#Al tokenizar con tu tokenizer, se añade automáticamente el índice de cada token en el vocabulario del modelo.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  
block_size = 128
train_encodigns = tokenizer(list(train_texts), truncation = True, padding = "max_length", max_length = block_size, return_tensors = "pt")
valid_encodings = tokenizer(list(eval_texts), truncation = True, padding = "max_length", max_length = block_size, return_tensors = "pt")

print(valid_encodings['input_ids'][2])
print(valid_encodings['attention_mask'][2])





tensor([  732,   779,   281, 16359, 16161, 46320,  3781,  2446,   284,  5554,
        48668,  3953,   262,  9158,  1483,   286, 27982, 10090, 14500,   355,
        48804,   605,    13,   262,  1366,   900,   973,   329,   262,  3781,
        10874,   286,  3126,    11, 44085, 16161,  4263,   351,  2266, 30846,
         6492,   416,   262,  1017, 24611,  4875,  6766,  5526,   357, 21282,
          824,     8,   290, 10090, 14500,   416, 16161, 26626,    11,   355,
          880,   355,   262, 48321,    18,   290, 12385,   940, 18388,   947,
           13,   356,  3953,   262,  9158,  1483,   286,   262, 27982,   416,
         1262,   262,   308, 38200,  9107,  2446,    11,   543, 31408,   262,
        16161,  2939,   284,   663, 44503, 12245,  7110,   284,  4886, 16161,
         9158,  1483,   326,   318,   287,   867,  2663,  2408,   284,  4003,
          416, 10107, 13432,   286,   262,  8246, 16161,  2939,    13, 11992,
         2482,  1262, 14500, 10090, 48804,   605,   290,   264])

In [36]:
#Crear Dataset de entrenamiento y validación
train_dataset = ChatDataset(train_encodigns)
val_dataset = ChatDataset(valid_encodings)

#Crear Dataloaders de train y val datsets
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=16, shuffle=False) 


In [37]:
#Definir el modelo y el optimizador
model = GPT2LMHeadModel.from_pretrained("gpt2")
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
model.to(DEVICE)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [38]:
#Para agilizar el entrenamiento y hacerlo más visual, se entrena mediante el Trianer de tranforms y de esta manera sabemos que el modelo se entran bien
#La didferencia entre entranr got2 y no hacerlo con los datsets ,es que l ainformacion que ofrece es distinta a la entrenada y se puede invetar cosas que no son
#Definir los argumentos deñ entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(), 
    save_total_limit=2
)



#definir el trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer   # para logging
)

#Entrenar el modelo
start_time = time.time()
trainer.train()
print(f"Total Training Time: {(time.time() - start_time)/60:.2f} min")

#Evaluar el modelo
results = trainer.evaluate()
print("Evaluation results:", results)

#Perplejidad
perplexity = np.exp(results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

 33%|███▎      | 11/33 [22:27<44:55, 122.54s/it]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas
 18%|█▊        | 11/60 [00:02<00:11,  4.37it/s]

{'loss': 4.2152, 'grad_norm': 6.360230922698975, 'learning_rate': 4.5e-05, 'epoch': 0.67}


                                               
 25%|██▌       | 15/60 [00:03<00:08,  5.29it/s]

{'eval_loss': 3.2432572841644287, 'eval_runtime': 0.2685, 'eval_samples_per_second': 215.997, 'eval_steps_per_second': 14.896, 'epoch': 1.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas
 33%|███▎      | 20/60 [00:06<00:13,  2.89it/s]

{'loss': 3.4197, 'grad_norm': 4.830862522125244, 'learning_rate': 3.6666666666666666e-05, 'epoch': 1.33}


 50%|█████     | 30/60 [00:08<00:05,  5.19it/s]

{'loss': 3.2757, 'grad_norm': 8.88134765625, 'learning_rate': 2.8333333333333335e-05, 'epoch': 2.0}


                                               
 50%|█████     | 30/60 [00:08<00:05,  5.19it/s]

{'eval_loss': 3.144075870513916, 'eval_runtime': 0.2329, 'eval_samples_per_second': 248.989, 'eval_steps_per_second': 17.172, 'epoch': 2.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas
 68%|██████▊   | 41/60 [00:12<00:04,  4.17it/s]

{'loss': 3.1749, 'grad_norm': 4.851055145263672, 'learning_rate': 2e-05, 'epoch': 2.67}


                                               
 75%|███████▌  | 45/60 [00:13<00:02,  5.17it/s]

{'eval_loss': 3.110599994659424, 'eval_runtime': 0.2711, 'eval_samples_per_second': 213.957, 'eval_steps_per_second': 14.756, 'epoch': 3.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas
 85%|████████▌ | 51/60 [00:16<00:02,  3.22it/s]

{'loss': 3.1269, 'grad_norm': 4.764472007751465, 'learning_rate': 1.1666666666666668e-05, 'epoch': 3.33}


100%|██████████| 60/60 [00:18<00:00,  5.16it/s]

{'loss': 3.1207, 'grad_norm': 8.884801864624023, 'learning_rate': 3.3333333333333333e-06, 'epoch': 4.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas
                                               
100%|██████████| 60/60 [00:20<00:00,  5.16it/s]

{'eval_loss': 3.0999505519866943, 'eval_runtime': 0.245, 'eval_samples_per_second': 236.737, 'eval_steps_per_second': 16.327, 'epoch': 4.0}


100%|██████████| 60/60 [00:21<00:00,  2.75it/s]
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} #en key estan todo el texto encodeadas


{'train_runtime': 21.8054, 'train_samples_per_second': 41.824, 'train_steps_per_second': 2.752, 'train_loss': 3.3888439814249676, 'epoch': 4.0}
Total Training Time: 0.37 min


100%|██████████| 4/4 [00:00<00:00, 18.87it/s]

Evaluation results: {'eval_loss': 3.0999505519866943, 'eval_runtime': 0.229, 'eval_samples_per_second': 253.277, 'eval_steps_per_second': 17.467, 'epoch': 4.0}
Perplexity: 22.20





In [39]:
#Probar que tán bien el modelo apredió del dataset
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

prompt = "What is an exoplanet?"
output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)
print(output[0]["generated_text"])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


What is an exoplanet?

An exoplanet is a planet that is born in the sun's disk. its orbit is like that of a star. it is orbiting it, moving its gravitational pull along its axis, and then orbiting it again. the mass of an exoplanet depends on its mass and the planet's gravity, but the mass of an exoplanet is not a fundamental factor in determining its orbit. exoplanets have a mass that is a factor of five
