# Utilizando el LLM como clasificador de spam

Después de ajustar y evaluar el modelo en las secciones anteriores, entramos en la etapa final del capítulo: el uso del modelo para clasificar mensajes de spam.

![Texto alternativo](./imgs/6.15.png)

Finalmente, se utilizará el modelo de clasificación de spam basado en GPT ya optimizado. La siguiente función `classify_review` sigue pasos de preprocesamiento de datos similares a los que se utilizaron en el `SpamDataset` implementado anteriormente en esta sección. Después de convertir el texto en identificadores de token, la función utiliza el modelo para predecir una etiqueta de clase entera, de forma similar a lo implementado en la sección 06.6, y devuelve el nombre de la clase correspondiente:

In [1]:
import sys
import os

# Resolve the directory one level above the current working directory;
# it is treated as the project root.
ruta_proyecto_principal = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

# Register that directory on the module search path (only once) so that
# modules living there can be imported from this notebook.
if ruta_proyecto_principal not in sys.path:
    sys.path.append(ruta_proyecto_principal)
    

In [2]:
from calcLossClasification import calc_accuracy_loader
from gptModelClasification import return_clasification_model

import tiktoken
import torch

# GPT-2 byte-pair encoding, shared by all three dataset splits below.
tokenizer = tiktoken.get_encoding("gpt2")

from spamDataset import SpamDataset
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 10

# The training split is built with max_length=None; the validation and test
# splits then reuse the max_length attribute exposed by the training dataset.
# (presumably SpamDataset derives it from its longest sequence - TODO confirm)
train_dataset = SpamDataset(
    csv_file="CSV/train.csv",
    tokenizer=tokenizer,
    max_length=None,
)
val_dataset = SpamDataset(
    csv_file="CSV/validation.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)
test_dataset = SpamDataset(
    csv_file="CSV/test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)

# Seed torch's global RNG before creating the shuffling training loader.
torch.manual_seed(123)

# Only the training loader shuffles and drops the last incomplete batch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [3]:
# Build the classification model and load the trained spam / not-spam weights.
model2 = return_clasification_model()
# map_location="cpu" lets a checkpoint that was saved from a GPU be restored
# on a CPU-only machine; the model is moved to the target device later.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch
# version supports it).
model2.load_state_dict(torch.load("model.pth", map_location="cpu"))
model2

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [4]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model2.to(device)

# Evaluate the loaded model on each split, in train / validation / test order.
train_accuracy, val_accuracy, test_accuracy = (
    calc_accuracy_loader(loader, model2, device)
    for loader in (train_loader, val_loader, test_loader)
)

# Report each accuracy as a percentage with two decimals.
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 90.00%
Validation accuracy: 80.00%
Test accuracy: 70.00%


In [5]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify a single text as ``"spam"`` or ``"not spam"``.

    The text is tokenized, truncated and right-padded to a fixed length,
    and passed through ``model``; the logits at the last sequence position
    decide the label.

    Args:
        text: Raw message to classify.
        model: Callable model exposing ``pos_emb.weight`` (one row per
            supported position) and returning logits indexable as
            ``[batch, position, class]``.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        device: Device on which the input tensor is created.
        max_length: Length the token sequence is truncated/padded to. It is
            capped at the model's supported context length. ``None`` means
            "no padding": the sequence is only truncated to the context
            length.
        pad_token_id: Token id used for right-padding.

    Returns:
        ``"spam"`` if the predicted class index is 1, else ``"not spam"``.
    """
    model.eval()  # inference mode (the model contains Dropout layers)

    # Prepare the model input.
    input_ids = tokenizer.encode(text)

    # The positional embedding matrix has one row per position, so the
    # supported context length is dimension 0 (dimension 1 is the embedding
    # width and would truncate inputs far too early).
    supported_context_length = model.pos_emb.weight.shape[0]

    if max_length is None:
        target_length = min(len(input_ids), supported_context_length)
    else:
        # Never pad beyond the positions the model actually supports.
        target_length = min(max_length, supported_context_length)

    # Truncate sequences that are too long, then right-pad the short ones.
    input_ids = input_ids[:target_length]
    input_ids += [pad_token_id] * (target_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)  # add batch dimension

    with torch.no_grad():  # no gradient tracking needed for inference
        logits = model(input_tensor)[:, -1, :]  # logits of the last token
    predicted_label = torch.argmax(logits, dim=-1).item()
    return "spam" if predicted_label == 1 else "not spam"

In [8]:

# Two hand-written sample messages to classify with the loaded model.
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)
text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

# Classify each sample in order, using the training set's max_length as the
# padding/truncation length, and print the predicted label.
for sample in (text_1, text_2):
    print(classify_review(
        sample, model2, tokenizer, device, max_length=train_dataset.max_length
    ))

spam
not spam


In [7]:
# Ten hand-written sample messages to run through the classifier.
texts = [
    "Congratulations! You’ve won a $1000 Walmart gift card. Click here to claim your prize.",
    "Get instant approval for a personal loan up to $10,000. Apply now and get cash today.",
    "Limited time offer! Buy 1 get 2 free on all supplements. Visit our website to order now.",
    "Your account has been suspended. Verify your identity immediately at secure-update-login.com.",
    "Earn money from home! Work just 2 hours a day and make $500 daily. Sign up here.",
    "Hey, are you still coming to the gym later? I’ll be there around 7.",
    "Don’t forget to send me the slides before the meeting tomorrow.",
    "Can you pick up some milk on your way home?",
    "Happy birthday! Hope you have a great day!",
    "Just checking if you received the report I sent yesterday."
]

# Classify every message and print it next to its 1-based position.
for i, text in enumerate(texts, start=1):
    label = classify_review(
        text, model2, tokenizer, device, max_length=train_dataset.max_length
    )
    print(f"text_{i}:", label)

text_1: spam
text_2: spam
text_3: spam
text_4: spam
text_5: spam
text_6: not spam
text_7: not spam
text_8: not spam
text_9: not spam
text_10: not spam
