In [7]:
from transformers import pipeline
# Crear el pipeline de unmasked
unmasker = pipeline('fill-mask', model='distilbert-base-multilingual-cased')

#Usamos el pipeline para completar la máscara
resultados = unmasker("Hello I'm a [MASK] model.")

for resultado in resultados:
    print(f"Score: {resultado['score']:.4f}")
    print(f"Sequence: {resultado['sequence']}")
    print(f"Token String: {resultado['token_str']}")
    print('---')

Score: 0.0408
Sequence: Hello I'm a virtual model.
Token String: virtual
---
Score: 0.0200
Sequence: Hello I'm a big model.
Token String: big
---
Score: 0.0187
Sequence: Hello I'm a Hello model.
Token String: Hello
---
Score: 0.0174
Sequence: Hello I'm a model model.
Token String: model
---
Score: 0.0142
Sequence: Hello I'm a perfect model.
Token String: perfect
---


PROBAMOS EN ESPAÑOL

In [8]:
#AHORA PROBAMOS EN ESPAÑOL PARA VER LOS RESULTADOS
resultados2= unmasker("Hola, soy un [MASK] virtual")

for resultado in resultados2:
    print(f"Score: {resultado['score']:.4f}")
    print(f"Sequence: {resultado['sequence']}")
    print(f"Token String: {resultado['token_str']}")
    print('---')


Score: 0.0966
Sequence: Hola, soy un mundo virtual
Token String: mundo
---
Score: 0.0425
Sequence: Hola, soy un universo virtual
Token String: universo
---
Score: 0.0411
Sequence: Hola, soy un espacio virtual
Token String: espacio
---
Score: 0.0353
Sequence: Hola, soy un personaje virtual
Token String: personaje
---
Score: 0.0286
Sequence: Hola, soy un hogar virtual
Token String: hogar
---


AHORA HAREMOS EL PIPELINE PERO DE CLASIFICACIÓN DE TEXTOS

In [5]:
classifier = pipeline('text-classification', model='distilbert-base-multilingual-cased')
resultados = classifier("This is a great Product")

label_map = {0: "NEGATIVE", 1: "POSITIVE"}

for resultado in resultados:
    label = label_map.get(resultado['label'], 'UNKNOWN')
    print(f"Label: {label}, Score: {resultado['score']:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label: UNKNOWN, Score: 0.5028


SE PUEDE UTILIZAR DISTILLBERT PARA REALIZAR MÁS TAREAS, PERO ESTE MODELO HA SIDO ENTRENADO PARA REALIZAR LA ACTIVIDAD DE MASK. ES POR ELLO QUE NO PUEDO REALIZAR LA CLASIFICACIÓN. VOY A ENTRENARLO CON FINE-TUNING PARA QUE APRENDA A CLASIFICAR 

In [None]:
#Cargamos la data que nos brindó el profe 
import urllib.request
import tarfile

urllib.request.urlretrieve("https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", "aclImdb_v1.tar.gz")
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()

In [2]:
import os
import random
from typing import Tuple
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch


SAMPLE_SIZE = 500

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def load_data(directory: str) -> Tuple[dict, list]:
    files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.txt')]
    files = random.sample(files, SAMPLE_SIZE)
    texts = [open(file, 'r', encoding='utf-8').read() for file in files]
    encodings = tokenizer(texts, truncation=True, padding=True)
    return encodings

def split_data(encodings: dict, labels: list, val_ratio: float = 0.1) -> Tuple[dict, list, dict, list]:
    indices = list(range(len(labels)))
    random.shuffle(indices)
    val_size = int(len(labels) * val_ratio)
    val_indices = indices[:val_size]
    train_indices = indices[val_size:]

    val_encodings = {key: [encodings[key][i] for i in val_indices] for key in encodings.keys()}
    val_labels = [labels[i] for i in val_indices]

    train_encodings = {key: [encodings[key][i] for i in train_indices] for key in encodings.keys()}
    train_labels = [labels[i] for i in train_indices]

    return train_encodings, train_labels, val_encodings, val_labels

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dir_pos = 'aclImdb/train/pos/'
train_dir_neg = 'aclImdb/train/neg/'

train_encodings_pos, train_labels_pos = load_data(train_dir_pos), [1] * SAMPLE_SIZE
train_encodings_neg, train_labels_neg = load_data(train_dir_neg), [0] * SAMPLE_SIZE

train_encodings = {key: train_encodings_pos[key] + train_encodings_neg[key] for key in train_encodings_pos.keys()}
train_labels = train_labels_pos + train_labels_neg

combined = list(zip(train_encodings['input_ids'], train_labels))
random.shuffle(combined)
train_encodings['input_ids'], train_labels = zip(*combined)

train_encodings, train_labels, val_encodings, val_labels = split_data(train_encodings, train_labels)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [3]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=2)

# Texto -> [BERT] -> [Clasificación] -> [Positivo, Negativo] [0.5, 0.5] [0, 1]
# 1. Inicialización al azar de los pesos del modelo
# 2. Feed Forward
# 3. Cálculo de la pérdida
# 3. Retropropagación
# 4. Actualización 

#TODO: Entrenar el modelo
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 171/171 [36:20<00:00, 12.75s/it]

{'train_runtime': 2180.003, 'train_samples_per_second': 1.239, 'train_steps_per_second': 0.078, 'train_loss': 0.6703060217070997, 'epoch': 3.0}





TrainOutput(global_step=171, training_loss=0.6703060217070997, metrics={'train_runtime': 2180.003, 'train_samples_per_second': 1.239, 'train_steps_per_second': 0.078, 'total_flos': 357661976371200.0, 'train_loss': 0.6703060217070997, 'epoch': 3.0})

Ahora evaluaremos su performance con los datos test

In [4]:
from torch.utils.data import DataLoader
import torch

#TODO: Cargar los datos de prueba
test_dir_pos = 'aclImdb/test/pos/'
test_dir_neg = 'aclImdb/test/neg/'

test_encodings_pos = load_data(test_dir_pos)
test_encodings_neg = load_data(test_dir_neg)

test_labels_pos = [1] * len(test_encodings_pos['input_ids'])
test_labels_neg = [0] * len(test_encodings_neg['input_ids'])

test_encodings = {key: test_encodings_pos[key] + test_encodings_neg[key] for key in test_encodings_pos.keys()}
test_labels = test_labels_pos + test_labels_neg

#TODO: Crear el dataset y el data loader de prueba
test_dataset= TextDataset(test_encodings, test_labels)
test_loader= DataLoader(test_dataset, batch_size=32, shuffle=False)

#TODO: Realizar las predicciones. Guardar los logits en preds.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

#TODO: Calcular el accuracy
accuracy = (preds == test_labels).astype(float).mean()
print(f"Accuracy: {accuracy:.4f}")


100%|██████████| 16/16 [04:39<00:00, 17.46s/it]

Accuracy: 0.6940





GUARDAMOS EL TOKENIZER DENTRO DE LA CARPETA DEL MODELO PARA PODER UTILIZARLO 

In [14]:
tokenizer.save_pretrained('./results/checkpoint-171')

('./results/checkpoint-171\\tokenizer_config.json',
 './results/checkpoint-171\\special_tokens_map.json',
 './results/checkpoint-171\\vocab.txt',
 './results/checkpoint-171\\added_tokens.json',
 './results/checkpoint-171\\tokenizer.json')

AHORA PROBAMOS EL NUEVO MODELO ENTRENADO, NOS DAREMOS CUENTA QUE AHORA SI PREDICE CORRECTAMENTE EN LA MAYORÍA DE LOS CASOS, EL ÚLTIMO TEXTO QUE LE PASÉ ES DEL TEST Y LO PREDIJO CORRECTAMENTE. 

In [26]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Ruta al checkpoint guardado
checkpoint_path = "./results/checkpoint-171"

# Cargar el tokenizer y el modelo desde el checkpoint guardado
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Asegúrate de que el modelo esté en modo de evaluación
model.eval()

def predict_texts(texts, model, tokenizer):
    # Preprocesar y tokenizar el texto de entrada
    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True)

    # Realizar la inferencia
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits

    # Obtener la predicción (0 o 1) a partir de los logits
    predicted_classes = torch.argmax(logits, dim=-1).tolist()

    return predicted_classes

# Ejemplo de texto para clasificar
nuevos_textos = [
    "This model is a bit slow and wastes my time",
    "I am very happy with the results obtained",
    "This product is a waste of time and money.",
    "I loved the movie, it was really good.",
    "Customer service was very bad, would not recommend."
    "Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."
]
# Realizar la predicción

predicciones = predict_texts(nuevos_textos, model, tokenizer)

# Imprimir los resultados
for texto, prediccion in zip(nuevos_textos, predicciones):
    resultado = "positivo" if prediccion == 1 else "negativo"
    print(f"Texto: \"{texto}\" \nPredicción: El modelo predice que el texto es {resultado}.\n")


Texto: "This model is a bit slow and wastes my time" 
Predicción: El modelo predice que el texto es negativo.

Texto: "I am very happy with the results obtained" 
Predicción: El modelo predice que el texto es positivo.

Texto: "This product is a waste of time and money." 
Predicción: El modelo predice que el texto es positivo.

Texto: "I loved the movie, it was really good." 
Predicción: El modelo predice que el texto es positivo.

Texto: "Customer service was very bad, would not recommend.Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than 

AHORA HAREMOS LA GENERACIÓN DE EMBEDDINGS SIMPLE PARA FINALIZAR Y SELECCIONAMOS EL TOKEN CLS

In [31]:
from transformers import AutoTokenizer, AutoModel
#Cargamos el modelo
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModel.from_pretrained(checkpoint_path)

#Creamos el texto y tokenizamos
texto = "I want a chocolate ice cream"
inputs = tokenizer(texto, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

#Mandamos los token al modelo para que lo procese y genere embeddings
with torch.no_grad():
    outputs = model(**inputs)

#Ahora seleccionamos los embeddings del token cls que están contextualizados
embeddings = outputs.last_hidden_state[:, 0, :].numpy()

#Imprimimos los embeddings
print(embeddings)


[[-2.63572857e-03 -6.29550219e-03  9.89595950e-02 -1.69998318e-01
   9.80851293e-01 -2.77944356e-01 -4.12420779e-01  4.92719620e-01
  -1.42921969e-01  2.55764574e-01  2.30696067e-01  9.23453122e-02
  -7.97102451e-02 -2.52078295e-01 -3.27376053e-02 -3.92252713e-01
  -2.02432007e-01  7.36566365e-01  9.16739777e-02  4.39471364e-01
   4.69357252e-01 -9.30169076e-02  1.27542749e-01  1.21282354e-01
   1.46919966e-01 -3.04094374e-01  7.71197379e-02 -3.50230336e-01
   5.08903742e-01 -3.17815900e-01  6.49582371e-02  5.79408169e-01
  -1.07027754e-01  2.81879723e-01  2.12223738e-01  3.40128541e-01
  -1.24587142e+00 -2.22344235e-01 -2.95936733e-01  2.41364554e-01
  -2.58156598e-01  4.73059826e-02 -9.95852053e-03  1.61905944e-01
   2.53077805e-01  6.40345812e-01  2.28748202e-01  2.13981166e-01
   9.09286916e-01 -6.02205276e-01  4.21185732e-01 -2.38718480e-01
   7.68581808e-01 -1.00393522e+00  3.01997006e-01  4.70188707e-01
   3.19930196e-01 -5.78333378e-01  2.00729683e-01  2.27662638e-01
  -1.10483

Ahora generamos embeddings del texto no del token CLS

In [48]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModel.from_pretrained(checkpoint_path)

# Asegúrate de que el modelo esté en modo de evaluación
model.eval()

# Verificar si hay GPU disponible y usarla
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Texto de entrada para generar embeddings
texto = "I want a chocolate ice cream"

# Tokenización del texto
inputs = tokenizer(texto, return_tensors='pt')

# Mover los tensores a la GPU si está disponible
inputs = {key: val.to(device) for key, val in inputs.items()}

# Desactivar el cálculo de gradientes
with torch.no_grad():
    outputs = model(**inputs)

# Obtener los embeddings de todos los tokens de la última capa
embeddings = outputs.last_hidden_state.cpu().numpy()

# Imprimir los embeddings para cada token, omitiendo el [CLS]
for i, token_embedding in enumerate(embeddings[0][1:], start=1):
    token = tokenizer.decode(inputs['input_ids'][0][i])
    print(f"Token: {token}, Embedding: {token_embedding}")

Token: I, Embedding: [ 3.35070252e-01 -3.37862819e-01  1.47619337e-01  1.94667429e-02
  1.61093855e+00 -2.74964631e-01 -5.28084993e-01  1.47237450e-01
 -6.68300465e-02  3.31193030e-01  4.82559264e-01 -6.77156448e-02
 -7.61414647e-01 -1.18841916e-01  1.15055621e+00 -9.81874168e-01
  2.95427829e-01  9.35262442e-02  3.89490187e-01  7.15464830e-01
  1.05080152e+00  1.67461991e-01 -8.42295706e-01  6.23372570e-02
 -1.52548715e-01  4.97923613e-01  1.08353674e-01 -5.58532357e-01
  3.97067279e-01 -5.28453112e-01  4.39625293e-01  7.34310269e-01
 -5.07388532e-01  9.07560408e-01  4.87341881e-01  8.55587184e-01
  3.54480267e-01 -2.75949419e-01  5.98988980e-02 -1.29302904e-01
  2.44634122e-01  1.38007119e-01  5.75728059e-01 -1.46100521e-01
  1.26373097e-01 -3.16009164e-01  5.28604627e-01  7.94600129e-01
  5.97586572e-01 -8.79482627e-01  2.75731504e-01 -3.60415876e-03
  6.72833264e-01  3.30386519e-01  3.39327991e-01  5.04083514e-01
  3.09743404e-01 -2.88981467e-01  6.67069197e-01 -5.51546812e-01
 -2.

AHORA HACEMOS EL PROCESO PARA CONVERTIR LOS EMBEDDINGS Y LAS PALABRAS A ARCHIVOS TSV

In [53]:
from transformers import AutoTokenizer, AutoModel
import torch
from gensim.utils import simple_preprocess

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModel.from_pretrained(checkpoint_path)

# Asegúrate de que el modelo esté en modo de evaluación
model.eval()

# Verificar si hay GPU disponible y usarla
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Texto de entrada para generar embeddings
texto = "Jacqueline Hyde starts like any other normal day for telemarketing individual Jackie Hyde (co-producer Gabriella Hall) until her boss (Robert Donovan) fires her for taking personal calls at work, however it's not all bad news as the call she took was from a lawyer informing her that her Grandfather (Malcolm Bennett) has recently died & that he left her his mansion & fortune (why doesn't stuff like that ever happen to me? Sigh). Very excited Jackie heads on over there & makes herself right at home, while looking for the thermostat late one night Jackie stumbles upon a secret room where her Grandfather stashes the bright red formula that he invented that allows whoever drinks it to change their appearance. Being a bit on the porky side Jackie finally settles on the glamorous Jacqueline (Blythe Metz), however Jackie's better looking alter-ego starts to take control...<br /><br />Written, co-produced & directed by Rolfe Kanefsky I thought Jacqueline Hyde was complete total & utter crap from start to finish & it's as simple & straight forward as that. According to the opening credits Jacqueline Hyde was 'inspired' by the classic Robert Louis Stevenson novel 'The Strange Case of Dr. Jekyll and Mr. Hyde', frankly if Mr. Stevenson could see what was being done to his story here he'd turn in his grave. For a start I think Jacqueline Hyde was/is intended to be a horror film, the IMDb certainly lists it as such but there isn't any horror in it at all apart from just how bad it is. I would say that Jacqueline Hyde is more a soft-core porno than anything else & extremely tame with it, why sit down & watch this softer than soft porno crap when you can watch you proper hard-core stuff that actually delivers the goods? Why, that's the question I ask here. It's not even good porn either, besides being far too soft it's dull, boring & the not-worth-mentioning sex scenes are few & far between. The most intelligent aspect of this film is the title which would have been quite clever if not for the fact that another film used the Jacqueline Hyde (1998) title during the last century & judging by the IMDb's plot summary it sounds a hell of a lot better than this piece of rubbish. This is one of those films you have to watch yourself to see just how bad it is but just hope that you never get the opportunity.<br /><br />Director Kanefsky was obviously working on a low budget but that's not an excuse these days, shot on a digital camcorder the film looks cheap & the few instances of CGI look like they came from a Nintendo Gameboy, the final 'shocking' twist has probably the worst morph effect I've ever seen & is pretty good for a laugh as is the scene when Jackie's breasts grow via more terrible CGI. That's another thing, the film takes itself far too seriously. The subject matter sucks, is far too predictable & makes for a poor film but maybe if the dialogue had been intentionally funny with some dirty porn talk the film might have been more fun to watch, alas it isn't so it isn't. Forget about any decent horror, violence or gore as there isn't any apart from a surprisingly bloodless decapitation at the end.<br /><br />Technically Jacqueline Hyde is home made film type stuff, the photography is of the flat hand held point-&-shoot variety, the music, production design & special effects are of a suitably low standard to match the script. The acting was awful, seriously this is bad.<br /><br />Jacqueline Hyde in my opinion a load of crap, there is not one positive thing about this turgid film that I can think of. Any proper film lover will have an almost impossible time trying to find any redeeming value in this crap, definitely one to avoid."

#Limpiamos el texto con Gensim
texto_limpio = ' '.join(simple_preprocess(texto, deacc=True))

# Tokenización del texto
inputs = tokenizer(texto_limpio, return_tensors='pt', truncation=True, padding=True)

# Mover los tensores a la GPU si está disponible
inputs = {key: val.to(device) for key, val in inputs.items()}

# Desactivar el cálculo de gradientes
with torch.no_grad():
    outputs = model(**inputs)

# Obtener los embeddings de todos los tokens de la última capa
embeddings = outputs.last_hidden_state[0].cpu().numpy()

# Obtener las IDs tokenizadas
input_ids = inputs['input_ids'][0].cpu().tolist()

# Filtrar los tokens y embeddings (omitimos el token [CLS] y [PAD])
tokens = []
embeddings_list = []

for i, (token_id, token_embedding) in enumerate(zip(input_ids[1:], embeddings[1:]), start=1):
    if token_id == [tokenizer.pad_token_id, tokenizer.sep_token_id]:
        break  #  Detenerse si se encuentra el token de padding o [SEP]
    token = tokenizer.decode([token_id])
    tokens.append(token)
    embeddings_list.append(token_embedding)

# Guardar los embeddings en un archivo TSV
with open('text_embeddings.tsv', 'w', encoding='utf-8') as f_embeddings:
    for embedding in embeddings_list:
        # Convierte cada embedding en una línea de texto separado por tabulaciones
        embedding_str = '\t'.join(map(str, embedding))
        f_embeddings.write(embedding_str + '\n')

# Guardar los tokens en un archivo TSV
with open('text_labels.tsv', 'w', encoding='utf-8') as f_labels:
    for token in tokens:
        f_labels.write(token + '\n')

print("Embeddings y tokens guardados exitosamente.")

Embeddings y tokens guardados exitosamente.
