In [None]:
!pip install transformers



In [None]:
import pandas as pd
import re
import math
import torch
import transformers
import numpy as np
from tqdm.auto import tqdm

# Enables `progress_apply` on pandas objects (used below to normalize the
# reviews with a progress bar).
tqdm.pandas()

In [None]:
def normalize_text(text):
    """Reduce a review to lowercase ASCII letters separated by single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            missing value) is treated as empty.

    Returns:
        The lowercased text with every character other than ``a``-``z`` and
        whitespace deleted, whitespace runs collapsed to one space and
        leading/trailing whitespace removed. Returns ``""`` for non-string
        input.

    Note:
        Disallowed characters are deleted, not replaced by a space, so words
        separated only by punctuation are joined ("good.bad" -> "goodbad").
    """
    if isinstance(text, str):
        # Keep only letters and whitespace.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        # split()/join collapses repeated whitespace and trims both ends.
        return ' '.join(letters_only.split())
    return ""

# Load the reviews table (make sure the path matches the file you uploaded).
# The file is tab-separated; `votes` is read with the nullable Int64 dtype.
reviews_path = '/content/imdb_reviews.tsv'
df_reviews = pd.read_csv(reviews_path, sep='\t', dtype={'votes': 'Int64'})

print("Normalizando texto...")
# progress_apply behaves like apply but shows a tqdm progress bar.
normalized_reviews = df_reviews['review'].progress_apply(normalize_text)
df_reviews['review_norm'] = normalized_reviews

print("Separando conjuntos de train/test...")
# The `ds_part` column marks which split each row belongs to; .copy() gives
# each split its own frame instead of a view of df_reviews.
df_reviews_train = df_reviews.query('ds_part == "train"').copy()
df_reviews_test = df_reviews.query('ds_part == "test"').copy()

print(f"Train shape: {df_reviews_train.shape}")
print(f"Test shape: {df_reviews_test.shape}")

Normalizando texto...


  0%|          | 0/47331 [00:00<?, ?it/s]

Separando conjuntos de train/test...
Train shape: (23796, 18)
Test shape: (23535, 18)


In [None]:
# Tokenizer, configuration and model weights all come from the same
# pretrained checkpoint, so the name is written once.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = transformers.BertTokenizer.from_pretrained(BERT_CHECKPOINT)
config = transformers.BertConfig.from_pretrained(BERT_CHECKPOINT)
model = transformers.BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def BERT_text_to_embeddings(texts, max_length=512, batch_size=100, force_device=None, disable_progress_bar=False):
    """Embed each text as the model's output vector at the first token position.

    Uses the notebook-level globals ``tokenizer`` and ``model``. Every text is
    tokenized on its own, truncated/padded to exactly ``max_length`` tokens,
    and the texts are then pushed through ``model`` in batches without
    gradient tracking. For each text, the vector at sequence position 0 of the
    model's first output is kept (with ``add_special_tokens=True`` that
    position holds the leading special token added by the tokenizer).

    Side effects: ``model`` is moved to the selected device and switched to
    evaluation mode, and it stays that way after the call.

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).
        max_length: Token length every text is truncated or padded to.
        batch_size: Number of texts per forward pass; must be >= 1.
        force_device: Device string such as ``'cuda'`` or ``'cpu'``. When
            ``None``, CUDA is used if available, otherwise the CPU.
        disable_progress_bar: If true, hides both progress bars and the
            device message.

    Returns:
        ``numpy.ndarray`` with one row per input text, in input order. For
        empty input the result has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    ids_list = []
    attention_mask_list = []
    for text in tqdm(texts, disable=disable_progress_bar):
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        ids_list.append(encoded['input_ids'])
        attention_mask_list.append(encoded['attention_mask'])

    # np.concatenate raises on an empty list, so answer empty input explicitly.
    if not ids_list:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    if force_device is not None:
        device = torch.device(force_device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Evaluation mode only needs to be set once, not once per batch.
    model.eval()
    if not disable_progress_bar:
        print(f'Uso del dispositivo {device}.')

    embeddings = []
    n_batches = math.ceil(len(ids_list) / batch_size)
    for i in tqdm(range(n_batches), disable=disable_progress_bar):
        batch = slice(batch_size * i, batch_size * (i + 1))
        ids_batch = torch.LongTensor(ids_list[batch]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask_list[batch]).to(device)
        with torch.no_grad():
            batch_embeddings = model(input_ids=ids_batch, attention_mask=attention_mask_batch)
        # First output, sequence position 0 -> one vector per text; moved to
        # the CPU so the results do not accumulate on the accelerator.
        embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())
    return np.concatenate(embeddings)

In [None]:
# Embed the COMPLETE training and test splits on the Colab GPU ('cuda').
# Both calls use the same settings, so they are declared once.
embedding_options = {'batch_size': 100, 'force_device': 'cuda'}

print("Generando embeddings de ENTRENAMIENTO...")
train_features_9 = BERT_text_to_embeddings(df_reviews_train['review_norm'], **embedding_options)

print("Generando embeddings de PRUEBA...")
test_features_9 = BERT_text_to_embeddings(df_reviews_test['review_norm'], **embedding_options)

print("¡Embeddings generados!")

Generando embeddings de ENTRENAMIENTO...


  0%|          | 0/23796 [00:00<?, ?it/s]

Uso del dispositivo cuda.


  0%|          | 0/238 [00:00<?, ?it/s]

Generando embeddings de PRUEBA...


  0%|          | 0/23535 [00:00<?, ?it/s]

Uso del dispositivo cuda.


  0%|          | 0/236 [00:00<?, ?it/s]

¡Embeddings generados!


In [None]:
# Store both numpy arrays in a single compressed .npz archive; inside the
# archive they are saved under the keys 'train' and 'test'.
print("Guardando archivos...")
feature_arrays = {'train': train_features_9, 'test': test_features_9}
np.savez_compressed('features_9.npz', **feature_arrays)
print("¡Archivos guardados en 'features_9.npz'!")

Guardando archivos...
¡Archivos guardados en 'features_9.npz'!


# Embedding para my_reviews


In [None]:
# 1. Collect the 8 hand-written reviews.
my_review_texts = [
    'I did not simply like it, not my kind of movie.',
    'Well, I was bored and felt asleep in the middle of the movie.',
    'I was really fascinated with the movie',
    'Even the actors looked really old and disinterested, and they got paid to be in the movie. What a soulless cash grab.',
    'I didn\'t expect the reboot to be so good! Writers really cared about the source material',
    'The movie had its upsides and downsides, but I feel like overall it\'s a decent flick. I could see myself going to see it again.',
    'What a rotten attempt at a comedy. Not a single joke lands, everyone acts annoying and loud, even kids won\'t like this!',
    'Launching on Netflix was a brave move & I really appreciate being able to binge on episode after episode, of this exciting intelligent new drama.'
]
my_reviews_colab = pd.DataFrame({'review': my_review_texts})

# 2. Normalize them with the same function applied to the dataset reviews.
my_reviews_colab['review_norm'] = my_reviews_colab['review'].apply(normalize_text)

# 3. Generate the embeddings (fast: all 8 reviews fit in one batch).
my_reviews_features_9_colab = BERT_text_to_embeddings(
    my_reviews_colab['review_norm'],
    batch_size=8,
    force_device='cuda',
)

# 4. Save the embeddings to a .npy file so they can be downloaded.
np.save('my_reviews_features_9.npy', my_reviews_features_9_colab)

print("Archivo 'my_reviews_features_9.npy' guardado. ¡Descárgalo!")

  0%|          | 0/8 [00:00<?, ?it/s]

Uso del dispositivo cuda.


  0%|          | 0/1 [00:00<?, ?it/s]

Archivo 'my_reviews_features_9.npy' guardado. ¡Descárgalo!
