In [1]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import torch 
from typing import List, Tuple
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data(file_path: str) -> pd.DataFrame:
    """Carga el archivo CSV y retorna un DataFrame."""
    return pd.read_csv(file_path)

In [3]:
def combine_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Combina las columnas 'title' y 'description_es' en una sola columna 'text'."""
    df['text'] = df['title']
    return df

In [4]:
def tokenize_function(text: str, tokenizer: DistilBertTokenizer) -> dict:
    """Tokeniza el texto utilizando el tokenizer especificado."""
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

In [5]:
def tokenize_dataset(df: pd.DataFrame, tokenizer: DistilBertTokenizer) -> pd.DataFrame:
    """Tokeniza todo el dataset y retorna un DataFrame con los tokens."""
    tokens = df['text'].apply(lambda x: tokenize_function(x, tokenizer))
    tokens_df = tokens.apply(lambda x: x['input_ids'].squeeze().tolist()).apply(pd.Series)
    return tokens_df

In [6]:
def save_tokenized_data(df: pd.DataFrame, file_path: str) -> None:
    """Guarda el DataFrame tokenizado en un archivo CSV."""
    df.to_csv(file_path, index=False)

In [7]:
def generate_embeddings(text: str, tokenizer: DistilBertTokenizer, model: DistilBertModel) -> torch.Tensor:
    """Genera embeddings a partir del texto utilizando el modelo y el tokenizer especificados."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Obtener el embedding promedio
    return embeddings

In [8]:
def main():
    input_file_path = 'netflix_titles_es.csv'
    tokenized_output_file_path = 'netflix_titles_tokens.csv'
    embeddings_output_file_path = 'netflix_titles_embeddings.csv'
    
    df = load_data(input_file_path)
    df = combine_text_columns(df)
    
    tokenizer = DistilBertTokenizer.from_pretrained('mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es')
    model = DistilBertModel.from_pretrained('mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es')
    
    df_tokens = tokenize_dataset(df, tokenizer)
    save_tokenized_data(df_tokens, tokenized_output_file_path)
    print(f"Datos tokenizados guardados en '{tokenized_output_file_path}'")
    
    # Generar embeddings para los textos combinados
    df['embeddings'] = df['text'].apply(lambda x: generate_embeddings(x, tokenizer, model))
    
    # Convertir los embeddings a DataFrame para guardarlos
    embeddings_df = pd.DataFrame(df['embeddings'].tolist())
    embeddings_df.to_csv(embeddings_output_file_path, index=False)
    print(f"Embeddings generados y guardados en '{embeddings_output_file_path}'")

if __name__ == "__main__":
    main()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return 

Datos tokenizados guardados en 'netflix_titles_tokens.csv'


RuntimeError: Numpy is not available