<a href="https://colab.research.google.com/github/Armando5347/polaridad-opinion/blob/main/polaridad_de_opinion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Imports a utilizar**

In [2]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [None]:
import pandas as pd
import numpy as np
import stanza
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import re, os
import pickle
import threading
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import make_pipeline
from spellchecker import SpellChecker

**Normalizar el texto**

In [4]:
def limpiar_texto(texto):
    texto = texto.lower()
    reemplazos = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'à': 'a', 'è': 'e', 'ì': 'i', 'ò': 'o', 'ù': 'u',
        'ä': 'a', 'ë': 'e', 'ï': 'i', 'ö': 'o', 'ü': 'u',
        'ñ': 'n'
    }

    for acentuada, normal in reemplazos.items():
        texto = texto.replace(acentuada, normal)

    return texto

config = {
    'processors': 'tokenize,mwt,pos,lemma',
    'lang': 'es'
}

nlp = stanza.Pipeline(**config)

def normalizarTexto(texto, limpia, quita_stopwords, lematiza):
    if limpia:
        texto = limpiar_texto(texto)

    try:
        doc = nlp(texto)
        cadenaNorm = ""
        for sent in doc.sentences:
            for token in sent.words:
                if quita_stopwords and lematiza:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.lemma + " "
                elif quita_stopwords:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.text + " "
                elif lematiza:
                    cadenaNorm += token.lemma + " "
                else:
                    cadenaNorm += token.text + " "
    except:
        cadenaNorm = ""

    return cadenaNorm

2024-11-23 11:43:28 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-23 11:43:29 INFO: Downloaded file to C:\Users\emidh\stanza_resources\resources.json
2024-11-23 11:43:29 INFO: Loading these models for language: es (Spanish):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-23 11:43:29 INFO: Using device: cuda
2024-11-23 11:43:29 INFO: Loading: tokenize


KeyboardInterrupt: 

**Normalización adicional**

In [None]:
def corregir_texto(texto):
    
    # Inicializa el corrector para el idioma español
    spell = SpellChecker(language='es')
    
    # Dividir el texto en palabras, preservando la puntuación
    palabras = texto.split()
    texto_corregido = []
    
    for palabra in palabras:
        # Extrae signos de puntuación al inicio y al final
        inicio = ''.join(char for char in palabra if not char.isalnum())
        final = ''.join(char for char in reversed(palabra) if not char.isalnum())
        palabra_central = palabra[len(inicio):-len(final) or None]
        
        # Si la palabra está mal escrita, corrige
        if palabra_central and palabra_central not in spell:
            sugerencia = spell.correction(palabra_central)
            palabra_central = sugerencia if sugerencia else palabra_central
        
        # Reconstruye la palabra con signos de puntuación
        texto_corregido.append(f"{inicio}{palabra_central}{final}")
    
    # Une las palabras corregidas en el texto final
    return ' '.join(texto_corregido)



def corregir_repeticiones(texto):
    
    # Elimina repeticiones consecutivas de letras en una palabra (e.g., "caaaarro" -> "carro")
    def corregir_letras_repetidas(palabra):
        return re.sub(r'(.)\1{2,}', r'\1', palabra)

    # Dividir el texto en palabras
    palabras = texto.split()
    palabras_corregidas = []
    ultima_palabra = None

    for palabra in palabras:
        # Corregir letras repetidas en exceso
        palabra_corregida = corregir_letras_repetidas(palabra)

        # Eliminar palabras repetidas consecutivamente
        if palabra_corregida != ultima_palabra:
            palabras_corregidas.append(palabra_corregida)
            ultima_palabra = palabra_corregida

    # Reconstruir el texto corregido
    return ' '.join(palabras_corregidas)


def procesar_texto(texto, aplicar_repeticiones=False, aplicar_ortografia=False):
    
    if aplicar_repeticiones:
        texto = corregir_repeticiones(texto)
    if aplicar_ortografia:
        texto = corregir_texto(texto)
    return texto

**Obtener el dataset**

In [3]:
data = pd.read_csv("corpusNorm.csv", sep="\t")
print (data['Content'])

train_split, test_split = train_test_split(
        data,
        test_size=0.2,  # 20% para prueba y 80% para entrenamiento
        random_state=0,  # Semilla para asegurar reproducibilidad
        stratify=data['Polarity']  # Mantener proporciones de clase
    )

X_train = train_split["Content"]
X_train_copy = X_train.copy()
y_train = train_split["Polarity"]
y_train_copy = y_train.copy()
X_test = test_split["Content"]
X_test_copy = X_test.copy()
y_test = test_split["Polarity"]
y_test_copy = y_test.copy()

0        pésimo lugar pensar dos vez antes ir hotel , m...
1        no vayas lugar eddie cuatro ir recientemente E...
2        mala relación calidad-precio seguir corto simp...
3        minusválido ? ¡ no alojar aquí ! reservar hote...
4        ser porqueria no perder tiempo no perder tiemp...
                               ...                        
30207    verdadera joya arquitectónico ser construcción...
30208    romántico mucho estilo romeo julieta ser sitio...
30209    parecer castillo ideal subir escalinata divisa...
30210    imperdible ser imperdible , ahí poder ver much...
30211    mucho bonito vista no poder ir guanajuato visi...
Name: Content, Length: 30212, dtype: object


**Calcular polaridad**

In [4]:
def getSELFeatures(cadenas, lexicon_sel):
	#'hastiar': [('Enojo\n', '0.629'), ('Repulsi\xf3n\n', '0.596')]
	polaridad_cadenas = []
	for cadena in cadenas:
		valor_alegria = 0.0
		valor_enojo = 0.0
		valor_miedo = 0.0
		valor_repulsion = 0.0
		valor_sorpresa = 0.0
		valor_tristeza = 0.0
		cadena_palabras = re.split('\s+', cadena)
		dic = {}
		for palabra in cadena_palabras:
			if palabra in lexicon_sel:
				caracteristicas = lexicon_sel[palabra]
				for emocion, valor in caracteristicas:
					if emocion == 'Alegría':
						valor_alegria = valor_alegria + float(valor)
					elif emocion == 'Tristeza':
						valor_tristeza = valor_tristeza + float(valor)
					elif emocion == 'Enojo':
						valor_enojo = valor_enojo + float(valor)
					elif emocion == 'Repulsión':
						valor_repulsion = valor_repulsion + float(valor)
					elif emocion == 'Miedo':
						valor_miedo = valor_miedo + float(valor)
					elif emocion == 'Sorpresa':
						valor_sorpresa = valor_sorpresa + float(valor)
		dic['__alegria__'] = valor_alegria
		dic['__tristeza__'] = valor_tristeza
		dic['__enojo__'] = valor_enojo
		dic['__repulsion__'] = valor_repulsion
		dic['__miedo__'] = valor_miedo
		dic['__sorpresa__'] = valor_sorpresa

		#Esto es para los valores acumulados del mapeo a positivo (alegría + sorpresa) y negativo (enojo + miedo + repulsión + tristeza)
		dic['acumuladopositivo'] = dic['__alegria__'] + dic['__sorpresa__']
		dic['acumuladonegative'] = dic['__enojo__'] + dic['__miedo__'] + dic['__repulsion__'] + dic['__tristeza__']

		polaridad_pos = np.array([dic['acumuladopositivo']])
		polaridad_neg = np.array([dic['acumuladonegative']])
		polaridad_cadena = np.concatenate((polaridad_pos, polaridad_neg), axis=0)
		polaridad_cadenas.append(polaridad_cadena)

	return polaridad_cadenas

if (os.path.exists('lexicon_sel.pkl')):
    lexicon_sel_file = open ('lexicon_sel.pkl','rb')
    lexicon_sel = pickle.load(lexicon_sel_file)
else:
    print("No se ha encontrado el archivo lexicon_sel.pkl")
    exit()

polaridad_train = getSELFeatures(X_train, lexicon_sel)
polaridad_test = getSELFeatures(X_test, lexicon_sel)
# print(polaridad_train)


**Vectorización**

In [5]:
tfidf_vectorizer = TfidfVectorizer(token_pattern= r'(?u)\w+|\w+\n|\.|\¿|\?', ngram_range=(1,1))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_copy)
X_test_tfidf = tfidf_vectorizer.transform(X_test_copy)

frequency_vectorizer = CountVectorizer(binary=False, ngram_range=(1, 1))
X_train_freq = frequency_vectorizer.fit_transform(X_train_copy)
X_test_freq = frequency_vectorizer.transform(X_test_copy)

X_tain_pol = None
X_test_pol = None

**Añadir polarización a la vectorización**

In [14]:
X_train_pol = hstack([X_train_tfidf, polaridad_train]).toarray()
X_test_pol = hstack([X_test_tfidf, polaridad_test]).toarray()

**Prueba de balanceo de clases con cross validation con modelos no pesados**

Utilizando polaridad de texto, undersampling y oversampling

In [16]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_pol, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_pol[train_index], X_train_pol[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    X_resampled, y_resampled = over_sampling.fit_resample(X_resampled, y_resampled)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


Promedio del f1-score de 'macro avg' en todas las iteraciones: 0.41603843745733526


Utilizando undersampling y oversampling

In [7]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_tfidf, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    X_resampled, y_resampled = over_sampling.fit_resample(X_resampled, y_resampled)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


Promedio del f1-score de 'macro avg' en todas las iteraciones: 0.41664715122745777


Utilizando solo undersampling

In [12]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_tfidf, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    #X_resampled, y_resampled = over_sampling.fit_resample(X_resampled, y_resampled)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


Promedio del f1-score de 'macro avg' en todas las iteraciones: 0.41664715122745777


Utilizando solo oversampling

In [20]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_tfidf, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    #X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    X_resampled, y_resampled = over_sampling.fit_resample(X_trainn, y_trainn)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Promedio del f1-score de 'macro avg' en todas las iteraciones: 0.4759675912656955


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Usando oversampling y polaridad de texto (no funciona)

In [21]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_pol, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_pol[train_index], X_train_pol[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    #X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    X_resampled, y_resampled = over_sampling.fit_resample(X_trainn, y_trainn)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


KeyboardInterrupt: 

Usando polaridad de texto y undersampling

In [22]:
pliegues = 5
pliegues_estratificados = StratifiedKFold(n_splits=pliegues, shuffle=True, random_state=0)
results = []
modelo_prueba = LogisticRegression()

for train_index, test_index in pliegues_estratificados.split(X_train_pol, y_train_copy):


    # Dividir los datos en entrenamiento y prueba
    X_trainn, X_testt = X_train_pol[train_index], X_train_pol[test_index]
    y_trainn, y_testt = y_train_copy.iloc[train_index], y_train_copy.iloc[test_index]

    # Aplicar under-sampling y over-sampling en los datos de entrenamiento
    under_sampler = RandomUnderSampler(random_state=0)
    over_sampling = RandomOverSampler(random_state=0)

    # Paso 1: Under-sampling
    X_resampled, y_resampled = under_sampler.fit_resample(X_trainn, y_trainn)

    # Paso 2: Over-sampling
    #X_resampled, y_resampled = over_sampling.fit_resample(X_trainn, y_trainn)

    # Entrenar el modelo con los datos balanceados
    modelo_prueba.fit(X_resampled, y_resampled)

    y_pred = modelo_prueba.predict(X_testt)
    report = classification_report(y_testt, y_pred, output_dict=True)
    results.append(report['macro avg']['f1-score'])
average_macro_f1 = sum(results) / len(results)
print("Promedio del f1-score de 'macro avg' en todas las iteraciones:", average_macro_f1)
#print(results)


Promedio del f1-score de 'macro avg' en todas las iteraciones: 0.41603843745733526


**Crear pipeline, junto con los grid_search_view para los clasificadores**

In [None]:
clasificadores = [SVC(random_state=0), MLPClassifier(max_iter=1000, random_state=0)]
param_grid_svc = {
                'classifier__C': [0.1, 1, 10],  # Hiperparámetro C para SVM
                'classifier__kernel': ['linear', 'rbf', 'poly'],  # Tipo de kernel
                'classifier__gamma': ['scale', 'auto']  # Parámetro gamma
            }

param_grid_mlp = {
                'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'classifier__activation': ['tanh', 'relu'], #funcion de activacion
                'classifier__alpha': [0.0001, 0.001, 0.01]
            }
def probarClasificador(clasificador, parametros, X_train, y_train, X_test, y_test, lock):



  pipe = Pipeline([('text_representation', TfidfVectorizer(token_pattern= r'(?u)\w+|\w+\n|\.|\¿|\?', ngram_range=(1,1))), ('classifier',clasificador)])
  #aqui, cv hace cross validation por su cuenta, y busca ajustar los mejores hipermarametros a partir del f1-macro
  grid_search = GridSearchCV(pipe, parametros, cv=5,scoring='f1_macro')
  if isinstance(clasificador, SVC):
    print("Resultados de la maquina de soporte vectorial")
  else:
    print("Resultados del perceptrón multicapa")
  # Entrenar el modelo con GridSearchCV
  grid_search.fit(X_train, y_train)
  y_pred = grid_search.predict(X_test)
  with lock:
    print(str(grid_search.best_params_))
    print(classification_report(y_test, y_pred))

lock = threading.Lock()

hilo_svc = threading.Thread(name="Experimento Maquina de soporte vectorial",target=probarClasificador, args=(clasificadores[0], param_grid_svc, X_train, y_train, X_test, y_test, lock))
hilo_mlp = threading.Thread(name="Experimento Perceptron multicapa",target=probarClasificador, args=(clasificadores[1], param_grid_mlp, X_train_copy, y_train_copy, X_test_copy, y_test_copy, lock))
#ejecutar hilos
hilo_svc.start()
hilo_mlp.start()
#esperar a que terminen
hilo_svc.join()
hilo_mlp.join()

Resultados de la maquina de soporte vectorial


ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 2091, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1372, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1259, in _count_vocab
    for feature in analyze(doc):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 103, in _analyze
    doc = decoder(doc)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 236, in decode
    raise ValueError(
ValueError: np.nan is an invalid document, expected byte or unicode string.
