<a href="https://colab.research.google.com/github/Armando5347/polaridad-opinion/blob/main/polaridad_de_opinion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Imports a utilizar**

In [None]:
import pandas as pd
import stanza
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import re
import pickle

**Normalizar el texto**

In [None]:
def limpiar_texto(texto):
    texto = texto.lower()
    reemplazos = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'à': 'a', 'è': 'e', 'ì': 'i', 'ò': 'o', 'ù': 'u',
        'ä': 'a', 'ë': 'e', 'ï': 'i', 'ö': 'o', 'ü': 'u',
        'ñ': 'n'
    }
    
    for acentuada, normal in reemplazos.items():
        texto = texto.replace(acentuada, normal)
    
    return texto

config = {
    'processors': 'tokenize,mwt,pos,lemma',
    'lang': 'es'
}

nlp = stanza.Pipeline(**config)

def normalizarTexto(texto, limpia, quita_stopwords, lematiza):
    if limpia:
        texto = limpiar_texto(texto)

    try:
        doc = nlp(texto)
        cadenaNorm = ""
        for sent in doc.sentences:
            for token in sent.words:
                if quita_stopwords and lematiza:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.lemma + " "
                elif quita_stopwords:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.text + " "
                elif lematiza:
                    cadenaNorm += token.lemma + " "
                else:
                    cadenaNorm += token.text + " "
    except:
        cadenaNorm = ""

    return cadenaNorm

**Obtener el dataset**

In [12]:

data = pd.read_excel("Rest_Mex_2022.xlsx")
print (data['Opinion'])

train_split, test_split = train_test_split(
        data,
        test_size=0.2,  # 20% para prueba y 80% para entrenamiento
        random_state=0,  # Semilla para asegurar reproducibilidad
        stratify=data['Polarity']  # Mantener proporciones de clase
    )

X_train = train_split["Opinion"]
y_train = train_split["Polarity"]
X_test = test_split["Opinion"]
y_test = test_split["Polarity"]

0        Piensen dos veces antes de ir a este hotel, te...
1        Cuatro de nosotros fuimos recientemente a Eddi...
2        seguiré corta y simple: limpieza\n- bad. Tengo...
3        Al reservar un hotel con multipropiedad Mayan ...
4        No pierdan su tiempo ni dinero, venimos porque...
                               ...                        
30207    Es una construcción majestuosa, creo que de la...
30208    Muy al estilo de Romeo y Julieta es este sitio...
30209    Ideal para subir las escalinatas y divisar su ...
30210    Es imperdible, de ahí puedes ver muy bien la c...
30211    No te puedes ir de Guanajuato sin visitarlo......
Name: Opinion, Length: 30212, dtype: object


**Crear pipeline, junto con los grid_search_view para los clasificadores**

In [13]:
clasificadores = [SVC(random_state=0), MLPClassifier(max_iter=1000, random_state=0)]
param_grid_svc = {
                'classifier__C': [0.1, 1, 10],  # Hiperparámetro C para SVM
                'classifier__kernel': ['linear', 'rbf', 'poly'],  # Tipo de kernel
                'classifier__gamma': ['scale', 'auto']  # Parámetro gamma
            }

param_grid_mlp = {
                'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'classifier__activation': ['tanh', 'relu'], #funcion de activacion
                'classifier__alpha': [0.0001, 0.001, 0.01]
            }
for clasificador in clasificadores:
  pipe = Pipeline([('text_representation', TfidfVectorizer(token_pattern= r'(?u)\w+|\w+\n|\.|\¿|\?', ngram_range=(1,1))),
                   ('scaler', StandardScaler()), ('classifier',clasificador)])
  #aqui, cv hace cross validation por su cuenta, y busca ajustar los mejores hipermarametros a partir del f1-macro
  if isinstance(clasificador, SVC):
    grid_search = GridSearchCV(pipe, param_grid_svc, cv=5,scoring='accuracy')
    print("Resultados de la maquina de soporte vectorial")
  else:
    grid_search = GridSearchCV(pipe, param_grid_mlp, cv=5, scoring='accuracy')
    print("Resultados del perceptrón multicapa")
  # Entrenar el modelo con GridSearchCV
  grid_search.fit(X_train, y_train)
  print(str(grid_search.best_params_))
  y_pred = grid_search.predict(X_test)
  print(classification_report(y_test, y_pred))


Resultados de la maquina de soporte vectorial


ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 2091, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1372, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1259, in _count_vocab
    for feature in analyze(doc):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 103, in _analyze
    doc = decoder(doc)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 236, in decode
    raise ValueError(
ValueError: np.nan is an invalid document, expected byte or unicode string.
