<a href="https://colab.research.google.com/github/Armando5347/polaridad-opinion/blob/main/polaridad_de_opinion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Imports a utilizar**

In [None]:
!pip install stanza

In [6]:
import pandas as pd
import numpy as np
import stanza
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import re, os
import pickle
import threading
from scipy.sparse import hstack

**Normalizar el texto**

In [None]:
def limpiar_texto(texto):
    texto = texto.lower()
    reemplazos = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'à': 'a', 'è': 'e', 'ì': 'i', 'ò': 'o', 'ù': 'u',
        'ä': 'a', 'ë': 'e', 'ï': 'i', 'ö': 'o', 'ü': 'u',
        'ñ': 'n'
    }
    
    for acentuada, normal in reemplazos.items():
        texto = texto.replace(acentuada, normal)
    
    return texto

config = {
    'processors': 'tokenize,mwt,pos,lemma',
    'lang': 'es'
}

nlp = stanza.Pipeline(**config)

def normalizarTexto(texto, limpia, quita_stopwords, lematiza):
    if limpia:
        texto = limpiar_texto(texto)

    try:
        doc = nlp(texto)
        cadenaNorm = ""
        for sent in doc.sentences:
            for token in sent.words:
                if quita_stopwords and lematiza:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.lemma + " "
                elif quita_stopwords:
                    if token.pos not in {'ADP', 'CCONJ', 'DET', 'SCONJ', 'PRON'}:
                        cadenaNorm += token.text + " "
                elif lematiza:
                    cadenaNorm += token.lemma + " "
                else:
                    cadenaNorm += token.text + " "
    except:
        cadenaNorm = ""

    return cadenaNorm

**Obtener el dataset**

In [4]:

data = pd.read_csv("corpusNorm.csv", sep="\t")
print (data['Content'])

train_split, test_split = train_test_split(
        data,
        test_size=0.2,  # 20% para prueba y 80% para entrenamiento
        random_state=0,  # Semilla para asegurar reproducibilidad
        stratify=data['Polarity']  # Mantener proporciones de clase
    )

X_train = train_split["Content"]
X_train_copy = X_train.copy()
y_train = train_split["Polarity"]
y_train_copy = y_train.copy()
X_test = test_split["Content"]
X_test_copy = X_test.copy()
y_test = test_split["Polarity"]
y_test_copy = y_test.copy()

0        pésimo lugar pensar dos vez antes ir hotel , m...
1        no vayas lugar eddie cuatro ir recientemente E...
2        mala relación calidad-precio seguir corto simp...
3        minusválido ? ¡ no alojar aquí ! reservar hote...
4        ser porqueria no perder tiempo no perder tiemp...
                               ...                        
30207    verdadera joya arquitectónico ser construcción...
30208    romántico mucho estilo romeo julieta ser sitio...
30209    parecer castillo ideal subir escalinata divisa...
30210    imperdible ser imperdible , ahí poder ver much...
30211    mucho bonito vista no poder ir guanajuato visi...
Name: Content, Length: 30212, dtype: object


**Calcular polaridad**

In [5]:
def getSELFeatures(cadenas, lexicon_sel):
	#'hastiar': [('Enojo\n', '0.629'), ('Repulsi\xf3n\n', '0.596')]
	polaridad_cadenas = []
	for cadena in cadenas:
		valor_alegria = 0.0
		valor_enojo = 0.0
		valor_miedo = 0.0
		valor_repulsion = 0.0
		valor_sorpresa = 0.0
		valor_tristeza = 0.0
		cadena_palabras = re.split('\s+', cadena)
		dic = {}
		for palabra in cadena_palabras:
			if palabra in lexicon_sel:
				caracteristicas = lexicon_sel[palabra]
				for emocion, valor in caracteristicas:
					if emocion == 'Alegría':
						valor_alegria = valor_alegria + float(valor)
					elif emocion == 'Tristeza':
						valor_tristeza = valor_tristeza + float(valor)
					elif emocion == 'Enojo':
						valor_enojo = valor_enojo + float(valor)
					elif emocion == 'Repulsión':
						valor_repulsion = valor_repulsion + float(valor)
					elif emocion == 'Miedo':
						valor_miedo = valor_miedo + float(valor)
					elif emocion == 'Sorpresa':
						valor_sorpresa = valor_sorpresa + float(valor)
		dic['__alegria__'] = valor_alegria
		dic['__tristeza__'] = valor_tristeza
		dic['__enojo__'] = valor_enojo
		dic['__repulsion__'] = valor_repulsion
		dic['__miedo__'] = valor_miedo
		dic['__sorpresa__'] = valor_sorpresa
		
		#Esto es para los valores acumulados del mapeo a positivo (alegría + sorpresa) y negativo (enojo + miedo + repulsión + tristeza)
		dic['acumuladopositivo'] = dic['__alegria__'] + dic['__sorpresa__']
		dic['acumuladonegative'] = dic['__enojo__'] + dic['__miedo__'] + dic['__repulsion__'] + dic['__tristeza__']
		
		polaridad_pos = np.array([dic['acumuladopositivo']])
		polaridad_neg = np.array([dic['acumuladonegative']])
		polaridad_cadena = np.concatenate((polaridad_pos, polaridad_neg), axis=0)
		polaridad_cadenas.append(polaridad_cadena)
	
	return polaridad_cadenas

if (os.path.exists('lexicon_sel.pkl')):
    lexicon_sel_file = open ('lexicon_sel.pkl','rb')
    lexicon_sel = pickle.load(lexicon_sel_file)
else:
    print("No se ha encontrado el archivo lexicon_sel.pkl")
    exit()

polaridad_train = getSELFeatures(X_train, lexicon_sel)
polaridad_test = getSELFeatures(X_test, lexicon_sel)
print(polaridad_train)


[array([0., 0.]), array([0.   , 0.264]), array([0., 0.]), array([0., 0.]), array([4.565, 4.172]), array([0.966, 0.   ]), array([0., 0.]), array([0.165, 0.   ]), array([0.132, 0.264]), array([0.966, 0.   ]), array([0.   , 0.629]), array([0., 0.]), array([0., 0.]), array([0.298, 1.025]), array([2.859, 0.099]), array([3.696, 0.959]), array([0.966, 0.   ]), array([0., 0.]), array([0.   , 0.198]), array([0., 0.]), array([1.296, 0.099]), array([0., 0.]), array([0.   , 0.462]), array([0., 0.]), array([0.165, 0.   ]), array([0., 0.]), array([1.932, 0.   ]), array([2.451, 0.231]), array([0., 0.]), array([1.865, 0.   ]), array([0.   , 0.898]), array([0., 0.]), array([0.462, 0.762]), array([1.563, 0.   ]), array([0.   , 1.195]), array([0.198, 0.   ]), array([5.611, 4.839]), array([0.165, 0.   ]), array([2.095, 1.527]), array([0., 0.]), array([0.   , 0.198]), array([0., 0.]), array([0., 0.]), array([0.33, 0.  ]), array([0.   , 1.124]), array([0.698, 0.   ]), array([1.097, 0.   ]), array([0.   , 1.

**Vectorización**

In [7]:
tfidf_vectorizer = TfidfVectorizer(token_pattern= r'(?u)\w+|\w+\n|\.|\¿|\?', ngram_range=(1,1))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_copy)
X_test_tfidf = tfidf_vectorizer.transform(X_test_copy)

frequency_vectorizer = CountVectorizer(binary=False, ngram_range=(1, 1))
X_train_freq = frequency_vectorizer.fit_transform(X_train_copy)
X_test_freq = frequency_vectorizer.transform(X_test_copy)

X_tain_pol = None
X_test_pol = None

**Añadir polarización a la vectorización**

In [9]:
X_train_pol = hstack([X_train_tfidf, polaridad_train]).toarray()
X_test_pol = hstack([X_test_tfidf, polaridad_test]).toarray()

**Crear pipeline, junto con los grid_search_view para los clasificadores**

In [None]:
clasificadores = [SVC(random_state=0), MLPClassifier(max_iter=1000, random_state=0)]
param_grid_svc = {
                'classifier__C': [0.1, 1, 10],  # Hiperparámetro C para SVM
                'classifier__kernel': ['linear', 'rbf', 'poly'],  # Tipo de kernel
                'classifier__gamma': ['scale', 'auto']  # Parámetro gamma
            }

param_grid_mlp = {
                'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'classifier__activation': ['tanh', 'relu'], #funcion de activacion
                'classifier__alpha': [0.0001, 0.001, 0.01]
            }
def probarClasificador(clasificador, parametros, X_train, y_train, X_test, y_test, lock):
  pipe = Pipeline([('text_representation', TfidfVectorizer(token_pattern= r'(?u)\w+|\w+\n|\.|\¿|\?', ngram_range=(1,1))), ('classifier',clasificador)])
  #aqui, cv hace cross validation por su cuenta, y busca ajustar los mejores hipermarametros a partir del f1-macro
  grid_search = GridSearchCV(pipe, parametros, cv=5,scoring='f1_macro')
  if isinstance(clasificador, SVC):
    print("Resultados de la maquina de soporte vectorial")
  else:
    print("Resultados del perceptrón multicapa")
  # Entrenar el modelo con GridSearchCV
  grid_search.fit(X_train, y_train)
  y_pred = grid_search.predict(X_test)
  with lock:
    print(str(grid_search.best_params_))
    print(classification_report(y_test, y_pred))

lock = threading.Lock()

hilo_svc = threading.Thread(name="Experimento Maquina de soporte vectorial",target=probarClasificador, args=(clasificadores[0], param_grid_svc, X_train, y_train, X_test, y_test, lock))
hilo_mlp = threading.Thread(name="Experimento Perceptron multicapa",target=probarClasificador, args=(clasificadores[1], param_grid_mlp, X_train_copy, y_train_copy, X_test_copy, y_test_copy, lock))
#ejecutar hilos
hilo_svc.start()
hilo_mlp.start()
#esperar a que terminen
hilo_svc.join()
hilo_mlp.join()

Resultados de la maquina de soporte vectorial


ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 2091, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1372, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 1259, in _count_vocab
    for feature in analyze(doc):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 103, in _analyze
    doc = decoder(doc)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py", line 236, in decode
    raise ValueError(
ValueError: np.nan is an invalid document, expected byte or unicode string.
