# Análisis de sentiemientos

Comparativo de diferentes herramientas para el análisis de sentimientos en español.

In [1]:
# for transformers are required pytorch
#%conda install -c pytorch pytorch torchvision transformers  
#%pip install torch transformers textblob sentiment_analysis_spanish pysentimiento openpyxl

In [2]:
# Importar librerías
import re
import pandas as pd
from textblob import TextBlob 
from sentiment_analysis_spanish import sentiment_analysis
from pysentimiento import create_analyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

pd.options.mode.chained_assignment = None  # Evita error al copiar df. Default='warn'

## Diferentes librerías, metodos o transformers

Se encontraron diferentes librerías o herramientas, entre las que destacan los transformers. 

### Clases para cada uno (primera vez en ejecutarse descargan modelo, tokens..) 

In [3]:
class Sagorsarker:
    
    def __init__(self):
        # https://huggingface.co/sagorsarker/codeswitch-spaeng-sentiment-analysis-lince
        self.tokenizer = AutoTokenizer.from_pretrained("sagorsarker/codeswitch-spaeng-sentiment-analysis-lince")
        self.model = AutoModelForSequenceClassification.from_pretrained("sagorsarker/codeswitch-spaeng-sentiment-analysis-lince")
        self.nlp = pipeline('sentiment-analysis', model=self.model, tokenizer=self.tokenizer)
    
    def analyse(self, text):
        output = self.nlp(text)
        return output[0]['label']
    
    
class Deveni:
    
    def __init__(self):
        # https://huggingface.co/daveni/twitter-xlm-roberta-emotion-es
        self.model_path = "daveni/twitter-xlm-roberta-emotion-es"
        self.nlp = pipeline("text-classification", framework="pt", model=self.model_path, tokenizer=self.model_path)
    
    def analyse(self, text):
        output = self.nlp(text)
        return output[0]['label']
    
    
class NlpTown:
    
    def __init__(self):
        # https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment 
        self.tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.nlp = pipeline('sentiment-analysis', model=self.model, tokenizer=self.tokenizer)
    
    def analyse(self, text):
        output = self.nlp(text)
        return output[0]['label']
    
    
# Pysentimiento
class PySenti:
    
    def __init__(self):
        self.nlp = create_analyzer(task="sentiment", lang="es")

    def analyse(self, text):
        output = self.nlp.predict(text)
        return output.output
    

# Textblob
# ejemplo para inglés https://github.com/l33pif/Tutoriales/blob/main/Sentiment_analisis/Senitment_analysis.ipynb
# traduciendo a inglés los comentarios https://github.com/sloria/TextBlob/issues/209 
class Blob:
    
    def analyse(self, text):
        analysis = TextBlob(text)
        try:
            analysis_ready = analysis.translate(to='en')
            if analysis_ready.sentiment.polarity > 0: 
                label = 'positive'
            elif analysis_ready.sentiment.polarity == 0: 
                label = 'neutral'
            else: 
                label = 'negative'
            return label
        except:
            return 'error'

        
# https://github.com/sentiment-analysis-spanish/sentiment-spanish
class Sas:
    
    def __init__(self):
        self.nlp = sentiment_analysis.SentimentAnalysisSpanish()
       
    def analyse(self, text):
        output = self.nlp.sentiment(text)
        if output < 0.5: 
            label = 'negative'
        elif output == 0.5: 
            label = 'neutral'
        else: 
            label = 'positivo'
        return label

### Instanciar objetos para cada método

In [4]:
s = Sagorsarker()
d = Deveni()
n = NlpTown()
ps = PySenti()
b = Blob()
sas = Sas()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


### Probando métodos

In [5]:
text = 'me caes gordo'

print('Método \t\t etiqueta')
print('Devini:\t\t', d.analyse(text))
print('Sagorsarker:\t', s.analyse(text))
print('NlpTown:\t', n.analyse(text))
print('PySentimiento:\t', ps.analyse(text))
print('BlobText:\t', b.analyse(text))
print('s-a-s:\t\t', sas.analyse(text))

Método 		 etiqueta
Devini:		 anger
Sagorsarker:	 LABEL_1
NlpTown:	 1 star
PySentimiento:	 NEG
BlobText:	 error
s-a-s:		 negative


# Carga, preprocesado y limpieza

### Definición de archivo y hoja para dataframe

In [6]:
filename = 'virgilio'
df = pd.read_excel(filename+'.xlsx', sheet_name='Comments')

In [7]:
df

Unnamed: 0,post_id,comment_id,profile_id,created_date,created_time,from_name,message,gender,reactions
0,289341965883602,289342362550229,100048495712314,2021-03-06,16:21:00,Jesus Alejandro Miranda,¿En cuánto?,H,0
1,289341965883602,289343555883443,100006494843811,2021-03-06,16:23:31,Pride Soon,A mi noo,H,0
2,289341965883602,289343625883436,100006494843811,2021-03-06,16:23:36,Pride Soon,\n[PHOTO] https://scontent.fchc2-1.fna.fbcdn.n...,H,0
3,289341965883602,289344799216652,100024666001605,2021-03-06,16:25:57,Danny Carrey Adepez,💚💚 espero con ansias mi pedido ☺️\n[PHOTO] htt...,H,0
4,289341965883602,289345022549963,100000808626042,2021-03-06,16:26:30,Sandra Paola Rodríguez,La camiseta bien puesta !,H,0
...,...,...,...,...,...,...,...,...,...
26256,344867256997739,348472223303909,1280858542,2021-06-08,15:41:17,Wendy Salas,\n[PHOTO] https://scontent-lcy1-1.xx.fbcdn.net...,H,0
26257,344895870328211,348770776607387,1010840116,2021-06-09,02:51:07,An Es,Juras y juras y no cumplessss,M,0
26258,335739541243844,353307062820425,100029588617314,2021-06-16,13:01:11,Jose Angel Delgado,Para q sigas robando,M,0
26259,344225097061955,354945432656588,100012668928741,2021-06-19,05:16:02,Yesenia Mendoza,Cómo se llama esa orquesta?,M,0


### Limpieza

In [8]:
# Se seleccionan solo algunas columnas
df = df[['comment_id', 'message']]

In [9]:
# Método para limpieza
def clean_text(text):
    text = str(text)
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # para menciones
    text = re.sub(r'#', '', text)
    text = re.sub(r'\.', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'RT[\s]+', '', text)
    text = re.sub(r'\[PHOTO\]','', text)
    text = text.strip()
    return text

In [10]:
# Limpiar comentarios
df['message'] = df['message'].apply(clean_text)

In [11]:
# Eliminar comentarios sin contenido
df = df[df['message'] != ''].reset_index(drop=True)

In [12]:
df = df.head(100) # Aplicar solo a un subconjunto
df

Unnamed: 0,comment_id,message
0,289342362550229,¿En cuánto?
1,289343555883443,A mi noo
2,289344799216652,💚💚 espero con ansias mi pedido ☺️
3,289345022549963,La camiseta bien puesta !
4,289345815883217,Yo también el verde es lo mejor
...,...,...
95,289446622539803,💚💚💚💚💚
96,289446642539801,💚 👕 ✅
97,289447155873083,Por acá lo esperamos en barrio 4 valle de las ...
98,289447342539731,"Hacia la meta, virgilio 💯"


# Análisis sentientos

Se aplica sobre los comentarios cada uno de los métodos y se reescribe el dataframe con la nueva columna (aplicar los deseados)

In [13]:
df['sagorsarker'] = df['message'].apply(s.analyse)

In [14]:
df['pySentimiento'] = df['message'].apply(ps.analyse)

In [15]:
df['nlpTown'] = df['message'].apply(n.analyse)

In [16]:
df['deveni'] = df['message'].apply(d.analyse)

In [17]:
df['sas'] = df['message'].apply(sas.analyse)

In [18]:
# Para grandes cantidades de comentarios presentó errores
# df['textBlob'] = df['message'].apply(b.analyse)

In [19]:
df

Unnamed: 0,comment_id,message,sagorsarker,pySentimiento,nlpTown,deveni,sas
0,289342362550229,¿En cuánto?,LABEL_1,NEU,1 star,others,negative
1,289343555883443,A mi noo,LABEL_0,NEU,1 star,others,negative
2,289344799216652,💚💚 espero con ansias mi pedido ☺️,LABEL_1,NEU,5 stars,joy,negative
3,289345022549963,La camiseta bien puesta !,LABEL_1,POS,5 stars,joy,positivo
4,289345815883217,Yo también el verde es lo mejor,LABEL_1,POS,5 stars,joy,positivo
...,...,...,...,...,...,...,...
95,289446622539803,💚💚💚💚💚,LABEL_1,NEU,5 stars,joy,negative
96,289446642539801,💚 👕 ✅,LABEL_1,NEU,5 stars,others,negative
97,289447155873083,Por acá lo esperamos en barrio 4 valle de las ...,LABEL_1,NEU,3 stars,others,negative
98,289447342539731,"Hacia la meta, virgilio 💯",LABEL_1,NEU,2 stars,others,negative


## Escribir dataframe en archivo CSV

In [20]:
df.to_csv(filename+'.csv', index=False)

## Referencias

- https://huggingface.co/docs/transformers/index
- https://huggingface.co/sagorsarker/codeswitch-spaeng-sentiment-analysis-lince
- https://huggingface.co/daveni/twitter-xlm-roberta-emotion-es
- https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
- https://github.com/pysentimiento/pysentimiento
- https://github.com/l33pif/Tutoriales/blob/main/Sentiment_analisis/Senitment_analysis.ipynb
- https://github.com/sentiment-analysis-spanish/sentiment-spanish
- https://github.com/sloria/TextBlob/issues/209