In [1]:
import pandas as pd 
import functools
import numpy as np
import warnings
import operator
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Depechemood++ 
Este cuaderno contiene las funciones necesarias para obtener el valor moral y emocional de los textos utilizando el enfoque basado en el léxico.

# Functions

In [27]:
def extract_emo_representation(words, emo_vocab=None, emotion_lex=None, n_emotions=None):
    """
    Extrae una representación de emociones a partir de una lista de palabras y devuelve también
    las palabras que contribuyen a esta representación.
    
    Args:
    words (list of str): Lista de palabras del texto.
    emo_vocab (set): Conjunto de palabras en el léxico emocional.
    emotion_lex (dict): Diccionario que mapea palabras a sus vectores de emoción.
    n_emotions (int): Número de emociones en los vectores de emoción.
    
    Returns:
    dict: Diccionario con 'emotion_vector' (representación promedio de emociones)
          y 'matched_words' (palabras usadas en el cálculo).
    """
    intersection = emo_vocab & set(words)
    matched_words = list(intersection) 
    v = np.zeros((len(intersection), n_emotions))
    
    for i, word in enumerate(intersection):
        v[i, :] = emotion_lex[word]
    
    # Calculamos el vector de emociones (usamos la media)
    emotion_vector = np.mean(v, axis=0) if len(intersection) > 0 else np.zeros(n_emotions)
    
    # Retornamos tanto el vector de emociones como las palabras coincidentes
    return {
        'emotion_vector': emotion_vector,
        'matched_words': matched_words
    }

    
def dictionary_emotion(text):
    """
    Convierte la lista puntuaciones de emociones en diccionario con emoción como clave
    Args:
    text (list): Lista de puntuaciones de emociones.
    
    Returns:
    dict: Diccionario con las emociones y sus respectivas emociones.
    """
    test_keys = ["fear", "amusement", "anger","annoyance","indifference","happiness","inspiration","sadness"]
    dictionary = dict(map(lambda i,j : (i,j) , test_keys,text))
    return dictionary


def top_n_emotions_names(emotion_scores, n=3):
    """
    Obtiene las n emociones con los puntajes más altos (solo nombres).
    
    Args:
    emotion_scores (dict): Diccionario que mapea etiquetas de emociones a sus respectivos puntajes.
    n (int): Número de emociones con los puntajes más altos a devolver.
    
    Returns:
    list: Lista de las n emociones con mayores puntajes.
    """
    sorted_emotions = sorted(emotion_scores.items(), key=lambda item: item[1], reverse=True)
    return [emotion for emotion, _ in sorted_emotions[:n]]


def get_max_emotion_name(emotion_scores):
    """
    Obtiene el nombre de la emoción con el puntaje más alto.
    
    Args:
    emotion_scores (dict): Diccionario que mapea etiquetas de emociones a sus respectivos puntajes.
    
    Returns:
    str: El nombre de la emoción con el puntaje más alto.
    """
    max_emotion = max(emotion_scores, key=emotion_scores.get)
    return max_emotion

# DepecheMood++ Lexicon

In [21]:
# Read the DepecheMood lexicon
#lexicon=pd.read_csv('DATASETS/DepecheMood_english_lemma_full.tsv',sep='\t',index_col=[0])
#lexicon.to_csv('DATASETS/DepecheMood_english_lemma_full.csv')

#Filter lexicon to include only rows with 'freq' >= , 134278 values were discarded (23%), 41314 lemmas
lexicon=pd.read_csv('DepecheMood_english_lemma_full.csv',index_col=[0])
lexicon=lexicon[lexicon['freq'] >= 10] 

#Convert the lexicon to a dictionary
lexicon=lexicon.drop('freq',axis=1)
lexicon=lexicon.reset_index()
lexicon_dict = lexicon.set_index('index').T.to_dict('list')
#lexicon_dict
#lexicon.loc[200:250,:]

  lexicon_dict = lexicon.set_index('index').T.to_dict('list')


Unnamed: 0,index,AFRAID,AMUSED,ANGRY,ANNOYED,DONT_CARE,HAPPY,INSPIRED,SAD
200,abstinence,0.010791,0.215323,0.098104,0.282082,0.090112,0.066984,0.203085,0.033519
201,abstract,0.044336,0.201894,0.056911,0.09068,0.075127,0.146059,0.328487,0.056506
202,abstraction,0.099769,0.229726,0.056256,0.043741,0.096377,0.093452,0.380002,0.000676
203,absurd,0.049905,0.149006,0.122266,0.279938,0.138893,0.058723,0.136655,0.064614
204,absurdity,0.071087,0.211499,0.063726,0.113263,0.143294,0.107244,0.205989,0.083899
205,abu,0.178752,0.07642,0.287083,0.098101,0.050531,0.089705,0.079862,0.139545
206,abubakar,0.170586,0.005417,0.320873,0.062468,0.017862,0.076771,0.081872,0.264151
207,abucay,0.214619,0.025325,0.154594,0.075068,0.010297,0.015331,0.008931,0.495835
208,abuda,0.01744,0.104254,0.022197,0.0,0.159592,0.347372,0.349145,0.0
209,abueva,0.033035,0.138778,0.031825,0.231798,0.107679,0.260889,0.125636,0.070361


## Emotion Extraction

In [35]:
df=pd.read_csv('data/train/dataset_en_train_augmented.csv')

'massive australian senator malcolm roberts expose nanotech find covid vaccine say genocide politician expose share lauraabolichannel'

In [23]:
# Aplicamos las transformaciones de emociones
emo_vocab = set(lexicon_dict.keys())
n_emotions = 8  # Número de emociones

# Creamos listas vacías para almacenar los resultados
matched_words_list = []
top_3_emotions_list = []
max_emotion_list = []

# Actualizamos el bucle para usar el nuevo nombre de la función
for text in df['text']:
    # Extraemos el vector de emociones y las palabras coincidentes
    data = extract_emo_representation(text.split(' '), emo_vocab, lexicon_dict, n_emotions)
    emotion_vector = data['emotion_vector']
    matched_words = data['matched_words']
    
    # Convertimos el vector de emociones en un diccionario
    emotion_scores = dictionary_emotion(emotion_vector)
    
    # Calculamos los resultados necesarios
    top_3_emotions = top_n_emotions_names(emotion_scores, 3)  # Obtenemos solo los nombres
    max_emotion = get_max_emotion_name(emotion_scores)  # Obtenemos solo el nombre
    
    # Guardamos los resultados en las listas
    matched_words_list.append(matched_words)
    top_3_emotions_list.append(top_3_emotions)
    max_emotion_list.append(max_emotion)

# Añadimos las nuevas columnas al DataFrame
df['matched_words'] = matched_words_list
df['top_3_emotions'] = top_3_emotions_list
df['max_emotion'] = max_emotion_list


In [24]:
df

Unnamed: 0,id,text,category,annotations,spacy_tokens,matched_words,top_3_emotions,max_emotion
0,5206,this is massive australian senator malcolm rob...,CONSPIRACY,[{'span_text': 'Australian Senator Malcolm Rob...,WyJUSElTIiwgIklTIiwgIk1BU1NJVkUiLCAiQXVzdHJhbG...,"[malcolm, senator, first, roberts, found, aust...","[inspiration, amusement, anger]",inspiration
1,1387,i m deeply concerned that the push to vaccinat...,CRITICAL,[{'span_text': 'I ’m deeply concerned that the...,WyJcdTIwMWMiLCAiSSIsICJcdTIwMTltIiwgImRlZXBseS...,"[push, young, texas, experiment, nothing, conc...","[inspiration, amusement, fear]",inspiration
2,13116,they wanted to know your vaccination status an...,CRITICAL,"[{'span_text': 'someone who died suddenly', 'c...",WyIyMDIxIiwgIjoiLCAiVGhleSIsICJ3YW50ZWQiLCAidG...,"[nt, be, allowed, know, want, who, someone, su...","[inspiration, indifference, amusement]",inspiration
3,11439,anthony fauci once again defended brutal chine...,CRITICAL,"[{'span_text': 'brutal Chinese lockdowns', 'ca...",WyJBbnRob255IiwgIkZhdWNpIiwgIm9uY2UiLCAiYWdhaW...,"[communist, forcefully, okay, people, brutal, ...","[anger, annoyance, inspiration]",anger
4,98,proof has emerged showing that death from wuha...,CRITICAL,[{'span_text': 'death from Wuhan coronavirus (...,WyJQcm9vZiIsICJoYXMiLCAiZW1lcmdlZCIsICJzaG93aW...,"[alive, also, creation, death, proteins, body,...","[inspiration, amusement, fear]",inspiration
...,...,...,...,...,...,...,...,...
3995,4829,police in australia are warning that unvaccina...,CRITICAL,"[{'span_text': 'Police in Australia', 'categor...",WyJQb2xpY2UiLCAiaW4iLCAiQXVzdHJhbGlhIiwgImFyZS...,"[will, double, receive, police, australia, apa...","[anger, inspiration, sadness]",anger
3996,10899,i personally do nt believe putin would set off...,CONSPIRACY,"[{'span_text': 'Deep State', 'category': 'AGEN...",WyJJIiwgInBlcnNvbmFsbHkiLCAiZG8iLCAiblx1MjAxOX...,"[also, filled, know, off, unchecked, may, deep...","[inspiration, amusement, annoyance]",inspiration
3997,10637,pfizer lied we know that there s no doubt abou...,CRITICAL,"[{'span_text': 'Pfizer', 'category': 'AGENT', ...",WyJQZml6ZXIiLCAibGllZCIsICIuIiwgIldlIiwgImtub3...,"[own, health, recorded, know, road, can, europ...","[annoyance, amusement, inspiration]",annoyance
3998,11338,it is utterly bizarre and inexplicable dr john...,CRITICAL,"[{'span_text': 'Dr. John Campbell', 'category'...",WyJcIiIsICJJdCIsICJpcyIsICJ1dHRlcmx5IiwgImJpem...,"[rollout, vaccination, inexplicable, thank, bi...","[inspiration, amusement, indifference]",inspiration


# Save

In [25]:
df.to_csv('data/train/dataset_en_train_with_emotions.csv')

# Emotion Spanish Model

In [8]:
#https://github.com/pysentimiento/pysentimiento , https://pypi.org/project/pysentimiento/0.5.2rc3/



In [13]:
import pandas as pd
from pysentimiento import create_analyzer


df = pd.read_csv("data/train/dataset_es_train_augmented.csv")
df

analyzer = create_analyzer(task="sentiment", lang="es")


def get_sentiment(text):
    # Obtener el resultado de la predicción
    result = analyzer.predict(text)
    # El sentimiento predicho es POS, NEG, o NEU
    return result.output

# Aplicar la función a la columna 'text' del DataFrame
df['sentiment'] = df['text'].apply(get_sentiment)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
df.to_csv('data/train/dataset_es_train_with_sentiment.csv',index=False)

In [22]:
df=pd.read_csv('data/train/dataset_es_train_with_sentiment.csv')
df

Unnamed: 0,id,text,category,annotations,spacy_tokens,sentiment
0,2807,fallo en matrix hoy el señor joan ramón laport...,CRITICAL,[{'span_text': 'el señor Joan Ramón Laporte Ro...,WyJGYWxsbyIsICJlbiIsICJNYXRyaXgiLCAiMDgvMDIvMj...,NEG
1,3054,siento ya tdas las vacunas vienen contaminadas...,CRITICAL,"[{'span_text': 'mi sobrina', 'category': 'VICT...",WyJTaWVudG8iLCAieWEiLCAidGRhcyIsICJsYXMiLCAidm...,NEG
2,268,veo que curiosamente te autoproclamados interl...,CONSPIRACY,"[{'span_text': 'todo el grupo', 'category': 'C...",WyJWZW8iLCAicXVlIiwgImN1cmlvc2FtZW50ZSIsICJ0ZS...,NEG
3,2669,documental vacunas una inyección en la oscurid...,CRITICAL,[{'span_text': '[ Documental ] Vacunas : Una i...,WyJbIiwgIkRvY3VtZW50YWwiLCAiXSIsICJWYWN1bmFzIi...,NEU
4,3205,una sugerencia para los que se han vacunado y ...,CONSPIRACY,[{'span_text': 'los que se han vacunado y no q...,WyJVbmEiLCAic3VnZXJlbmNpYSIsICJwYXJhIiwgImxvcy...,POS
...,...,...,...,...,...,...
7995,mr5w0,Dr. Robert Malone . co inventor de la tecnolog...,CRITICAL,,,NEG
7996,uYwCK,una pregunta la vacuna también causa hipotiroi...,CRITICAL,,,NEG
7997,CFz4d,Eric Clapton el famoso guitarrista cuenta cómo...,CRITICAL,,,NEU
7998,JstSN,"No es un médico, no es un científico, no es un...",CONSPIRACY,,,NEG
