In [43]:
import pandas as pd 
import numpy as np
import spacy
from scipy.special import softmax
from spacy.lang.es.stop_words import STOP_WORDS

# Each input is declared next to the frame that is read from it.

# NRC emotion lexicon, stored as a tab-separated text file.
spanish_lexicon_path = "NRC-Emotion-Lexicon/OneFilePerLanguage/Spanish-NRC-EmoLex.txt"
spanish_lexicon_df = pd.read_csv(spanish_lexicon_path, delimiter="\t")

# Spanish training split (JSON).
spanish_dataset_path = "dataset_oppositional/dataset_es_train.json"
spanish_dataset_df = pd.read_json(spanish_dataset_path)

# English training split (JSON).
english_dataset_path = "dataset_oppositional/dataset_en_train.json"
english_dataset_df = pd.read_json(english_dataset_path)

# Preview the first five rows of the English split.
english_dataset_df.head(n=5)


Unnamed: 0,id,text,category,annotations,spacy_tokens
0,5206,THIS IS MASSIVE Australian Senator Malcolm Rob...,CONSPIRACY,[{'span_text': 'Australian Senator Malcolm Rob...,WyJUSElTIiwgIklTIiwgIk1BU1NJVkUiLCAiQXVzdHJhbG...
1,1387,“ I ’m deeply concerned that the push to vacci...,CRITICAL,[{'span_text': 'I ’m deeply concerned that the...,WyJcdTIwMWMiLCAiSSIsICJcdTIwMTltIiwgImRlZXBseS...
2,13116,2021 : They wanted to know your vaccination st...,CRITICAL,"[{'span_text': 'someone who died suddenly', 'c...",WyIyMDIxIiwgIjoiLCAiVGhleSIsICJ3YW50ZWQiLCAidG...
3,11439,Anthony Fauci once again defended brutal Chine...,CRITICAL,"[{'span_text': 'brutal Chinese lockdowns', 'ca...",WyJBbnRob255IiwgIkZhdWNpIiwgIm9uY2UiLCAiYWdhaW...
4,98,Proof has emerged showing that death from Wuha...,CRITICAL,[{'span_text': 'death from Wuhan coronavirus (...,WyJQcm9vZiIsICJoYXMiLCAiZW1lcmdlZCIsICJzaG93aW...


In [44]:
def create_emotion_vector(tokens: list, lexicon: pd.DataFrame = None):
    """Turn a list of tokens into a 4-way emotion distribution.

    Every lexicon row whose ``"English Word"`` value appears in ``tokens`` is
    selected; the ``anger``, ``fear``, ``joy`` and ``sadness`` columns of those
    rows are summed column-wise and the four sums are passed through a softmax.

    Args:
        tokens: Words to look up in the lexicon. Only membership matters: a
            word repeated in ``tokens`` still selects its lexicon row once.
        lexicon: Frame with an ``"English Word"`` column and numeric
            ``anger`` / ``fear`` / ``joy`` / ``sadness`` columns. When omitted,
            the notebook-level ``spanish_lexicon_df`` is used; it is looked up
            at call time so re-loading the lexicon takes effect without
            re-defining this function.

    Returns:
        A 1-D ``numpy.ndarray`` of length 4, ordered
        (anger, fear, joy, sadness), whose entries sum to 1. When no token
        matches, the sums are all zero and the softmax is uniform (0.25 each).
    """
    if lexicon is None:
        lexicon = spanish_lexicon_df

    emotion_columns = ["anger", "fear", "joy", "sadness"]
    matched_rows = lexicon[lexicon["English Word"].isin(tokens)][emotion_columns]
    emotion_totals = np.sum(matched_rows.values, axis=0)
    return softmax(emotion_totals)

def tokenize_text(text: str, nlp):
    """Lemmatize ``text`` and discard tokens that should not be looked up.

    Args:
        text: Raw text to process.
        nlp: Callable (for example a loaded spaCy pipeline) that turns ``text``
            into an iterable of tokens exposing ``lemma_``, ``text``,
            ``is_punct``, ``is_digit`` and ``is_stop``.

    Returns:
        The lower-cased lemmas, in document order, of every token that is not
        flagged as punctuation, not flagged as a digit, does not start with
        ``"@"`` (mentions) and is not flagged as a stop word.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if not (
            token.is_punct
            or token.is_digit
            or token.text.startswith("@")
            # Use the stop-word flag set by the pipeline itself so the filter
            # matches the language of ``nlp`` instead of a fixed Spanish list.
            or token.is_stop
        )
    ]


# Small English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

# Tokenize every English text; `nlp` is forwarded as the second positional
# argument of tokenize_text.
text_tokenized = english_dataset_df["text"].apply(tokenize_text, args=(nlp,))

# One emotion vector per tokenized text, using the default lexicon.
emotion_vectors = text_tokenized.apply(create_emotion_vector)

In [45]:
# Attach the emotion vectors as a new column (this mutates the frame in place).
english_dataset_df["emotions"] = emotion_vectors

# Write the augmented frame as a JSON array with one object per row.
english_dataset_df.to_json(
    path_or_buf="dataset_oppositional/dataset_en_train_emolex_3.json",
    orient="records",
)

In [62]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Checkpoint name shared by the classifier weights and their tokenizer.
go_emotions_checkpoint = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(go_emotions_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(go_emotions_checkpoint)

# top_k=None asks the pipeline for a score on every label, not only the best one.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
)

# Single sample text used to try the classifier out.
sentences = ["https :// t . me / rtnews / 17878   Kazakhstan ’s President Tokayev announces he is taking over as chairman of the Security Council Tokayev said that he will not leave the country under any circumstances in his address to the nation Wednesday . The Kazakh leader also told citizens : a new package of proposals will be released in the near future authorities intend to act as harshly as possible against offenders during the protests law enforcement officers have been killed during the confrontations THIS IS ALL A [ DS ] ARKITE / SINTI CABAL BELGAE BLACK - OP TO STOP ROGUE KAZAKHSTAN FROM TAKING OVER CRYPTOCURRENCY MINING FROM [ DS ] MONGOL CCP CHINA https :// t . me / sineinjuria / 53018 # DeepStateInFullOvertPanic "]

model_outputs = classifier(sentences)
# For each input text: a list of {'label', 'score'} dicts, one per label.
print(model_outputs[0])




[{'label': 'neutral', 'score': 0.9479948282241821}, {'label': 'approval', 'score': 0.034526925534009933}, {'label': 'realization', 'score': 0.012649599462747574}, {'label': 'annoyance', 'score': 0.005309795029461384}, {'label': 'optimism', 'score': 0.005123916082084179}, {'label': 'disappointment', 'score': 0.002947468077763915}, {'label': 'sadness', 'score': 0.0029114806093275547}, {'label': 'disapproval', 'score': 0.0028327126055955887}, {'label': 'excitement', 'score': 0.0028063568752259016}, {'label': 'desire', 'score': 0.0027442832943052053}, {'label': 'fear', 'score': 0.002469258150085807}, {'label': 'joy', 'score': 0.0018979558954015374}, {'label': 'admiration', 'score': 0.0017544840229675174}, {'label': 'disgust', 'score': 0.0016264236764982343}, {'label': 'amusement', 'score': 0.0015824248548597097}, {'label': 'confusion', 'score': 0.0015744833508506417}, {'label': 'anger', 'score': 0.001533654984086752}, {'label': 'caring', 'score': 0.001272394903935492}, {'label': 'love', 's

In [65]:
# Show the label-name -> class-index mapping stored in the loaded model's config.
model.config.label2id

{'admiration': 0,
 'amusement': 1,
 'anger': 2,
 'annoyance': 3,
 'approval': 4,
 'caring': 5,
 'confusion': 6,
 'curiosity': 7,
 'desire': 8,
 'disappointment': 9,
 'disapproval': 10,
 'disgust': 11,
 'embarrassment': 12,
 'excitement': 13,
 'fear': 14,
 'gratitude': 15,
 'grief': 16,
 'joy': 17,
 'love': 18,
 'nervousness': 19,
 'neutral': 27,
 'optimism': 20,
 'pride': 21,
 'realization': 22,
 'relief': 23,
 'remorse': 24,
 'sadness': 25,
 'surprise': 26}