In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy.lang.es.stop_words import STOP_WORDS as ES_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS as EN_STOP_WORDS

# Lexicon file (path points at the Spanish NRC EmoLex export); read as tab-separated text.
spanish_lexicon_path = "NRC-Emotion-Lexicon/OneFilePerLanguage/Spanish-NRC-EmoLex.txt"
spanish_lexicon_df = pd.read_csv(spanish_lexicon_path, sep="\t")

# Spanish dataset file (dataset_es_train.json), loaded as a DataFrame.
spanish_dataset_path = "dataset_oppositional/dataset_es_train.json"
spanish_dataset_df = pd.read_json(spanish_dataset_path)

# English dataset file (dataset_en_train.json), loaded as a DataFrame.
english_dataset_path = "dataset_oppositional/dataset_en_train.json"
english_dataset_df = pd.read_json(english_dataset_path)

# Last expression of the cell: preview the first rows of the English frame.
english_dataset_df.head()


Unnamed: 0,id,text,category,annotations,spacy_tokens
0,5206,THIS IS MASSIVE Australian Senator Malcolm Rob...,CONSPIRACY,[{'span_text': 'Australian Senator Malcolm Rob...,WyJUSElTIiwgIklTIiwgIk1BU1NJVkUiLCAiQXVzdHJhbG...
1,1387,“ I ’m deeply concerned that the push to vacci...,CRITICAL,[{'span_text': 'I ’m deeply concerned that the...,WyJcdTIwMWMiLCAiSSIsICJcdTIwMTltIiwgImRlZXBseS...
2,13116,2021 : They wanted to know your vaccination st...,CRITICAL,"[{'span_text': 'someone who died suddenly', 'c...",WyIyMDIxIiwgIjoiLCAiVGhleSIsICJ3YW50ZWQiLCAidG...
3,11439,Anthony Fauci once again defended brutal Chine...,CRITICAL,"[{'span_text': 'brutal Chinese lockdowns', 'ca...",WyJBbnRob255IiwgIkZhdWNpIiwgIm9uY2UiLCAiYWdhaW...
4,98,Proof has emerged showing that death from Wuha...,CRITICAL,[{'span_text': 'death from Wuhan coronavirus (...,WyJQcm9vZiIsICJoYXMiLCAiZW1lcmdlZCIsICJzaG93aW...


In [3]:
def create_emotion_vector(tokens: list, lexicon: "pd.DataFrame | None" = None, key="Spanish Word",
                          emotions=("anger", "fear", "joy", "sadness")):
    """Build a normalized emotion vector for a list of tokens.

    Lexicon rows whose ``key`` column value appears in ``tokens`` are selected,
    their ``emotions`` columns are summed column-wise, and the sums are divided
    by their grand total so the result adds up to 1.

    Note: selection is a membership test, so each matching lexicon row counts
    once no matter how many times the token occurs in ``tokens``.

    Args:
        tokens: Words to look up in the lexicon.
        lexicon: Lexicon frame containing ``key`` and the ``emotions`` columns.
            When omitted, the notebook-level ``spanish_lexicon_df`` is used.
        key: Name of the lexicon column holding the words to match against.
        emotions: Lexicon columns to aggregate, in output order.

    Returns:
        A 1-D float ``numpy`` array with one entry per emotion. It is all
        zeros (still float) when no token matches or every matched score is 0.
    """
    if lexicon is None:
        # Looked up at call time rather than frozen into the signature, so a
        # re-run of the lexicon-loading cell is picked up without redefining
        # this function.
        lexicon = spanish_lexicon_df

    matched = lexicon.loc[lexicon[key].isin(tokens), list(emotions)]
    # Force float so both return paths below yield the same dtype.
    totals = matched.to_numpy(dtype=float).sum(axis=0)
    grand_total = totals.sum()
    if grand_total == 0:
        # Nothing to normalize; avoid dividing by zero.
        return totals
    return totals / grand_total

def tokenize_text(text: str, stopwords, nlp):
    """Run ``nlp`` over ``text`` and return the lowercased lemmas of the kept tokens.

    A token is dropped when it is flagged ``is_punct`` or ``is_digit``, when
    its text starts with ``@`` (a mention), or when its lowercased text is
    found in ``stopwords``.

    Args:
        text: Raw text to process.
        stopwords: Container of lowercase words to exclude.
        nlp: Callable that turns ``text`` into an iterable of tokens exposing
            ``text``, ``lemma_``, ``is_punct`` and ``is_digit``.

    Returns:
        List of lowercased lemmas, in document order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Punctuation and digit tokens carry no lexical content here.
        if tok.is_punct or tok.is_digit:
            continue
        # Skip @mentions.
        if tok.text.startswith('@'):
            continue
        # The stopword test uses the surface form, not the lemma.
        if tok.text.lower() in stopwords:
            continue
        kept_lemmas.append(tok.lemma_.lower())

    return kept_lemmas


In [4]:
# Spanish run: tokenize every text, turn the tokens into an emotion vector,
# attach the vectors as a new column and write the frame to disk.
nlp = spacy.load('es_core_news_sm')


def _tokenize_spanish(text):
    # Binds the Spanish stopword set and the spaCy model loaded above.
    return tokenize_text(text, ES_STOP_WORDS, nlp)


text_tokenized = spanish_dataset_df["text"].apply(_tokenize_spanish)
# create_emotion_vector is called with its default lexicon and key column.
emotion_vectors = text_tokenized.apply(create_emotion_vector)
spanish_dataset_df["emotions"] = emotion_vectors
spanish_dataset_df.to_json(
    "dataset_oppositional/dataset_es_train_emolex.json",
    orient="records",
)

In [6]:
# English run: same steps as for Spanish, but with the English spaCy model,
# the English stopword set, and lexicon matching on the "English Word" column.
nlp = spacy.load('en_core_web_sm')


def _tokenize_english(text):
    # Binds the English stopword set and the spaCy model loaded above.
    return tokenize_text(text, EN_STOP_WORDS, nlp)


def _english_emotion_vector(tokens):
    # Default lexicon frame, matched on its English column.
    return create_emotion_vector(tokens, key="English Word")


text_tokenized = english_dataset_df["text"].apply(_tokenize_english)
emotion_vectors = text_tokenized.apply(_english_emotion_vector)
english_dataset_df["emotions"] = emotion_vectors
english_dataset_df.to_json(
    "dataset_oppositional/dataset_en_train_emolex.json",
    orient="records",
)

In [24]:
from transformers import TextClassificationPipeline
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np

# Score every English text with the GoEmotions classifier and store, per row,
# one sigmoid score per model label (ordered by the model's label ids).
import torch  # only needed to check whether a GPU is available

model_name = "SamLowe/roberta-base-go_emotions"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Maps a label name to its column position in the feature matrix below.
label2id = model.config.label2id

english_dataset_path = "dataset_oppositional/dataset_en_train.json"
english_dataset_df = pd.read_json(english_dataset_path)
# Copy of the text column; it is written to the output JSON as "input".
english_dataset_df["input"] = english_dataset_df["text"]
# Number of labels. Not used in this cell; kept in case other cells read it.
all_feature_vectors = len(label2id)

dst = Dataset.from_pandas(english_dataset_df)

# One row per text, one column per label; filled in from the pipeline output.
feature_vects = np.zeros((len(dst), len(label2id)))

# Fall back to CPU instead of failing when no CUDA device is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = TextClassificationPipeline(
    model=model,
    top_k=None,  # return a score for every label, not just the best one
    tokenizer=tokenizer,
    batch_size=32,
    device=device,
    function_to_apply='sigmoid',
    max_length=512,
    truncation=True,
)
# NOTE(review): sample input that is not used in this cell; kept in case other cells read it.
sentences = ["https :// t . me / rtnews / 17878   Kazakhstan ’s President Tokayev announces he is taking over as chairman of the Security Council Tokayev said that he will not leave the country under any circumstances in his address to the nation Wednesday . The Kazakh leader also told citizens : a new package of proposals will be released in the near future authorities intend to act as harshly as possible against offenders during the protests law enforcement officers have been killed during the confrontations THIS IS ALL A [ DS ] ARKITE / SINTI CABAL BELGAE BLACK - OP TO STOP ROGUE KAZAKHSTAN FROM TAKING OVER CRYPTOCURRENCY MINING FROM [ DS ] MONGOL CCP CHINA https :// t . me / sineinjuria / 53018 # DeepStateInFullOvertPanic "]

# Hand the texts over as a plain Python list of strings.
model_outputs = classifier(list(dst["text"]))

# Each element of model_outputs is a list of {"label": ..., "score": ...}
# dicts, one per label; place each score in the column given by label2id.
for row, output in enumerate(model_outputs):
    for current_output in output:
        label_idx = label2id[current_output["label"]]
        feature_vects[row, label_idx] = current_output["score"]

english_dataset_df["emotions"] = feature_vects.tolist()
english_dataset_df.to_json("dataset_oppositional/dataset_en_train_transformer.json", orient="records")




[[{'label': 'neutral', 'score': 0.8581225872039795}, {'label': 'approval', 'score': 0.037626493722200394}, {'label': 'admiration', 'score': 0.02927643433213234}, {'label': 'annoyance', 'score': 0.02747703529894352}, {'label': 'realization', 'score': 0.01666192337870598}, {'label': 'disgust', 'score': 0.010202311910688877}, {'label': 'disappointment', 'score': 0.008684122934937477}, {'label': 'anger', 'score': 0.008533907122910023}, {'label': 'disapproval', 'score': 0.006628877483308315}, {'label': 'fear', 'score': 0.003741883672773838}, {'label': 'sadness', 'score': 0.0031045235227793455}, {'label': 'surprise', 'score': 0.0030986317433416843}, {'label': 'excitement', 'score': 0.0024418821558356285}, {'label': 'embarrassment', 'score': 0.00219937227666378}, {'label': 'optimism', 'score': 0.0016259668627753854}, {'label': 'pride', 'score': 0.0013376882998272777}, {'label': 'amusement', 'score': 0.0012305364944040775}, {'label': 'joy', 'score': 0.0009597459575161338}, {'label': 'curiosity