The manually labelled data is loaded. The dataframe contains the columns: name (name of the text note), text, spanish (a binary flag: 1 for Spanish, 0 otherwise), and sentiment.

In [None]:
import pandas as pd
# Read the manually labelled spreadsheet and keep every column except the last one.
df = pd.read_excel('/content/manual_labelling.xlsx').iloc[:, :-1]
df

**BERT**

The necessary libraries are installed and imported.

In [None]:
!pip install pysentimiento

from pysentimiento import create_analyzer
import transformers

# Only let ERROR-level messages from transformers through.
hf_logging = transformers.logging
hf_logging.set_verbosity(hf_logging.ERROR)

# Sentiment analyzer for Spanish ("es") text.
analyzer = create_analyzer(lang="es", task="sentiment")

Sentiments are predicted with BERT and stored in a new column. The number of predictions in each sentiment class is then counted.

In [None]:
# Predict every note in turn and keep only the label stored in the
# prediction's `.output` attribute.
sentiments = [analyzer.predict(note).output for note in df['text']]

df['BERT'] = sentiments

# Tally the predictions per label code ('POS', 'NEU', 'NEG').
label_totals = {
    code: (df['BERT'] == code).sum()
    for code in ('POS', 'NEU', 'NEG')
}
positive_count = label_totals['POS']
neutral_count = label_totals['NEU']
negative_count = label_totals['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

In [None]:
# Replace the short label codes with their spelled-out names.
label_names = {'POS': 'Positive', 'NEG': 'Negative', 'NEU': 'Neutral'}
df['BERT'] = df['BERT'].replace(label_names)
df

**AFINN**

We install and import the necessary libraries.

In [None]:
!pip install textblob
from textblob import TextBlob, Word

!pip install nltk
from nltk.corpus import stopwords
import nltk
# Download the NLTK data packages: stopwords, punkt and wordnet (in that order).
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)

import string


import os

# Spanish stopwords from NLTK, held in a set.
spanish_stopword_list = stopwords.words('spanish')
stop_words = set(spanish_stopword_list)

Firstly, we initialise an empty dictionary to store the AFINN lexicon. Afterwards, we load the AFINN lexicon from a CSV file into a DataFrame. Only the first two columns are used. Then, we iterate through each row of the DataFrame, extracting the word and its sentiment score.

In [None]:
# AFINN lexicon: maps each word ('palabra') to its numeric sentiment score ('puntuacion').
afinn = {}

lexicon_df = pd.read_csv('/content/lexico_afinn.csv', header=0)
# Only the first two columns of the CSV are used.
lexicon_df = lexicon_df.iloc[:, :2]

for word, sentiment_score_str in zip(lexicon_df['palabra'], lexicon_df['puntuacion']):
    try:
        sentiment_score = float(sentiment_score_str)
    except (TypeError, ValueError):
        # Not convertible to a number at all: treat it like a missing score.
        sentiment_score = float('nan')

    # A missing score is read by pandas as NaN, which float() accepts without
    # complaint. Storing it would turn the summed score of any text containing
    # this word into NaN, so NaN scores are rejected like any other invalid value.
    if pd.isna(sentiment_score):
        print(f"Invalid sentiment score '{sentiment_score_str}' for word '{word}'. Skipping row.")
        continue

    afinn[word] = sentiment_score

We define a function that calculates the sentiment of the text. First, convert the text to lowercase and split it into words. Then, calculate the sentiment score by summing the scores of individual words using the AFINN dictionary.

In [None]:
def calculate_sentiment(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral' using the AFINN scores.

    The text is lower-cased and split on whitespace; each token's score is
    looked up in the module-level ``afinn`` dictionary (tokens not found
    count as 0) and the scores are added up. The sign of the total decides
    the label; a total of exactly zero is 'Neutral'.
    """
    total = 0
    for token in text.lower().split():
        total += afinn.get(token, 0)

    if total > 0:
        return 'Positive'
    if total < 0:
        return 'Negative'
    return 'Neutral'

We also define a list of Catalan stopwords and add them to the Spanish ones.

In [None]:
# Hand-written list of Catalan stopwords (the duplicated assignment target was a typo).
# Duplicate entries ('un', 'per') are harmless because the list is merged into a set.
# NOTE(review): the multi-word entries ('per que', 'des de') can only match a token
# that itself contains a space — confirm whether they should be split into single words.
catalan_stopwords = [
    'de', 'es', 'i', 'a', 'o', 'un', 'una', 'unes', 'uns', 'un', 'tot',
    'també', 'altre', 'algun', 'alguna', 'alguns', 'algunes', 'ser', 'és',
    'soc', 'ets', 'som', 'estic', 'està', 'estem', 'esteu', 'estan', 'com',
    'en', 'per', 'perquè', 'per que', 'estat', 'estava', 'ans', 'abans',
    'éssent', 'ambdós', 'però', 'per', 'poder', 'potser', 'puc', 'podem',
    'podeu', 'poden', 'vaig', 'va', 'van', 'fer', 'faig', 'fa', 'fem',
    'feu', 'fan', 'cada', 'fi', 'inclòs', 'primer', 'des de', 'conseguir',
    'consegueixo', 'consigueix', 'consigueixes', 'conseguim', 'consigueixen',
    'anar', 'haver', 'tenir', 'tinc', 'te', 'tenim', 'teniu', 'tene', 'el',
    'la', 'les', 'els', 'seu', 'aquí', 'meu', 'teu', 'ells', 'elles', 'ens',
    'nosaltres', 'vosaltres', 'si', 'dins', 'sols', 'solament', 'saber',
    'saps', 'sap', 'sabem', 'sabeu', 'saben', 'últim', 'llarg', 'bastant',
    'fas', 'molts', 'aquells', 'aquelles', 'seus', 'llavors', 'sota', 'dalt',
    'ús', 'molt', 'era', 'eres', 'erem', 'eren', 'mode', 'bé', 'quant',
    'quan', 'on', 'mentre', 'qui', 'amb', 'entre', 'sense', 'jo', 'aquell'
]

# Extend the existing stop-word set in place with the Catalan words.
stop_words.update(catalan_stopwords)

We define a function to preprocess text data. The text is converted to lower case, punctuation is removed, the text is tokenised using TextBlob, stopwords are removed, and the remaining tokens are lemmatised.

In [None]:
# Characters deleted before tokenising: ASCII punctuation plus the inverted
# question/exclamation marks, guillemets, curly quotes and ellipsis, none of
# which are part of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…')


def preprocess_text(text):
    """Normalise one note for lexicon-based scoring.

    Steps, in order: lower-case, strip punctuation, tokenise with TextBlob,
    drop tokens found in ``stop_words``, lemmatise the remaining tokens and
    join them with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw note text. Missing values (as detected by ``pd.isna``) give "".
        Other non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The preprocessed text.
    """
    if pd.isna(text):
        return ""
    # str() keeps non-string cells (e.g. numbers) from failing on .lower().
    text = str(text).lower()
    # Without the extra marks a token such as '¡hola' would keep its '¡' and
    # never match a stop word or a lexicon entry.
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = TextBlob(text).words
    tokens = [word for word in tokens if word not in stop_words]
    # NOTE(review): Word.lemmatize presumably relies on the English WordNet
    # downloaded earlier — confirm it has any effect on Spanish/Catalan tokens.
    lemmatized_tokens = [Word(word).lemmatize() for word in tokens]
    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

# Preprocess every note and store the result alongside the original text.
df['preprocessed_text'] = [preprocess_text(note) for note in df['text']]

We apply the calculate_sentiment function to the 'preprocessed_text' column of the DataFrame and store the sentiment labels in a new column 'afinn'.

In [None]:
# Label each preprocessed note with the AFINN-based sentiment.
df['afinn'] = df['preprocessed_text'].apply(calculate_sentiment)

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') in the 'afinn' column.

In [None]:
# Number of notes per AFINN sentiment label.
afinn_labels = df['afinn']
sentiment_counts = afinn_labels.value_counts()
sentiment_counts

**Evaluation RoBERTuito**

Two dataframes are created with Spanish, and Catalan and bilingual instances, respectively, from the original dataframe 'df'.

In [None]:
# Split the labelled notes on the 'spanish' flag: 1 goes to `spanish`, 0 to `cat_bi`.
spanish_flag = df['spanish']
spanish = df[spanish_flag == 1]
cat_bi = df[spanish_flag == 0]

We import the necessary evaluation metrics from scikit-learn.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

Accuracy, F1 score and confusion matrix are calculated for the overall manually labelled instances ('df' dataframe), the Spanish instances ('spanish' dataframe), and the Catalan and bilingual instances ('cat_bi' dataframe) for RoBERTuito.

In [None]:
#OVERALL
# Compare the 'BERT' predictions with the manual 'sentiment' labels on all rows of df.

#accuracy
accuracy = accuracy_score(df['sentiment'], df['BERT'])
print("Accuracy:", accuracy)

#f1 (per-class F1 averaged with class-support weights)
f1_bert = f1_score(df['sentiment'], df['BERT'], average='weighted')
print("F1 Score:", f1_bert)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels = sorted(set(df['sentiment']) | set(df['BERT']))
conf_matrix = confusion_matrix(df['sentiment'], df['BERT'], labels=class_labels)
if 'Positive' in class_labels:
    positive_idx = class_labels.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives = conf_matrix[positive_idx, positive_idx]
else:
    true_positives = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix)
print("True Positives:", true_positives)

In [None]:
#Catalan & Bilingual
# Compare the 'BERT' predictions with the manual 'sentiment' labels on the cat_bi rows.

#accuracy
accuracy_b_cb = accuracy_score(cat_bi['sentiment'], cat_bi['BERT'])
print("Accuracy:", accuracy_b_cb)

#f1 (per-class F1 averaged with class-support weights)
f1_bert_catbi = f1_score(cat_bi['sentiment'], cat_bi['BERT'], average='weighted')
print("F1 Score:", f1_bert_catbi)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels_b_cb = sorted(set(cat_bi['sentiment']) | set(cat_bi['BERT']))
conf_matrix_b_cb = confusion_matrix(cat_bi['sentiment'], cat_bi['BERT'], labels=class_labels_b_cb)
if 'Positive' in class_labels_b_cb:
    positive_idx_b_cb = class_labels_b_cb.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives_b_cb = conf_matrix_b_cb[positive_idx_b_cb, positive_idx_b_cb]
else:
    true_positives_b_cb = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix_b_cb)
print("True Positives:", true_positives_b_cb)

In [None]:
#Spanish
# Compare the 'BERT' predictions with the manual 'sentiment' labels on the spanish rows.

#accuracy
accuracy_b_s = accuracy_score(spanish['sentiment'], spanish['BERT'])
print("Accuracy:", accuracy_b_s)

#f1 (per-class F1 averaged with class-support weights)
f1_bert_es = f1_score(spanish['sentiment'], spanish['BERT'], average='weighted')
print("F1 Score:", f1_bert_es)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels_b_s = sorted(set(spanish['sentiment']) | set(spanish['BERT']))
conf_matrix_b_s = confusion_matrix(spanish['sentiment'], spanish['BERT'], labels=class_labels_b_s)
if 'Positive' in class_labels_b_s:
    positive_idx_b_s = class_labels_b_s.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives_b_s = conf_matrix_b_s[positive_idx_b_s, positive_idx_b_s]
else:
    true_positives_b_s = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix_b_s)
print("True Positives:", true_positives_b_s)

**Evaluation AFINN**

Accuracy, F1 score and confusion matrix are calculated for the overall manually labelled instances ('df' dataframe), the Spanish instances ('spanish' dataframe), and the Catalan and bilingual instances ('cat_bi' dataframe) for AFINN.

In [None]:
#OVERALL
# Compare the 'afinn' predictions with the manual 'sentiment' labels on all rows of df.

#accuracy
accuracy_afinn = accuracy_score(df['sentiment'], df['afinn'])
print("Accuracy:", accuracy_afinn)

#f1 (per-class F1 averaged with class-support weights)
f1_afinn = f1_score(df['sentiment'], df['afinn'], average='weighted')
print("F1 Score:", f1_afinn)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels_afinn = sorted(set(df['sentiment']) | set(df['afinn']))
conf_matrix_afinn = confusion_matrix(df['sentiment'], df['afinn'], labels=class_labels_afinn)
if 'Positive' in class_labels_afinn:
    positive_idx_afinn = class_labels_afinn.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives_afinn = conf_matrix_afinn[positive_idx_afinn, positive_idx_afinn]
else:
    true_positives_afinn = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix_afinn)
print("True Positives:", true_positives_afinn)

In [None]:
#Catalan & Bilingual
# Compare the 'afinn' predictions with the manual 'sentiment' labels on the cat_bi rows.

#accuracy
accuracy_a_cb = accuracy_score(cat_bi['sentiment'], cat_bi['afinn'])
print("Accuracy:", accuracy_a_cb)

#f1 (per-class F1 averaged with class-support weights)
# NOTE(review): despite its `_es` suffix this variable holds the Catalan & bilingual
# F1; the name looks swapped with `f1_afinn_catbi` in the Spanish AFINN cell. It is
# kept unchanged here so that existing references to it keep working.
f1_afinn_es = f1_score(cat_bi['sentiment'], cat_bi['afinn'], average='weighted')
print("F1 Score:", f1_afinn_es)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels_a_cb = sorted(set(cat_bi['sentiment']) | set(cat_bi['afinn']))
conf_matrix_a_cb = confusion_matrix(cat_bi['sentiment'], cat_bi['afinn'], labels=class_labels_a_cb)
if 'Positive' in class_labels_a_cb:
    positive_idx_a_cb = class_labels_a_cb.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives_a_cb = conf_matrix_a_cb[positive_idx_a_cb, positive_idx_a_cb]
else:
    true_positives_a_cb = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix_a_cb)
print("True Positives:", true_positives_a_cb)

In [None]:
#Spanish
# Compare the 'afinn' predictions with the manual 'sentiment' labels on the spanish rows.

#accuracy
accuracy_a_s = accuracy_score(spanish['sentiment'], spanish['afinn'])
print("Accuracy:", accuracy_a_s)

#f1 (per-class F1 averaged with class-support weights)
# NOTE(review): despite its `_catbi` suffix this variable holds the Spanish F1;
# the name looks swapped with `f1_afinn_es` in the Catalan & bilingual AFINN cell.
# It is kept unchanged here so that existing references to it keep working.
f1_afinn_catbi = f1_score(spanish['sentiment'], spanish['afinn'], average='weighted')
print("F1 Score:", f1_afinn_catbi)

#confusion matrix
# The label order is passed explicitly (sorted, which is also scikit-learn's default)
# so the matrix is unchanged but the 'Positive' cell can be looked up by name.
# Indexing [1, 1] picks the second label in sorted order, which for
# Negative/Neutral/Positive is 'Neutral', not 'Positive'.
class_labels_a_s = sorted(set(spanish['sentiment']) | set(spanish['afinn']))
conf_matrix_a_s = confusion_matrix(spanish['sentiment'], spanish['afinn'], labels=class_labels_a_s)
if 'Positive' in class_labels_a_s:
    positive_idx_a_s = class_labels_a_s.index('Positive')
    # Rows are true labels, columns are predictions: the diagonal cell counts
    # 'Positive' notes that were also predicted 'Positive'.
    true_positives_a_s = conf_matrix_a_s[positive_idx_a_s, positive_idx_a_s]
else:
    true_positives_a_s = 0  # 'Positive' does not occur in these rows
print("Confusion Matrix:")
print(conf_matrix_a_s)
print("True Positives:", true_positives_a_s)