## 1. Seleccione un data-set para una aplicación de clasificación de textos con más de dos clases proveniente de algún repositorio de datos, o de otra fuente

Para este caso, se optó por usar un conjunto de datos de emociones alojado en Hugging Face, conocido como GoEmotions (configuración simplificada), orientado a una clasificación multiclase de texto (28 clases).


> Admiration [0], Amusement, Anger, Annoyance, Approval, Caring, Confusion, Curiosity, Desire, Disappointment, Disapproval, Disgust, Embarrassment, Excitement, Fear, Gratitude, Grief, Joy, Love, Nervousness, Optimism, Pride, Realization, Relief, Remorse, Sadness, Surprise, Neutral [27].
<!-- BRINDAR MÁS INFO -->

## Importando librerías

In [None]:
%pip install datasets==2.15.0 fsspec==2023.9.2 --quiet

In [2]:
import sys

from packaging import version
import tensorflow as tf

# Fail fast when the installed TensorFlow is older than the 2.8.0 minimum checked here.
tf_instalado = version.parse(tf.__version__)
tf_minimo = version.parse("2.8.0")
assert tf_instalado >= tf_minimo

print("TensorFlow versión:", tf.__version__)

TensorFlow versión: 2.19.0


In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow import keras

## Carga del dataset

In [5]:
# Load the "simplified" configuration of GoEmotions (train / validation / test splits).
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

In [7]:
# Print the split summary, then display the label list of the first training example.
print(f"{ds}\n")
ds['train'][0]['labels']

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})



[27]

In [8]:
# The dataset is multi-label: a single text may carry several labels (e.g. [5, 27]).
# It is reduced here to a multi-class problem.
#
# Multi-label -> each text may have several labels at the same time
# Multi-class -> each text has exactly one label out of several possible ones

def to_multiclass(example):
    """Add a scalar ``label`` key holding a single class id for ``example``.

    The first entry of ``example["labels"]`` is kept and any further labels are
    ignored; when the list is empty, id 27 ("neutral") is used instead.
    The incoming mapping is modified in place and is also returned.
    """
    etiquetas_originales = example["labels"]
    if etiquetas_originales:
        example["label"] = etiquetas_originales[0]
    else:
        example["label"] = 27  # 27 is "neutral"
    return example

# Apply the conversion to every split; each example gains a scalar "label" field
# next to its original "labels" list.
ds = ds.map(to_multiclass)


Map: 100%|██████████| 43410/43410 [00:02<00:00, 14480.96 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 15108.52 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 15202.18 examples/s]


In [9]:
# Inspect the first training example, which now also carries the scalar "label" field.
ds['train'][0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej',
 'label': 27}

## Análisis del Dataset **GoEmotions**

In [10]:
# pandas DataFrames make the Hugging Face splits easier to inspect in detail.
df_train, df_test, df_val = (
    pd.DataFrame(ds[particion]) for particion in ("train", "test", "validation")
)

# Head and tail of the training frame, each followed by the column index.
print(f"Vista de Dataset datos de entrenamiento\n{df_train.head()}\n{df_train.columns}")
print(f"\n{df_train.tail()}\n{df_train.columns}")

Vista de Dataset datos de entrenamiento
                                                text labels       id  label
0  My favourite food is anything I didn't have to...   [27]  eebbqej     27
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i     27
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj      2
3                        To make her feel threatened   [14]  ed7ypvh     14
4                             Dirty Southern Wankers    [3]  ed0bdzj      3
Index(['text', 'labels', 'id', 'label'], dtype='object')

                                                    text labels       id  \
43405  Added you mate well I’ve just got the bow and ...   [18]  edsb738   
43406  Always thought that was funny but is it a refe...    [6]  ee7fdou   
43407  What are you talking about? Anything bad that ...    [3]  efgbhks   
43408            More like a baptism, with sexy results!   [13]  ed1naf8   
43409                                    Enjoy the ride!   [17]  e

In [11]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line after each report.
df_train.info()
print("_"*50)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    43410 non-null  object
 1   labels  43410 non-null  object
 2   id      43410 non-null  object
 3   label   43410 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.3+ MB
None
__________________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5427 non-null   object
 1   labels  5427 non-null   object
 2   id      5427 non-null   object
 3   label   5427 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 169.7+ KB
None


In [12]:
# Number of training examples per class id, most frequent first.
print(df_train["label"].value_counts())

label
27    12823
0      4130
4      2596
1      2244
3      2138
15     2096
7      1772
10     1651
2      1547
18     1533
6      1268
9      1028
17     1013
20      974
5       966
25      874
26      751
13      700
22      698
11      580
8       543
14      510
24      404
12      248
19      105
23       96
16       65
21       57
Name: count, dtype: int64


## Construcción

In [13]:
#@title Configuración variables
# Number of examples per batch in the tf.data pipelines.
batch_size = 32
# Every text is padded or truncated to this many token ids by TextVectorization.
# NOTE(review): 600 looks inherited from a long-document example; the comments shown
# in this dataset are far shorter, so most positions are padding. Consider lowering
# it after checking the token-length distribution. TODO confirm.
max_length = 600
# Vocabulary size cap, shared by TextVectorization and the Embedding input dimension.
max_tokens = 20000
# Number of emotion classes (ids 0..27), used as the width of the output layer.
num_classes = 28

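### Modelo con codificación one hot
```
embedded = tf.one_hot(inputs, depth=max_tokens)
```

> **Nota:** el modelo que se construye más abajo no aplica `tf.one_hot`; usa una capa `Embedding` entrenable (`layers.Embedding`) sobre los índices enteros de los tokens.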

In [14]:
def dataset_to_tf(ds_split):
    """Build an unbatched ``tf.data.Dataset`` of ``(text, label)`` pairs from one split.

    Args:
        ds_split: Iterable of examples, each exposing ``"text"`` and ``"label"`` keys.

    Returns:
        The dataset produced by ``tf.data.Dataset.from_tensor_slices`` over the
        collected texts and labels, in the split's original order.
    """
    texts, labels = [], []
    # Single pass: each example is read once instead of once per column.
    for ex in ds_split:
        texts.append(ex["text"])
        labels.append(ex["label"])
    return tf.data.Dataset.from_tensor_slices((texts, labels))

# Convert each Hugging Face split into an unbatched tf.data pipeline of (text, label).
train_ds = dataset_to_tf(ds["train"])
val_ds = dataset_to_tf(ds["validation"])
test_ds = dataset_to_tf(ds["test"])

In [15]:
# Only the training split is shuffled; all three are batched and prefetched.
# NOTE(review): the shuffle buffer (10000) is smaller than the training split
# (43410 rows), so the shuffle is only approximate. Confirm that is acceptable.
# NOTE(review): the names are reassigned in place, so running this cell twice would
# batch already-batched data; re-run the previous cell before re-running this one.
train_ds = train_ds.shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [16]:
# Peek at the first five texts and labels of one training batch.
for example, label in train_ds.take(1):
  print('text: ', example.numpy()[:5])
  print('label: ', label.numpy()[:5])

text:  [b'Also the fishing has been terrible for a month now. I am about to take the 5 story jump my friends.'
 b"*Hey just noticed..* it's your **2nd Cakeday** yosoyjackiejorpjomp! ^(hug)"
 b'Except for [NAME] whose remains are either still on the way to Winterfell or lost altogether'
 b"Downside of city based police. State level policing is where it's at. Problem on the Gold Coast? Send police from Brisbane."
 b'people be sleeping on greens fastball sheesh']
label:  [11 17 27  4 27]


In [17]:
# Turn raw strings into fixed-length sequences of integer token ids.
text_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Build the vocabulary from the training texts only (labels are dropped).
solo_textos = train_ds.map(lambda x, y: x)
text_vectorization.adapt(solo_textos)


def _vectorizar(x, y):
    """Replace the text batch ``x`` with its token ids and pass ``y`` through."""
    return text_vectorization(x), y


int_train_ds = train_ds.map(_vectorizar, num_parallel_calls=tf.data.AUTOTUNE)
int_val_ds = val_ds.map(_vectorizar, num_parallel_calls=tf.data.AUTOTUNE)
int_test_ds = test_ds.map(_vectorizar, num_parallel_calls=tf.data.AUTOTUNE)

In [18]:
# Peek at the first three vectorized texts and labels of one training batch.
for example2, label2 in int_train_ds.take(1):
  print('text: ', example2.numpy()[:3])
  print('label: ', label2.numpy()[:3])

text:  [[  2 607   8 ...   0   0   0]
 [ 40   6  23 ...   0   0   0]
 [104   3 124 ...   0   0   0]]
label:  [11 27 27]


In [19]:
# Look up a couple of vocabulary entries by index, then report the vocabulary size.
vocabulario = text_vectorization.get_vocabulary()
print("2 ---> ",vocabulario[2])
print(" 3 ---> ",vocabulario[3])
print('Vocabulary size: {}'.format(len(vocabulario)))

2 --->  the
 3 --->  i
Vocabulary size: 20000


In [22]:
# First model: integer token ids go through a trainable Embedding layer (this is
# not a one-hot encoding) and then a bidirectional LSTM.
embedding_dim = 64  # example value

inputs = keras.Input(shape=(None,), dtype="int64")
# NOTE(review): the Embedding is built without mask_zero=True, so the LSTM also
# steps over the zero padding that fills each sequence up to max_length.
# Consider enabling masking; TODO confirm the effect on accuracy and speed.
embedded = layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim)(inputs)

x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
# Targets are integer class ids, hence the sparse variant of the cross-entropy loss.
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()
# NOTE(review): the recorded output reports that pydot is not installed, so no
# diagram was produced by this call.
keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [None]:
# model.save('ModeloOneHot.h5')

In [23]:
# Train for 7 epochs with the validation split, then report accuracy on the test split.
model.fit(int_train_ds, epochs=7, validation_data=int_val_ds)

# evaluate() returns [loss, accuracy] for the single metric configured at compile time.
test_loss, test_acc = model.evaluate(int_test_ds)
print(f"Test acc: {test_acc:.3f}")

Epoch 1/7


[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 176ms/step - accuracy: 0.3209 - loss: 2.6792 - val_accuracy: 0.4370 - val_loss: 2.1755
Epoch 2/7
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 168ms/step - accuracy: 0.4412 - loss: 2.1745 - val_accuracy: 0.4779 - val_loss: 1.9700
Epoch 3/7
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 168ms/step - accuracy: 0.4730 - loss: 1.9862 - val_accuracy: 0.5000 - val_loss: 1.9000
Epoch 4/7
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 168ms/step - accuracy: 0.5060 - loss: 1.8486 - val_accuracy: 0.5116 - val_loss: 1.8207
Epoch 5/7
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 178ms/step - accuracy: 0.5239 - loss: 1.7524 - val_accuracy: 0.5147 - val_loss: 1.7998
Epoch 6/7
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 170ms/step - accuracy: 0.5394 - loss: 1.6697 - val_accuracy: 0.5118 - val_loss: 1.7800
Epoch 7/

In [24]:
# Print the model's short repr (name and build status).
print(model)

<Functional name=functional, built=True>


In [29]:
# Export the trained model as an HDF5 file.
# NOTE(review): recent Keras releases treat .h5 as a legacy format; the next cell
# also writes the native .keras archive. Confirm the .h5 copy is still needed.
model.save('ModeloEntrenado.h5')



In [30]:
# Save the trained model in the native .keras format.
model.save('ModeloEntrenado.keras')

In [25]:
# Persist the vocabulary, one token per line, in index order.
vocab = text_vectorization.get_vocabulary()
with open("vocabulario.txt", "w", encoding="utf-8") as f:
    f.writelines(palabra + "\n" for palabra in vocab)

# Persist the label (emotion) names, one per line; list position is the class id.
# These are the names for the simplified version of the dataset.
etiquetas = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

with open("etiquetas.txt", "w", encoding="utf-8") as f:
    f.writelines(e + "\n" for e in etiquetas)

In [26]:
# Try the model on a custom text.
# Chain the vectorizer and the trained model into one model that accepts raw strings.
Entrada = keras.Input(shape=(1,), dtype="string")
Entrada_procesada=text_vectorization(Entrada)
Salida=model(Entrada_procesada)
Inferencia=keras.Model(Entrada,Salida)

Texto_Ejemplo = tf.convert_to_tensor([["Happy birtday my dude"],])
# Other sample inputs to try:
# Ehhh, it's an opinion, it's not wrong or right, just highly unpopular and ill-informed.
# Yikes. I admire your patience
Prediccion=Inferencia(Texto_Ejemplo)
# Take the first (only) row of the batch: one score per class for the input text.
pred = Prediccion.numpy()[0]
print(pred)



[0.08329658 0.03282053 0.00195078 0.00263113 0.0184025  0.0240633
 0.00528014 0.0084544  0.00633586 0.00565686 0.00578733 0.00177717
 0.00439875 0.15509003 0.00914509 0.03902415 0.00115293 0.44840187
 0.0470527  0.0024989  0.03390584 0.00112894 0.00638766 0.0022532
 0.00277917 0.00428299 0.01751015 0.02853099]


In [27]:
# Optional: report the class with the highest predicted probability.
clase_pred = pred.argmax(axis=-1)
print(f"Etiqueta predicha: {clase_pred}")

# Class names come from the dataset's "labels" feature and are indexed by class id.
labels_names = ds["train"].features["labels"].feature.names
nombre_predicho = labels_names[clase_pred]
print(f"Clase predicha corresponde a: {nombre_predicho.capitalize()}")

Etiqueta predicha: 17
Clase predicha corresponde a: Joy


In [28]:
# Relies on `pred` (per-class scores) and `labels_names` from the previous cells.

# Class ids sorted from highest to lowest score.
indices_ordenados = np.argsort(pred)[::-1]

print("Top 3 emociones predichas:")
for idx in indices_ordenados[:3]:
    print(f"{labels_names[idx].capitalize():<12} → {pred[idx]:.3f}")


Top 3 emociones predichas:
Joy          → 0.448
Excitement   → 0.155
Admiration   → 0.083
