In [2]:
#importamos las librerias necesarias
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from tensorflow import keras

In [3]:
# Load the raw data: tweets commenting on a range of video games.
# The file is read without a header row, so columns are addressed by position.
# Only the tweet text and its label are used in the rest of the notebook.
df = pd.read_csv(filepath_or_buffer="twitter_training.csv", header=None)
df.head(5)

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# Discard columns 0 and 1; only the label (column 2) and the tweet text
# (column 3) are used afterwards.
# errors="ignore" keeps this cell re-runnable: `df` is overwritten in place,
# so a second execution would otherwise raise KeyError because the columns
# are already gone.
df = df.drop(columns=[0, 1], errors="ignore")
df.head()

Unnamed: 0,2,3
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
# Size of the frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(74682, 2)

In [8]:
# Remove the tokens that can hurt model performance: URLs, mentions,
# hashtags and non-alphanumeric characters.
import re

def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Args:
        text: The raw tweet. Missing values (``None`` or a float NaN) yield
            an empty string; any other non-string value is converted with
            ``str`` first.

    Returns:
        str: The text with URLs, ``@mentions`` and ``#hashtags`` removed and
        every run of non-word characters collapsed into a single space.
    """
    # A missing tweet must not be turned into the literal word "nan", which
    # the model would otherwise see as real text. `text != text` is only
    # true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if not isinstance(text, str):  # Coerce any other non-string value
        text = str(text)

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions (@user)
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\W+', ' ', text)  # Collapse non-alphanumeric runs into one space
    return text

In [9]:
# Cleaned tweet text (column 3) goes into "X"; the labels (column 2) go into "y".
cleaned_tweets = list(map(clean_text, df[3]))
X = np.array(cleaned_tweets, dtype=str)
y = np.array(df[2], copy=True)

In [10]:
# Inspect which distinct classes occur in "y".
distinct_labels = np.unique(y)
distinct_labels

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [11]:
# Count the samples per class to see whether the classes are imbalanced.
from collections import Counter

class_counts = Counter(y)
print(class_counts)

Counter({'Negative': 22542, 'Positive': 20832, 'Neutral': 18318, 'Irrelevant': 12990})


In [12]:
# Convert the string labels to integer class ids so the model can be trained.
label_to_id = {"Positive": 3, "Neutral": 2, "Irrelevant": 1, "Negative": 0}
condition = [y == label for label in label_to_id]
values = list(label_to_id.values())

# np.select fills unmatched entries with its default, which is 0 unless told
# otherwise. That would silently relabel any unexpected value as "Negative",
# and would also happen to every row if this cell were run a second time on
# an already-encoded "y". Use a sentinel and fail loudly instead.
encoded = np.select(condition, values, default=-1)
if (encoded == -1).any():
    raise ValueError(
        "y contains values outside the known labels "
        f"{sorted(label_to_id)}; if this cell was already executed, "
        "rebuild y from the raw labels before running it again."
    )
y = encoded

In [13]:
# Split the data into a training set and a test set (80% / 20%).
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.20, "random_state": 42, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Configuration for the model: the pretrained checkpoint to fine-tune and
# the number of target classes.
model_name, num_classes = "distilbert-base-uncased", 4

In [14]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
# Tokenize the text of both splits so it can be fed to the DistilBERT model.
# Both splits must share the same arguments. Without an explicit max_length,
# truncation=True falls back to the model's maximum input length, so the test
# set would be truncated (and padded) differently from the training set.
max_length = 128
tokenizer_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(X_train.tolist(), **tokenizer_options)
test_encodings = tokenizer(X_test.tolist(), **tokenizer_options)

In [16]:
# Last preprocessing step: wrap the tokenized inputs and their labels in
# tf.data.Dataset objects, the format passed to model.fit below.
def _as_tf_dataset(encodings, labels):
    """Pair a tokenizer output (converted to a dict) with its labels as a tf.data.Dataset."""
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)

test_dataset = _as_tf_dataset(test_encodings, y_test)

In [18]:
# Load the pretrained checkpoint with a sequence-classification head that
# has one output per class.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [19]:
# Set the optimizer and the loss function for the model.
# The learning rate is deliberately small: the weights are already
# pretrained, so they should only be adjusted gently.
learning_rate = 1e-5
opt = keras.optimizers.Adam(learning_rate=learning_rate)
# The labels are integer class ids and the model returns raw logits.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=opt,
    loss=loss,
    metrics=["accuracy"],
)

In [20]:
# Fine-tune the model for three epochs in batches of 16, using the held-out
# split as validation data.
batch_size = 16
train_batches = train_dataset.batch(batch_size)
validation_batches = test_dataset.batch(batch_size)
history = model.fit(
    train_batches,
    validation_data=validation_batches,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
