## Modelos de prueba sin un Modelo pre-entrenado

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import LabelEncoder
import io
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [50]:
df = pd.read_csv("dataSocialMedia.csv")

In [51]:
columnas = ['message','sentiment']
df = df[columnas]

In [52]:
print(df.isna().sum())
df = df.dropna()
print(df.isna().sum())

message      1
sentiment    0
dtype: int64
message      0
sentiment    0
dtype: int64


PREPROCESAMIENTO

In [53]:
import pandas as pd
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text
df['message'] = df['message'].apply(clean_text)

In [54]:

df['message'] = df['message'].str.lower()

In [56]:

nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))

def remove_stopwords(text):

    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_words)
    return filtered_text
df['message'] = df['message'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:

def remove_extra_spaces(text):
    # Eliminar los espacios en blanco al principio y al final de la cadena
    text = text.strip()
    # Reemplazar múltiples espacios consecutivos con un solo espacio
    text = re.sub(r'\s+', ' ', text)
    return text

df['message'] = df['message'].apply(remove_extra_spaces)
df["message"] = df["message"].astype(str)

In [21]:
df.head()

Unnamed: 0,message,sentiment
0,hacer miss venezuela retirarse miss uni seal p...,negativo
1,imagino dentro plan accin est tener sistema vo...,negativo
2,seleccionar mejor organizacion tomando cuentas...,negativo
3,hermosas,positivo
4,ando enojada clase robo super descarado tena f...,negativo


In [26]:
label_encoder = LabelEncoder()

# Convertir las etiquetas de texto a valores numéricos
y_encoded = label_encoder.fit_transform(df['sentiment'])

class_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
y_labels = label_encoder.inverse_transform(y_encoded)

# Imprimir los valores numéricos y las etiquetas correspondientes
print(y_encoded)  
print(y_labels)   

[0 0 0 ... 1 2 2]
['negativo' 'negativo' 'negativo' ... 'neutro' 'positivo' 'positivo']


In [58]:

label_encoder = LabelEncoder()

# Convertir las etiquetas de texto a valores numéricos
df['label'] = label_encoder.fit_transform(df['sentiment'])

# Mapear los valores numéricos a las etiquetas originales
class_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}


print(df.head())

                                             message sentiment  label
0  hacer miss venezuela retirarse miss uni seal p...  negativo      0
1  imagino dentro plan accin est tener sistema vo...  negativo      0
2  seleccionar mejor organizacion tomando cuentas...  negativo      0
3                                           hermosas  positivo      2
4  ando enojada clase robo super descarado tena f...  negativo      0


Division de los datos y tokenizacion / pruena 1

In [29]:
X = df["message"]
Y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.5, random_state=42)

In [30]:

# Crear conjuntos de datos de TensorFlow a partir de los datos divididos
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
BUFFER_SIZE = 10000
BATCH_SIZE = 20
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'excelente dios bendiga'
 b'amanda dios bendiga hiciste excelente excelente orgullo verte tv tan hermosa tan segura pblico hacindote'
 b'chamaaaaa reina universaaaal hiciste trabajo impecable dejaste nombre venezuela altooo']

labels:  [2 2 2]


In [31]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [32]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'reina', 'corona', 'universo', 'miss', 'amanda',
       'ganadora', 'venezuela', 'gracias', 't', 'mejor', 'hermosa',
       'dios', 'ms', 'amandadudamel', 'robaron', 'siempre', 'hiciste',
       'bella'], dtype='<U16')

In [33]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[ 38,  13,  24,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  6,  13,  24,  18,  38,  38,  42, 277,   1,  28,  12,  28, 174,
        836,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1,   2,   1,  18,  35,  46, 107,  63,   8,   1,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=int64)

In [34]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

Original:  b'excelente dios bendiga'
Round-trip:  excelente dios bendiga                      

Original:  b'amanda dios bendiga hiciste excelente excelente orgullo verte tv tan hermosa tan segura pblico hacindote'
Round-trip:  amanda dios bendiga hiciste excelente excelente orgullo verte [UNK] tan hermosa tan segura pblico [UNK]          

Original:  b'chamaaaaa reina universaaaal hiciste trabajo impecable dejaste nombre venezuela altooo'
Round-trip:  [UNK] reina [UNK] hiciste trabajo impecable dejaste nombre venezuela [UNK]               



División de los datos y tokenizacion / prueba 2

In [35]:

tokenizer = Tokenizer(num_words=1000, oov_token="<00V>")
tokenizer.fit_on_texts(df["message"])


word_index = tokenizer.word_index

In [36]:
sequences =  tokenizer.texts_to_sequences(df["message"])

padded = pad_sequences(sequences, padding="post")

print(padded[1])
print(padded.shape)

[900 294   1   1  46 180   1   1   1   1   1 953   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
(22373, 173)


In [37]:
vocab_size = len(word_index) + 1
embedding_dim = 16
max_length = 300
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

training_sentences = df["message"][0:training_size]
testing_sentences = df["message"][training_size:]
training_labels = df["label"][0:training_size]
testing_labels = df["label"][training_size:]


In [38]:

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sentences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sentences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [39]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])


model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

history = model.fit(training_padded,
                    training_labels,
                    epochs=10,
                    validation_data=(testing_padded, testing_labels))

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.6922 - loss: 0.7737 - val_accuracy: 0.7560 - val_loss: 0.5759
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8145 - loss: 0.4548 - val_accuracy: 0.7922 - val_loss: 0.4990
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8656 - loss: 0.3612 - val_accuracy: 0.8255 - val_loss: 0.4733
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8893 - loss: 0.3022 - val_accuracy: 0.8437 - val_loss: 0.4524
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8987 - loss: 0.2730 - val_accuracy: 0.8361 - val_loss: 0.4563
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9071 - loss: 0.2499 - val_accuracy: 0.8428 - val_loss: 0.4634
Epoch 7/10
[1m625/625[0m 

In [40]:
# Obtener las métricas de evaluación en los datos de prueba
loss, accuracy = model.evaluate(testing_padded, testing_labels)

# Imprimir los resultados de la evaluación
print("Loss:", loss)
print("Accuracy:", accuracy)

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9017 - loss: 0.3011
Loss: 0.5268239378929138
Accuracy: 0.8453434705734253


In [41]:
# Supongamos que tienes una nueva secuencia de texto para hacer la predicción
new_text = [" bella la miss venezuela"]

# Tokenizar el texto utilizando el mismo tokenizer utilizado durante el entrenamiento
new_sequences = tokenizer.texts_to_sequences(new_text)

# Asegurarse de que las secuencias tengan la misma longitud que las secuencias de entrenamiento
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding="post", truncating="post")

# Hacer la predicción utilizando el modelo entrenado
predictions = model.predict(new_padded)

# Obtener la clase predicha (índice con la mayor probabilidad)
predicted_class = tf.argmax(predictions, axis=1).numpy()[0]

# Imprimir la clase predicha
print("Clase predicha:", predicted_class)

# 1 neutro, 2 positvo, 0 negativo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
Clase predicha: 2


prueba 3 / modelo con menos epoch

In [42]:

model_1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])


model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])


history = model_1.fit(training_padded,
                    training_labels,
                    epochs=5,
                    validation_data=(testing_padded, testing_labels))

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6977 - loss: 0.7832 - val_accuracy: 0.7526 - val_loss: 0.6150
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7907 - loss: 0.5005 - val_accuracy: 0.7758 - val_loss: 0.5232
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8367 - loss: 0.4107 - val_accuracy: 0.8184 - val_loss: 0.4905
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8707 - loss: 0.3546 - val_accuracy: 0.8314 - val_loss: 0.4758
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8859 - loss: 0.3150 - val_accuracy: 0.8306 - val_loss: 0.4762


In [43]:

loss, accuracy = model_1.evaluate(testing_padded, testing_labels)

print("Loss:", loss)
print("Accuracy:", accuracy)

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8757 - loss: 0.3452
Loss: 0.47616755962371826
Accuracy: 0.8305941820144653


In [44]:

new_text = ["horrile la nueva misss que se vaya"]

new_sequences = tokenizer.texts_to_sequences(new_text)


new_padded = pad_sequences(new_sequences, maxlen=max_length, padding="post", truncating="post")

predictions = model_1.predict(new_padded)


predicted_class = tf.argmax(predictions, axis=1).numpy()[0]


print("Clase predicha:", predicted_class)

# 1 neutro, 2 positvo, 0 negativo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Clase predicha: 2


prueba 4 / modelo con Learning rate callback



In [45]:

model_4 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(3, activation="softmax")
])


model_4.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

# Learning rate callback
scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3 * 10**(epoch/10))
# Entrenar el modelo

history = model_4.fit(training_padded,
                    training_labels,
                    epochs=4,
                    validation_data=(testing_padded, testing_labels), 
                    callbacks=[scheduler])



Epoch 1/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5915 - loss: 0.9149 - val_accuracy: 0.7370 - val_loss: 0.6401 - learning_rate: 0.0010
Epoch 2/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7683 - loss: 0.5744 - val_accuracy: 0.7779 - val_loss: 0.5364 - learning_rate: 0.0013
Epoch 3/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8192 - loss: 0.4609 - val_accuracy: 0.8217 - val_loss: 0.4815 - learning_rate: 0.0016
Epoch 4/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8499 - loss: 0.3933 - val_accuracy: 0.8268 - val_loss: 0.4805 - learning_rate: 0.0020


In [46]:
# Obtener las métricas de evaluación en los datos de prueba
loss, accuracy = model_4.evaluate(testing_padded, testing_labels)

# Imprimir los resultados de la evaluación
print("Loss:", loss)
print("Accuracy:", accuracy)

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8740 - loss: 0.3501
Loss: 0.4805208444595337
Accuracy: 0.8268015384674072


In [48]:
# Obtener las métricas de evaluación en los datos de prueba
loss, accuracy = model_4.evaluate(testing_padded, testing_labels)

# Imprimir los resultados de la evaluación
print("Loss:", loss)
print("Accuracy:", accuracy)

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8729 - loss: 0.3527
Loss: 0.4754085838794708
Accuracy: 0.8263801336288452


In [49]:

new_text = ["miss venezuela"]


new_sequences = tokenizer.texts_to_sequences(new_text)


new_padded = pad_sequences(new_sequences, maxlen=max_length, padding="post", truncating="post")

predictions = model_4.predict(new_padded)


predicted_class = tf.argmax(predictions, axis=1).numpy()[0]

print("Clase predicha:", predicted_class)

# 1 neutro, 2 positvo, 0 negativo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Clase predicha: 1
