In [1]:
# ============================================
# Celda 1: Importar librerías y cargar datos
# ============================================
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.preprocessing import LabelEncoder

# The challenge environment ships with the datasets preloaded; when running in
# Colab they can be uploaded by hand or read from a URL instead.
# This example reads the tab-separated SMS file from a public URL.
sms_url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
sms_all = pd.read_csv(sms_url, sep='\t', header=None, names=['label', 'message'])

# Hold out a fixed-seed 20% sample as the test split; every remaining row
# (selected by index) becomes the training split.
test_data = sms_all.sample(frac=0.2, random_state=42)
train_data = sms_all.drop(test_data.index)

print("✅ Datos cargados correctamente.")
print(train_data.head())


✅ Datos cargados correctamente.
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
# ============================================
# Cell 2: Prepare the data
# ============================================

# Text-encoding settings shared by training and inference.
vocab_size = 1000        # passed to the tokenizer as num_words
embedding_dim = 16       # width of the embedding vectors used by the model
max_length = 120         # every sequence is padded/truncated to this length
trunc_type = 'post'      # cut over-long sequences at the end
padding_type = 'post'    # pad short sequences at the end
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

# Map the "ham"/"spam" labels to 0/1. The encoder is fitted on the training
# labels only and then reused for the test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data["label"])
test_labels = label_encoder.transform(test_data["label"])

# Build the word -> integer vocabulary from the training messages only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data["message"])


def _encode(texts):
    """Return (integer sequences, fixed-length padded sequences) for `texts`."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


train_sequences, train_padded = _encode(train_data["message"])
test_sequences, test_padded = _encode(test_data["message"])

print("✅ Textos tokenizados y preparados.")
print(f"Ejemplo de secuencia: {train_sequences[0][:10]}")


✅ Textos tokenizados y preparados.
Ejemplo de secuencia: [50, 576, 1, 1, 748, 600, 66, 9, 1, 86]


In [3]:
# ============================================
# Cell 3: Build and train the model
# ============================================

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# (and therefore the training curve) is repeatable under Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Architecture: token embeddings -> mean over the sequence axis ->
# small ReLU hidden layer -> single sigmoid unit (score for label 1).
model = Sequential([
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # (it only triggers a warning and is ignored); the sequence length is
    # inferred from the data on the first call to fit().
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the held-out test split is used as validation data here, so the
# val_* metrics are the only out-of-sample figures this notebook reports.
history = model.fit(
    train_padded,
    np.array(train_labels),
    epochs=10,
    validation_data=(test_padded, np.array(test_labels)),
    verbose=2
)

print("✅ Modelo entrenado correctamente.")


Epoch 1/10




140/140 - 2s - 15ms/step - accuracy: 0.8656 - loss: 0.3853 - val_accuracy: 0.8662 - val_loss: 0.3648
Epoch 2/10
140/140 - 1s - 4ms/step - accuracy: 0.8654 - loss: 0.3576 - val_accuracy: 0.8654 - val_loss: 0.3490
Epoch 3/10
140/140 - 1s - 4ms/step - accuracy: 0.8654 - loss: 0.3334 - val_accuracy: 0.8591 - val_loss: 0.3156
Epoch 4/10
140/140 - 1s - 4ms/step - accuracy: 0.8699 - loss: 0.2782 - val_accuracy: 0.8968 - val_loss: 0.2528
Epoch 5/10
140/140 - 1s - 4ms/step - accuracy: 0.9098 - loss: 0.2072 - val_accuracy: 0.9093 - val_loss: 0.1927
Epoch 6/10
140/140 - 1s - 4ms/step - accuracy: 0.9457 - loss: 0.1581 - val_accuracy: 0.9372 - val_loss: 0.1526
Epoch 7/10
140/140 - 1s - 4ms/step - accuracy: 0.9542 - loss: 0.1299 - val_accuracy: 0.9515 - val_loss: 0.1230
Epoch 8/10
140/140 - 1s - 4ms/step - accuracy: 0.9677 - loss: 0.1043 - val_accuracy: 0.9722 - val_loss: 0.1140
Epoch 9/10
140/140 - 1s - 6ms/step - accuracy: 0.9706 - loss: 0.0876 - val_accuracy: 0.9632 - val_loss: 0.0966
Epoch 10/10

In [4]:
# ============================================
# Cell 4: predict_message function
# ============================================

def predict_message(message):
    """Score one text message with the trained model.

    Uses the notebook-level ``tokenizer``, ``model`` and padding settings
    (``max_length``, ``padding_type``, ``trunc_type``), so the message is
    encoded the same way as the training data.

    Args:
        message: The message text to classify.

    Returns:
        A two-item list ``[score, label]`` where ``score`` is the model
        output converted to a Python float and ``label`` is ``"spam"`` when
        the score is strictly greater than 0.5, otherwise ``"ham"``.
    """
    # The tokenizer and the model both work on batches, so wrap the single
    # message in a one-element list and unwrap the single result afterwards.
    encoded = tokenizer.texts_to_sequences([message])
    batch = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    spam_score = model.predict(batch, verbose=0)[0][0]

    if spam_score > 0.5:
        verdict = "spam"
    else:
        verdict = "ham"
    return [float(spam_score), verdict]


In [5]:
# ============================================
# Cell 5: Final check
# ============================================

# A mix of conversational and promotional-sounding sample messages used to
# eyeball the classifier's output.
tests = [
    "Hey, are we still meeting for dinner tonight?",
    "Congratulations! You've been selected for a $1000 Walmart gift card. Go to http://bit.ly/12345 to claim now.",
    "Don't forget to bring the documents for tomorrow's meeting.",
    "You won a free cruise to the Bahamas! Call now to claim your prize.",
]

# For each sample print the message, its [score, label] prediction and a
# separator, each on its own line.
for msg in tests:
    print(msg, predict_message(msg), "-----", sep="\n")


Hey, are we still meeting for dinner tonight?
[0.008391112089157104, 'ham']
-----
Congratulations! You've been selected for a $1000 Walmart gift card. Go to http://bit.ly/12345 to claim now.
[0.7261187434196472, 'spam']
-----
Don't forget to bring the documents for tomorrow's meeting.
[0.038604311645030975, 'ham']
-----
You won a free cruise to the Bahamas! Call now to claim your prize.
[0.8994461894035339, 'spam']
-----
