In [52]:
import pandas as pd
!pip install scikit-learn



In [53]:
## 1. IMPORTACIONES Y CARGA DE DATOS

# Import required libraries
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout # Recomendado para múltiples capas
import nltk
import re # Necesitas importar la librería re (expresiones regulares)
from nltk.corpus import stopwords

# Load datasets with the correct encoding
# Asegúrate de que los archivos 'SMS_train.csv' y 'SMS_test.csv' estén en la misma carpeta del Notebook
train_data = pd.read_csv('sms_train.csv', encoding='latin1') 
test_data = pd.read_csv('sms_test.csv', encoding='latin1')

# Combine train and test datasets for text preprocessing
data = pd.concat([train_data, test_data], ignore_index=True)

print("Paso 1 completado: Librerías importadas y datos cargados.")

Paso 1 completado: Librerías importadas y datos cargados.


In [54]:
# Busca mensajes que contengan "order" y "confirm"
patron_busqueda = r'(order|ship|confirm|address|replying|YES)' 

# Aquí asumo que la columna de mensajes originales se llama 'Message_body' y la de etiquetas 'Label'
mensajes_sospechosos = data[data['Message_body'].str.contains(patron_busqueda, case=False, na=False)]

# Muestra los mensajes y sus etiquetas
print("--- Mensajes con patrones de Phishing ('order', 'confirm', 'YES') ---")
print(mensajes_sospechosos[['Message_body', 'Label']].sample(20))


--- Mensajes con patrones de Phishing ('order', 'confirm', 'YES') ---
                                           Message_body     Label
57    Yes. Please leave at  &lt;#&gt; . So that at  ...  Non-Spam
429   Oh shut it. Omg yesterday I had a dream that I...  Non-Spam
149            Hi elaine, is today's meeting confirmed?  Non-Spam
1014  Thanks for your subscription to Ringtone UK yo...      Spam
94    Yes princess! I want to please you every night...  Non-Spam
847   Aiyo... Her lesson so early... I'm still sleep...  Non-Spam
640    Oh yes, why is it like torture watching england?  Non-Spam
489   Do you want a New Nokia 3510i colour phone Del...      Spam
690   lyricalladie(21/F) is inviting you to be her f...      Spam
995   -PLS STOP bootydelious (32/F) is inviting you ...      Spam
196   Do u konw waht is rael FRIENDSHIP Im gving yuo...  Non-Spam
306   Pls speak with me. I wont ask anything other t...  Non-Spam
770   URGENT This is our 2nd attempt to contact U. Y...      Spam
354   

  mensajes_sospechosos = data[data['Message_body'].str.contains(patron_busqueda, case=False, na=False)]


In [55]:
# Combine train and test datasets for text preprocessing
data = pd.concat([train_data, test_data], ignore_index=True)

In [56]:
def preprocess_final(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Aplicar tokenización y eliminación de stopwords en una sola línea de list comprehension:
    tokens = [word for word in text.split() if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [57]:
# Apply preprocessing to SMS texts
data['processed_text'] = data['Message_body'].apply(preprocess_final)
print("¡El preprocesamiento fue exitoso! Avanzando a Tokenización de Keras.")

¡El preprocesamiento fue exitoso! Avanzando a Tokenización de Keras.


In [58]:
# Tokenization and Padding
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>") 
tokenizer.fit_on_texts(data['processed_text']) 
sequences = tokenizer.texts_to_sequences(data['processed_text'])
maxlen = 200
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

print("Tokenización y Padding completados.")
print(f"Forma de las secuencias con padding: {padded_sequences.shape}")

# --- DIVISIÓN DE DATOS ---

# 3. Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, 
    data['Label'], # Asegúrese de que esta columna existe y contiene 0s y 1s
    test_size=0.2, 
    random_state=42
)

print(f"\nConjunto de Entrenamiento (X_train): {X_train.shape}")
print(f"Conjunto de Prueba (X_test): {X_test.shape}")

Tokenización y Padding completados.
Forma de las secuencias con padding: (1082, 200)

Conjunto de Entrenamiento (X_train): (865, 200)
Conjunto de Prueba (X_test): (217, 200)


In [59]:
# Prepare labels and convert 'Label' to binary format (0 for 'Non-Spam', 1 for 'Spam')
train_data['Label'] = train_data['Label'].map({'Non-Spam': 0, 'Spam': 1})
test_data['Label'] = test_data['Label'].map({'Non-Spam': 0, 'Spam': 1})

In [60]:
# Split preprocessed data back into train and test sets
# padded_sequences es el resultado de la tokenización de Keras (Paso 3 anterior)
X_train = padded_sequences[:len(train_data)]
X_test = padded_sequences[len(train_data):]
y_train = train_data['Label'].values
y_test = test_data['Label'].values

In [61]:
model = Sequential()
model.add(Input(shape=(200,)))
# Usamos el mismo vocabulario (5000) y output_dim (128)
model.add(Embedding(input_dim=5000, output_dim=128)) 
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

In [30]:
## CÓDIGO DE ENTRENAMIENTO MEJORADO CON PESO DE CLASES

# 1. Definir los pesos de las clases
# Ajusta el peso de la clase 1 ('Spam') según tu desbalance real. 
# Si el 5% es Spam, usa 19.0. Si el 10% es Spam, usa 9.0.
# Usamos 9.0 como punto de partida si el desbalance es 10%.
class_weights = {0: 1.0, 1: 25.0} 

# 2. Compilar el modelo (misma configuración)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Entrenar con el nuevo parámetro class_weight
print("Entrenando modelo con PONDERACIÓN DE CLASES (más foco en el Spam)...")

history = model.fit(
    X_train, 
    y_train, 
    epochs=20, # Aumentamos a 20 épocas
    batch_size=64, 
    validation_split=0.1,
    class_weight=class_weights # <--- ¡EL CAMBIO CLAVE!
)

loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy (con pesos): {accuracy:.2f}')

Entrenando modelo con PONDERACIÓN DE CLASES (más foco en el Spam)...
Epoch 1/20


  y_numpy = np.round(y_numpy).astype("int32")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 144ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 2/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 122ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 3/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 139ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 4/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 121ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 5/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 138ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 6/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 120ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 7/20
[1m14/14[0m [

In [45]:

   # Define function to preprocess text (VERSION CORREGIDA)
# Esta versión utiliza .split() para evitar el error 'punkt'
def preprocess_input_text(text):
    # Tokenize and convert to lowercase
    text = text.lower()
    tokens = text.split() # <--- ¡CAMBIO CLAVE! Usamos split()
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')] 
    return ' '.join(tokens) # Join tokens into a string

# Define function to predict spam or not (VERSION FINAL Y CORREGIDA)
def predict_spam_or_not(text):
    
    # 1. Preprocesar (usando la función limpia con re.sub, no la versión .split())
    # Asumo que preprocess_final incluye la limpieza de caracteres especiales.
    processed_text = preprocess_final(text) 
    
    # 2. Convertir texto limpio a secuencia numérica
    sequence = tokenizer.texts_to_sequences([processed_text]) 
    
    # 3. Aplicar padding (se asegura de que la longitud sea maxlen=200)
    padded_sequence = pad_sequences(sequence, maxlen=200) 
    
    # 4. Predecir (obtiene la probabilidad entre 0 y 1)
    # verbose=0 evita imprimir el progreso de la predicción.
    prediction = model.predict(padded_sequence, verbose=0)
    
    # Extraer la probabilidad
    probability = prediction[0][0] 
    
    # Imprimir la probabilidad para el diagnóstico
    print(f"Probabilidad de ser Spam: {probability:.4f}")
    
    # 5. Interpretar la predicción con el umbral (AJUSTE FINAL)
    
    # Umbral bajo para ser más sensible al spam sutil y Phishing, 
    # forzando que probabilidades entre 0.25 y 0.5 se consideren SPAM.
    UMBRAL = 0.25 
    
    # 

    return "Spam" if probability > UMBRAL else "No Spam"

# Nota: La función preprocess_input_text no se utiliza aquí, 
# ya que la función predict_spam_or_not llama a 'preprocess_final'.

In [46]:
#Single text example
text_example = "Free entry in 2 a wkly comp to win FA final tkts 21st May 2005. Text FA to 87121 received entry questions(std txt rate)T&C's apply 08452810075over18's"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [47]:
#Single text example
text_example = "Meet me at the mall tomorrow. I found something you'll love"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [48]:
#Single text example
text_example = "Congratulations You've been selected for an exclusive affer. Clain your $1000 gift card now by visiting our site"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [49]:
#Single text example
text_example = "URGENT! We need to verify your account details immediately. Please click on the link below to avoid account suspension: bit.ly/123xyz"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [50]:
#Single text example
text_example = "You have won a free, all-inclusive Caribbean vacation! To claim your prize, text 'WINNER' to 89000 before midnight tonight."

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [51]:
#Single text example
text_example = "FREE ENTRY! 2-for-1 tickets available now! Call 0800-555-1234 now for a chance to win $5000 cash!!!"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [43]:
#Single text example
text_example = "Your Netflix subscription payment failed. Please update your billing info here: secure-login.com/netflix"

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: nan
The message: "(text_example)" is classified as: No Spam


In [21]:
#Single text example
text_example = "Dear customer, your order #49583 is ready to ship. Confirm your address by replying 'YES' to this number."

# Predict if the text is spam or not spam
prediction_result = predict_spam_or_not(text_example)
print(f'The message: "(text_example)" is classified as: {prediction_result}')

Probabilidad de ser Spam: 0.0051
The message: "(text_example)" is classified as: No Spam
