In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import re
import string

In [None]:
print(f"TensorFlow version: {tf.__version__}")

# Paramètres pour stabilité
tf.get_logger().setLevel('ERROR')

# get data files
!wget -q https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget -q https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
print("Chargement des données...")

# Both files are tab-separated and are read with header=None, so the first line
# is treated as data and the two columns are named here.
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t', header=None, names=['label', 'message'])
    for tsv_path in (train_file_path, test_file_path)
)

# Quick sanity report: frame shapes and how many rows carry each label
print(f"Données d'entraînement: {train_data.shape}")
print(f"Données de test: {test_data.shape}")
label_distribution = train_data['label'].value_counts().to_dict()
print(f"Distribution train: {label_distribution}")

# Text preprocessing
def preprocess_text(text):
    """Normalise a raw message into lowercase, letters-only, single-spaced text.

    The input is coerced with ``str()`` and lowercased. Every character that is
    neither an ASCII letter nor whitespace is then deleted (digits and
    punctuation are removed, not replaced by a space, so "i'll" becomes "ill").
    Finally, runs of whitespace are collapsed to a single space and leading and
    trailing whitespace is dropped.

    Args:
        text: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # split() with no argument cuts on whitespace runs and ignores the ends,
    # so re-joining with one space both collapses and trims.
    return ' '.join(letters_only.split())

# Clean the message column of both frames (the raw text is overwritten in place)
for frame in (train_data, test_data):
    frame['message'] = frame['message'].apply(preprocess_text)

# Pull the cleaned messages out as arrays for the TensorFlow pipeline
train_messages = train_data['message'].values
test_messages = test_data['message'].values

# Encode the labels as integers: 'spam' -> 1, anything else (ham) -> 0
train_labels = train_data['label'].eq('spam').astype(int).values
test_labels = test_data['label'].eq('spam').astype(int).values

ham_total = np.sum(train_labels == 0)
spam_total = np.sum(train_labels == 1)
print(f"Labels train - ham: {ham_total}, spam: {spam_total}")

In [None]:
# Tokenisation hyper-parameters
VOCAB_SIZE = 10000   # num_words for the tokenizer and input_dim of the Embedding layer
MAX_LENGTH = 50      # every message is padded / truncated to this many tokens
EMBEDDING_DIM = 16   # size of each learned word vector

# Keras text tokenizer; words that are not in the kept vocabulary map to "<OOV>"
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<OOV>",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
)

# Fit the vocabulary on the TRAINING messages only. Fitting on train + held-out
# data leaks held-out vocabulary into the features and makes the validation
# metrics optimistic; words seen only in the held-out file now map to "<OOV>",
# exactly like any unseen word at prediction time.
tokenizer.fit_on_texts(train_messages)

# Convert each message to a list of word indices
train_sequences = tokenizer.texts_to_sequences(train_messages)
test_sequences = tokenizer.texts_to_sequences(test_messages)

# Pad with zeros at the end (or cut the end off) so every row has MAX_LENGTH entries
train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post'
)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post'
)

print(f"Forme finale - Train: {train_padded.shape}, Test: {test_padded.shape}")

# Build the deep-learning model
def create_model():
    """Build the (uncompiled) binary spam classifier.

    Architecture: integer token ids of length ``MAX_LENGTH`` -> embedding ->
    dropout -> global average pooling over the sequence axis -> two ReLU dense
    layers (with dropout between them) -> a single sigmoid unit.

    The input shape is declared with an explicit ``keras.Input`` instead of the
    ``input_length`` argument of ``Embedding``: that argument is deprecated in
    Keras 3, where it is ignored with a warning and leaves the model unbuilt
    (so ``model.summary()`` cannot show shapes or parameter counts).

    Returns:
        keras.Sequential: The model, built but not compiled. It reads the
        module-level ``VOCAB_SIZE``, ``EMBEDDING_DIM`` and ``MAX_LENGTH``.
    """
    model = keras.Sequential([
        # Fixed-length rows of integer word indices
        keras.Input(shape=(MAX_LENGTH,), dtype="int32"),

        # Embedding layer: turns word indices into dense vectors
        layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),

        # Dropout for regularisation
        layers.Dropout(0.2),

        # Average the word vectors over the sequence to get one vector per message
        layers.GlobalAveragePooling1D(),

        # Dense hidden layers
        layers.Dense(24, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(12, activation='relu'),

        # Output layer for binary classification (sigmoid score in [0, 1])
        layers.Dense(1, activation='sigmoid')
    ])

    return model

In [None]:
# Build the model
print("Construction du modèle...")
model = create_model()

# Compile: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# Show the architecture
model.summary()

# Training callbacks, both driven by the validation loss:
#  - stop once val_loss has not improved for 3 epochs and restore the best weights
#  - multiply the learning rate by 0.2 after 2 epochs without improvement (floor 0.0001)
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001
)
callbacks = [early_stopping, lr_on_plateau]

# Train, validating on the held-out file after every epoch
print("Entraînement du modèle...")
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=32,
    epochs=10,
    validation_data=(test_padded, test_labels),
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Plot the training curves side by side: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# (axes, train key, validation key, train label, validation label, title, y label)
panels = [
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label, marker='o')
    ax.plot(history.history[val_key], label=val_label, marker='s')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Final evaluation on the held-out file
test_loss, test_accuracy = model.evaluate(test_padded, test_labels, verbose=0)
print(f"\nPrécision finale sur le test: {test_accuracy:.4f}")

# A few hand-written messages to eyeball the model's behaviour
sample_texts = [
    "how are you doing today?",
    "WINNER! You have won $1000! Call now!",
    "Hey, want to grab lunch tomorrow?",
    "URGENT! Your account will be closed. Click here now!"
]

print("\nExemples de prédictions:")
for text in sample_texts:
    # Same pipeline as for the training data: clean -> word indices -> pad/truncate
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences([preprocess_text(text)]),
        maxlen=MAX_LENGTH,
        padding='post',
        truncating='post',
    )

    # Single-row batch, single output unit: take the scalar score
    prediction = model.predict(padded, verbose=0)[0][0]
    if prediction > 0.5:
        label = "spam"
    else:
        label = "ham"

    print(f"'{text}' -> {label} (score: {prediction:.4f})")

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify one message as spam or ham with the trained model.

    The text goes through the same steps as the training data: cleaning with
    ``preprocess_text``, conversion to word indices with the module-level
    ``tokenizer``, and post-padding/truncation to ``MAX_LENGTH``.

    Args:
        pred_text: The message to classify.

    Returns:
        list: ``[score, label]`` where ``score`` is the model's sigmoid output
        as a Python float and ``label`` is ``"spam"`` when the score is above
        0.5, otherwise ``"ham"``. If anything in the pipeline raises, the error
        is printed and the fallback ``[0.0, "ham"]`` is returned.
    """
    try:
        token_ids = tokenizer.texts_to_sequences([preprocess_text(pred_text)])
        model_input = tf.keras.preprocessing.sequence.pad_sequences(
            token_ids, maxlen=MAX_LENGTH, padding='post', truncating='post'
        )
        # Single-row batch, single output unit: take the scalar score
        score = float(model.predict(model_input, verbose=0)[0][0])
    except Exception as e:
        print(f"Erreur dans predict_message: {e}")
        return [0.0, "ham"]

    if score > 0.5:
        return [score, "spam"]
    return [score, "ham"]

# Smoke test: run predict_message on one everyday message and print the
# [score, label] list it returns.
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(f"\nTest de la fonction: {prediction}")


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known labels.

  Each message is passed to predict_message and the label at index 1 of the
  result is compared with the expected answer. Prints a success message only
  if all seven labels match; otherwise prints a "keep trying" message.
  Returns nothing.
  """
  # Note: this local list shadows the module-level test_messages array
  # inside the function only.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label is checked; the score at index 0 is ignored
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
