In [1]:
!pip install pandas numpy scikit-learn nltk



In [2]:
# Upload the dataset through Google Colab's file helper (this import is only
# available when running on Colab). The recorded output shows spam.csv being
# saved under that same name, which is the path the loading cell reads.
from google.colab import files
# NOTE(review): `uploaded` is not referenced by any later cell visible here.
uploaded = files.upload()

Saving spam.csv to spam.csv


In [3]:
import pandas as pd

# Load the CSV (decoded as latin-1), keep only the two columns that are used
# (v1 = label, v2 = message text) and give them descriptive names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Preview the first rows
print(data.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = data.shape
print(f"Nombre de lignes et colonnes : {dataset_shape}")


Nombre de lignes et colonnes : (5572, 2)


In [5]:
# Count how many messages carry each label to see the class balance
label_counts = data['label'].value_counts()
print(label_counts)

label
ham     4825
spam     747
Name: count, dtype: int64


In [6]:
# Count missing values per column (isna is the same check as isnull)
missing_per_column = data.isna().sum()
print(missing_per_column)

label      0
message    0
dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the text labels with integer codes. The printed preview shows the
# resulting mapping: ham -> 0, spam -> 1.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(data['label'])
data['label'] = encoded_labels
print(data.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; random_state is fixed so the
# same split is produced on every run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    data['message'], data['label'], **split_options
)
print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (4457,), Test set: (1115,)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn each message into a numeric vector
vectorizer = CountVectorizer()

# Fit on the training messages only, and encode them in the same step
X_train_vec = vectorizer.fit_transform(X_train)

# Encode the test messages with the already-fitted vectorizer (no refit)
X_test_vec = vectorizer.transform(X_test)

# One feature per entry returned by the fitted vectorizer
vocabulary_terms = vectorizer.get_feature_names_out()
print(f"Nombre de features : {len(vocabulary_terms)}")

Nombre de features : 7735


In [10]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with default settings and train it
# on the vectorized training messages; fit() returns the estimator itself,
# so construction and training can be chained.
model = LogisticRegression().fit(X_train_vec, y_train)

print("Modèle entraîné avec succès !")

Modèle entraîné avec succès !


In [11]:
# Predict a label for every vectorized test message; the evaluation cell
# compares these predictions against y_test.
y_pred = model.predict(X_test_vec)


In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy: share of test messages whose predicted label matches the true one
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {accuracy * 100:.2f}%")

# Classification report (per-class precision / recall / F1 / support).
# The report is printed by its own print() call: passing it as a second
# argument after a string ending in "\n" makes print() insert its separator
# space in front of the report, which shifts the header row one character to
# the right of the data rows.
report = classification_report(y_test, y_pred)
print("\nRapport de classification :")
print(report)

Accuracy : 97.76%

Rapport de classification :
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
# Try the classifier on hand-written messages
new_messages = [
    "Congratulations! You've won a $1,000 gift card. Call now to claim.",
    "Hi, just checking in. Let me know if you need anything!",
]

# Encode the messages with the already-fitted vectorizer, then classify them
new_messages_vec = vectorizer.transform(new_messages)
predictions = model.predict(new_messages_vec)

# Print each message with its verdict (predicted label 1 is shown as Spam)
for idx, text in enumerate(new_messages):
    verdict = 'Spam' if predictions[idx] == 1 else 'Not Spam'
    print(f"Message : '{text}' - {verdict}")

Message : 'Congratulations! You've won a $1,000 gift card. Call now to claim.' - Spam
Message : 'Hi, just checking in. Let me know if you need anything!' - Not Spam
