In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re


# Make sure the NLTK stop-word corpus is available before the pipeline runs.
# The stopwords.words('english') call is used only as a probe: if it raises
# LookupError the corpus is fetched with nltk.download; otherwise nothing
# is downloaded.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')


# --- 1. Data Loading (FIXED for KeyError) ---
try:
    # Read the CSV and keep only the two columns the pipeline uses,
    # exposing the 'type' column under the name 'label'.
    df = pd.read_csv('/content/email_spam.csv', encoding='latin-1')[
        ['type', 'text']
    ].rename(columns={'type': 'label'})

    # --- 1b. Data Cleaning ---
    # Rows missing either the label or the text cannot be used for training.
    print(f"Original shape: {df.shape}")
    df = df.dropna(subset=['label', 'text'])
    print(f"Shape after dropping NaNs: {df.shape}")

    # Show the raw label vocabulary before it is converted to numbers.
    print("\nUnique values in 'label' column before mapping:")
    print(df['label'].unique())

    # Lower-case and strip the labels, then encode them as
    # 0 ('not spam') / 1 ('spam'). Labels outside the dictionary become NaN
    # and are dropped before the column is cast to int.
    label_codes = {'not spam': 0, 'spam': 1}
    normalized_labels = df['label'].str.lower().str.strip()
    df = df.assign(label=normalized_labels.map(label_codes)).dropna(subset=['label'])
    df = df.assign(label=df['label'].astype(int))

    print(f"Final shape after mapping and dropping missing labels: {df.shape}")
    print(f"Label counts:\n{df['label'].value_counts()}")

    # --- 2. Text Preprocessing Function ---
    from functools import lru_cache

    @lru_cache(maxsize=1)
    def _english_stop_words():
        """Return NLTK's English stop words as a frozenset, loaded only once."""
        return frozenset(stopwords.words('english'))

    def preprocess_text(text, stop_words=None):
        """Normalize raw text into a space-separated string of content words.

        Every character outside A-Z/a-z is replaced by a space, the result is
        lower-cased and split on whitespace, and stop words are removed.

        Args:
            text: Value to clean. It is coerced with ``str()`` first, so
                non-string inputs do not raise.
            stop_words: Optional collection of lower-case words to discard.
                When omitted, NLTK's English stop-word list is used.

        Returns:
            The remaining words joined by single spaces ('' when none remain).
        """
        if stop_words is None:
            # Previously stopwords.words('english') was re-evaluated for every
            # token, reloading the list and scanning it linearly each time.
            # The cached frozenset is built once and gives O(1) membership tests.
            stop_words = _english_stop_words()

        # Keep letters only, then lower-case so matching is case-insensitive.
        letters_only = re.sub('[^a-zA-Z]', ' ', str(text)).lower()
        kept_words = [w for w in letters_only.split() if w not in stop_words]
        return ' '.join(kept_words)

    # Clean every email body with preprocess_text.
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # --- 3. Feature Extraction (Vectorization) ---
    # Hold out 20% of the rows for testing; the fixed random_state keeps the
    # split repeatable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'], df['label'], test_size=0.2, random_state=42
    )

    # Bag-of-words model capped at 5000 features.
    vectorizer = CountVectorizer(max_features=5000)

    # Fit the vocabulary on the training split only, then reuse the *fitted*
    # vectorizer for the test split. The matrices are kept sparse:
    # MultinomialNB accepts sparse input directly, so the former .toarray()
    # calls only materialized mostly-zero dense arrays.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    print(f"\nTraining data shape (samples, features): {X_train_vec.shape}")
    print(f"Testing data shape (samples, features): {X_test_vec.shape}")


    # --- 4. Model Training (Multinomial Naive Bayes) ---
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)


    # --- 5. Evaluation and Prediction ---
    # Predict the held-out rows and compare against the true labels.
    y_pred = model.predict(X_test_vec)

    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

    # --- Example Spam/Ham Prediction ---
    def predict_spam(text_to_check):
        """Classify one raw email as "SPAM" or "HAM".

        The text is cleaned with ``preprocess_text``, encoded with the
        already-fitted ``vectorizer`` and scored by the trained ``model``.
        A predicted label of 1 maps to "SPAM"; any other label maps to "HAM".
        """
        # The new text must go through the same fitted vectorizer as the
        # training data so the feature columns line up.
        features = vectorizer.transform([preprocess_text(text_to_check)]).toarray()
        predicted_label = model.predict(features)[0]
        if predicted_label == 1:
            return "SPAM"
        return "HAM"

    # Two hand-written sample emails: one spam-like, one ordinary.
    test_email_spam = "Congratulations! You've won a free iPhone. Click this link immediately to claim your prize."
    test_email_ham = "Hey, just following up on our meeting yesterday. Could you send me the revised document by 5 pm?"

    print("\n--- Live Prediction Examples ---")
    # Echo each sample together with the label predict_spam assigns to it.
    for sample_email in (test_email_spam, test_email_ham):
        print(f"Email: '{sample_email}'\nPrediction: {predict_spam(sample_email)}")

except FileNotFoundError:
    print("Error: 'email_spam.csv' not found. Please ensure the file is in the correct directory and try again.")

Original shape: (84, 2)
Shape after dropping NaNs: (84, 2)

Unique values in 'label' column before mapping:
['spam' 'not spam']
Final shape after mapping and dropping missing labels: (84, 2)
Label counts:
label
0    58
1    26
Name: count, dtype: int64

Training data shape (samples, features): (67, 1829)
Testing data shape (samples, features): (17, 1829)

--- Model Evaluation ---
Accuracy: 0.8235

Classification Report:
               precision    recall  f1-score   support

         Ham       0.79      1.00      0.88        11
        Spam       1.00      0.50      0.67         6

    accuracy                           0.82        17
   macro avg       0.89      0.75      0.77        17
weighted avg       0.86      0.82      0.80        17


--- Live Prediction Examples ---
Email: 'Congratulations! You've won a free iPhone. Click this link immediately to claim your prize.'
Prediction: HAM
Email: 'Hey, just following up on our meeting yesterday. Could you send me the revised document b

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Make sure the NLTK stop-word corpus is available before the pipeline runs.
# The stopwords.words('english') call is used only as a probe: if it raises
# LookupError the corpus is fetched with nltk.download; otherwise nothing
# is downloaded.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# --- 1. Load and Clean Data ---
try:
    df = pd.read_csv('/content/email_spam.csv', encoding='latin-1')

    # Keep only the required columns (renaming 'type' to 'label') and discard
    # rows where either value is missing.
    df = (
        df[['type', 'text']]
        .rename(columns={'type': 'label'})
        .dropna(subset=['label', 'text'])
    )

    # Print original label values
    print("Original label values:", df['label'].unique())

    # Normalize case/whitespace, then translate the labels to 0/1.
    # Series.map is used instead of Series.replace: map turns every label that
    # is missing from the dictionary into NaN, so the dropna below really
    # removes unmapped rows. With replace, unknown labels stayed as strings,
    # dropna removed nothing and astype(int) would fail on them; replace also
    # triggered a pandas warning on this statement.
    label_codes = {'ham': 0, 'not spam': 0, 'spam': 1}
    df = df.assign(
        label=df['label'].str.lower().str.strip().map(label_codes)
    ).dropna(subset=['label'])
    df = df.assign(label=df['label'].astype(int))

    print("\nLabel Distribution:\n", df['label'].value_counts())

    # --- 2. Preprocessing Function ---
    # Build the stop-word set once so membership checks are cheap.
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Return ``text`` lower-cased, reduced to a-z words, minus stop words.

        The input is coerced with ``str()``, every character outside a-z is
        replaced by a space, and the surviving words that are not in
        ``stop_words`` are joined with single spaces.
        """
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-z]', ' ', lowered)
        return ' '.join(
            token for token in letters_only.split() if token not in stop_words
        )

    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # --- 3. Split Data ---
    # 80/20 split, stratified on the label so both splits keep the class mix.
    features, labels = df['cleaned_text'], df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    # --- 4. Vectorization ---
    # The vocabulary is learned from the training split only.
    vectorizer = CountVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # --- 5. Train Model ---
    model = MultinomialNB().fit(X_train_vec, y_train)

    # --- 6. Evaluate Model ---
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print("\n--- Evaluation ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

    # --- 7. Live Prediction Function ---
    def predict_spam(text_to_check):
        """Classify one raw email as "SPAM" or "HAM".

        The text is cleaned with ``preprocess_text``, encoded with the fitted
        ``vectorizer`` and scored by ``model``. A predicted label of 1 maps to
        "SPAM"; any other label maps to "HAM".
        """
        features = vectorizer.transform([preprocess_text(text_to_check)])
        predicted_label = model.predict(features)[0]
        if predicted_label == 1:
            return "SPAM"
        return "HAM"

    # --- 8. Test Predictions ---
    # Two hand-written sample emails: one spam-like, one ordinary.
    test_email_spam = "Congratulations! You've won a free iPhone. Click this link immediately to claim your prize."
    test_email_ham = "Hey, just following up on our meeting yesterday. Could you send me the revised document by 5 pm?"

    print("\n--- Live Predictions ---")
    spam_verdict = predict_spam(test_email_spam)
    print(f"SPAM Email Prediction: {spam_verdict}")
    ham_verdict = predict_spam(test_email_ham)
    print(f"HAM Email Prediction: {ham_verdict}")

except FileNotFoundError:
    print("Error: 'email_spam.csv' not found. Please ensure the file is in the correct directory.")


Original label values: ['spam' 'not spam']

Label Distribution:
 label
0    58
1    26
Name: count, dtype: int64

--- Evaluation ---
Accuracy: 0.7059
              precision    recall  f1-score   support

         Ham       0.71      1.00      0.83        12
        Spam       0.00      0.00      0.00         5

    accuracy                           0.71        17
   macro avg       0.35      0.50      0.41        17
weighted avg       0.50      0.71      0.58        17


--- Live Predictions ---
SPAM Email Prediction: SPAM
HAM Email Prediction: HAM


  df['label'] = df['label'].replace({
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Make sure the NLTK stop-word corpus is available before the pipeline runs.
# The stopwords.words('english') call is used only as a probe: if it raises
# LookupError the corpus is fetched with nltk.download; otherwise nothing
# is downloaded.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    # Load dataset
    df = pd.read_csv('/content/email_spam.csv', encoding='latin-1')

    # Keep only the required columns (renaming 'type' to 'label') and discard
    # rows where either value is missing.
    df = (
        df[['type', 'text']]
        .rename(columns={'type': 'label'})
        .dropna(subset=['label', 'text'])
    )

    # Show original labels
    print("Original label values:", df['label'].unique())

    # Normalize case/whitespace, then translate the labels to 0/1.
    # Series.map replaces the former replace(...).infer_objects() chain: map
    # turns every label missing from the dictionary into NaN, so the dropna
    # below really removes unmapped rows. With replace, unknown labels stayed
    # as strings and astype(int) would fail on them; infer_objects() changed
    # none of that and did not silence the warning raised by replace.
    label_codes = {'ham': 0, 'not spam': 0, 'spam': 1}
    df = df.assign(
        label=df['label'].str.lower().str.strip().map(label_codes)
    ).dropna(subset=['label'])
    df = df.assign(label=df['label'].astype(int))

    print("\nLabel Distribution:\n", df['label'].value_counts())

    # Stop-word set shared by every preprocess_text call.
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Return ``text`` lower-cased, reduced to a-z words, minus stop words.

        The input is coerced with ``str()``, every character outside a-z is
        replaced by a space, and the surviving words that are not in
        ``stop_words`` are joined with single spaces.
        """
        cleaned = re.sub(r'[^a-z]', ' ', str(text).lower())
        kept = filter(lambda token: token not in stop_words, cleaned.split())
        return ' '.join(kept)

    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # 80/20 split, stratified on the label so both splits keep the class mix.
    texts = df['cleaned_text']
    targets = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        texts, targets, test_size=0.2, random_state=42, stratify=targets
    )

    # Bag-of-words features; the vocabulary comes from the training split only.
    vectorizer = CountVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Fit the classifier and score the held-out rows.
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    # zero_division=0 reports 0 (without a warning) for a class that received
    # no predictions.
    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    report_options = {'target_names': ['Ham', 'Spam'], 'zero_division': 0}
    print(classification_report(y_test, y_pred, **report_options))

    # Function to predict new emails
    def predict_spam(text_to_check):
        """Classify one raw email as "SPAM" or "HAM".

        The text is cleaned with ``preprocess_text``, encoded with the fitted
        ``vectorizer`` and scored by ``model``. A predicted label of 1 maps to
        "SPAM"; any other label maps to "HAM".
        """
        features = vectorizer.transform([preprocess_text(text_to_check)])
        predicted_label = model.predict(features)[0]
        if predicted_label == 1:
            return "SPAM"
        return "HAM"

    # Two hand-written sample emails: one spam-like, one ordinary.
    test_email_spam = "Congratulations! You've won a free iPhone. Click this link immediately to claim your prize."
    test_email_ham = "Hey, just following up on our meeting yesterday. Could you send me the revised document by 5 pm?"

    print("\n--- Live Predictions ---")
    spam_verdict = predict_spam(test_email_spam)
    print(f"SPAM Email Prediction: {spam_verdict}")
    ham_verdict = predict_spam(test_email_ham)
    print(f"HAM Email Prediction: {ham_verdict}")

except FileNotFoundError:
    print("Error: 'email_spam.csv' not found. Please ensure the file is in the correct directory.")


Original label values: ['spam' 'not spam']

Label Distribution:
 label
0    58
1    26
Name: count, dtype: int64

--- Model Evaluation ---
Accuracy: 0.7059
              precision    recall  f1-score   support

         Ham       0.71      1.00      0.83        12
        Spam       0.00      0.00      0.00         5

    accuracy                           0.71        17
   macro avg       0.35      0.50      0.41        17
weighted avg       0.50      0.71      0.58        17


--- Live Predictions ---
SPAM Email Prediction: SPAM
HAM Email Prediction: HAM


  df['label'] = df['label'].replace({
