In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords if not already present.
# Loading the English list doubles as the availability probe: the download is
# attempted only when that lookup raises LookupError.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    # Read the raw CSV, keep only the two columns used downstream ('type' is
    # renamed to 'label'), and discard rows where either of them is missing.
    df = (
        pd.read_csv('/content/email_spam.csv', encoding='latin-1')
        .loc[:, ['type', 'text']]
        .rename(columns={'type': 'label'})
        .dropna(subset=['label', 'text'])
    )

    # Echo the distinct raw label strings before they are normalised.
    print("Original label values:", df['label'].unique())

    # Normalise the label strings (case and surrounding whitespace), then map
    # them to integers: 0 = ham / not spam, 1 = spam.
    #
    # Series.map() is used instead of Series.replace() because:
    #   * replace() leaves unrecognised labels in place as strings, so the
    #     dropna() below could never remove them and astype(int) would raise
    #     ValueError; map() yields NaN for any value missing from the dict.
    #   * replace() on an object column emits the pandas "Downcasting behavior
    #     in `replace` is deprecated" FutureWarning, which a trailing
    #     infer_objects() call does not silence.
    label_map = {'ham': 0, 'not spam': 0, 'spam': 1}
    normalized_labels = df['label'].str.lower().str.strip()
    df['label'] = normalized_labels.map(label_map)

    # Report, then drop, rows whose label has no entry in label_map so that
    # discarded data does not disappear silently.
    unmapped = df['label'].isna()
    if unmapped.any():
        print(
            f"Dropping {int(unmapped.sum())} row(s) with unrecognized labels:",
            normalized_labels[unmapped].unique(),
        )
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].astype(int)

    print("\nLabel Distribution:\n", df['label'].value_counts())

    # Text cleaning: build the stop-word lookup once, then normalise every email body.
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Return a cleaned, space-joined token string for one email.

        The input is coerced to ``str`` and lower-cased, every character
        outside ``a``-``z`` is replaced by a space, the result is split on
        whitespace, and tokens present in ``stop_words`` are discarded.
        """
        letters_only = re.sub(r'[^a-z]', ' ', str(text).lower())
        kept_tokens = (
            token for token in letters_only.split() if token not in stop_words
        )
        return ' '.join(kept_tokens)

    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Stratified split to maintain label balance
    # (80% train / 20% test; random_state fixes the shuffle so reruns give the same split.)
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
    )

    # Vectorization
    # Bag-of-words counts. The vocabulary (capped at 5000 terms by max_features)
    # is learned from the training split only; the test split is transformed
    # with that same fitted vocabulary.
    vectorizer = CountVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)

    # Predict
    y_pred = model.predict(X_test_vec)

    # Evaluate with zero_division=0 to suppress warnings
    # target_names are matched to the classes in sorted order, so 'Ham' labels
    # class 0 and 'Spam' class 1 (assumes both classes occur in y_test/y_pred).
    # NOTE(review): zero_division=0 only silences the undefined-metric warning;
    # a class that is never predicted still appears with precision 0.00. The
    # output recorded for this cell shows exactly that for Spam (support 5), so
    # treat the accuracy figure with caution -- TODO consider cross-validation
    # given how small the test split is.
    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam'], zero_division=0))

    # Helper for classifying a single, previously unseen email.
    def predict_spam(text_to_check):
        """Classify one raw email string as "SPAM" or "HAM".

        The text is cleaned with ``preprocess_text``, vectorised with the
        fitted ``vectorizer`` and scored by the trained ``model``. A predicted
        label of 1 is reported as "SPAM"; any other label as "HAM".
        """
        features = vectorizer.transform([preprocess_text(text_to_check)])
        predicted_label = model.predict(features)[0]
        if predicted_label == 1:
            return "SPAM"
        return "HAM"

    # Smoke-test the helper on one obviously spammy and one ordinary message.
    test_email_spam = "Congratulations! You've won a free iPhone. Click this link immediately to claim your prize."
    test_email_ham = "Hey, just following up on our meeting yesterday. Could you send me the revised document by 5 pm?"

    print("\n--- Live Predictions ---")
    print(f"SPAM Email Prediction: {predict_spam(test_email_spam)}")
    print(f"HAM Email Prediction: {predict_spam(test_email_ham)}")

except FileNotFoundError:
    # Only a missing file is handled here; any other exception raised inside the
    # try block propagates. The error is reported with print() and execution
    # continues, so if it came from read_csv (the first statement in the try
    # body) none of df, vectorizer, model or predict_spam is defined by this run.
    print("Error: 'email_spam.csv' not found. Please ensure the file is in the correct directory.")


Original label values: ['spam' 'not spam']

Label Distribution:
 label
0    58
1    26
Name: count, dtype: int64

--- Model Evaluation ---
Accuracy: 0.7059
              precision    recall  f1-score   support

         Ham       0.71      1.00      0.83        12
        Spam       0.00      0.00      0.00         5

    accuracy                           0.71        17
   macro avg       0.35      0.50      0.41        17
weighted avg       0.50      0.71      0.58        17


--- Live Predictions ---
SPAM Email Prediction: SPAM
HAM Email Prediction: HAM


  df['label'] = df['label'].replace({


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords if not already present.
# Loading the English list doubles as the availability probe: the download is
# attempted only when that lookup raises LookupError.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    # Load the CSV and reduce it to the label/text pair in a single chain:
    # select the two needed columns, rename 'type' to 'label', and drop any
    # row that is missing either value.
    df = (
        pd.read_csv('/content/email_spam.csv', encoding='latin-1')
        .loc[:, ['type', 'text']]
        .rename(columns={'type': 'label'})
        .dropna(subset=['label', 'text'])
    )

    # Normalize labels: lower-case and strip the raw strings, then map them to
    # integers (0 = ham / not spam, 1 = spam).
    #
    # Series.map() is used instead of Series.replace() because:
    #   * replace() leaves unrecognised labels in place as strings, so the
    #     dropna() below could never remove them and astype(int) would raise
    #     ValueError; map() yields NaN for any value missing from the dict.
    #   * replace() on an object column emits the pandas "Downcasting behavior
    #     in `replace` is deprecated" FutureWarning, which a trailing
    #     infer_objects() call does not silence.
    label_map = {'ham': 0, 'not spam': 0, 'spam': 1}
    normalized_labels = df['label'].str.lower().str.strip()
    df['label'] = normalized_labels.map(label_map)

    # Report, then drop, rows whose label has no entry in label_map so that
    # discarded data does not disappear silently.
    unmapped = df['label'].isna()
    if unmapped.any():
        print(
            f"Dropping {int(unmapped.sum())} row(s) with unrecognized labels:",
            normalized_labels[unmapped].unique(),
        )
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].astype(int)

    print("Label distribution:\n", df['label'].value_counts())

    # Text preprocessing: the stop-word set is built once and shared by every call.
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Clean one email body into a space-separated string of tokens.

        Steps: coerce to ``str`` and lower-case; replace each character that is
        not ``a``-``z`` with a space; split on whitespace; drop every token
        found in ``stop_words``; join the remainder with single spaces.
        """
        lowered = str(text).lower()
        alphabetic = re.sub(r'[^a-z]', ' ', lowered)
        return ' '.join(
            token for token in alphabetic.split() if token not in stop_words
        )

    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Stratified train-test split to maintain class proportions
    # (80% train / 20% test; random_state fixes the shuffle so reruns give the same split.)
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
    )

    # Use TF-IDF vectorizer
    # The vocabulary (capped at 5000 terms by max_features) and IDF weights are
    # fitted on the training split only; the test split is transformed with
    # that same fitted vectorizer.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Logistic Regression with class_weight='balanced' to handle imbalance
    # max_iter is raised to 1000 iterations; random_state is pinned for repeatability.
    model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    model.fit(X_train_vec, y_train)

    # Predict on test data
    y_pred = model.predict(X_test_vec)

    # Evaluation
    # target_names are matched to the classes in sorted order, so 'Ham' labels
    # class 0 and 'Spam' class 1 (assumes both classes occur in y_test/y_pred).
    # NOTE(review): zero_division=0 only silences the undefined-metric warning.
    # The output recorded for this cell still shows Spam at 0.00 precision and
    # recall (support 5) despite class weighting -- TODO consider
    # cross-validation, since a 17-sample test split is a weak estimate.
    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam'], zero_division=0))

    # Single-message classifier built on top of the fitted vectorizer and model.
    def predict_spam(text_to_check):
        """Return "SPAM" when the model predicts label 1 for the email, else "HAM".

        The raw text goes through ``preprocess_text`` and the fitted
        ``vectorizer`` before being passed to ``model.predict``.
        """
        cleaned_message = preprocess_text(text_to_check)
        predicted_label = model.predict(vectorizer.transform([cleaned_message]))[0]
        if predicted_label == 1:
            return "SPAM"
        return "HAM"

    # Try the classifier on one spam-like and one ordinary message.
    test_email_spam = "Congratulations! You've won a free iPhone. Click this link immediately to claim your prize."
    test_email_ham = "Hey, just following up on our meeting yesterday. Could you send me the revised document by 5 pm?"

    print("\n--- Live Predictions ---")
    print(f"SPAM Email Prediction: {predict_spam(test_email_spam)}")
    print(f"HAM Email Prediction: {predict_spam(test_email_ham)}")

except FileNotFoundError:
    # Only a missing file is handled here; any other exception raised inside the
    # try block propagates. The error is reported with print() and execution
    # continues, so if it came from read_csv (the first statement in the try
    # body) none of df, vectorizer, model or predict_spam is defined by this run.
    print("Error: 'email_spam.csv' not found. Please ensure the file is in the correct directory.")


Label distribution:
 label
0    58
1    26
Name: count, dtype: int64

--- Model Evaluation ---
Accuracy: 0.6471
              precision    recall  f1-score   support

         Ham       0.69      0.92      0.79        12
        Spam       0.00      0.00      0.00         5

    accuracy                           0.65        17
   macro avg       0.34      0.46      0.39        17
weighted avg       0.49      0.65      0.55        17


--- Live Predictions ---
SPAM Email Prediction: SPAM
HAM Email Prediction: HAM


  df['label'] = df['label'].replace({
