<a href="https://colab.research.google.com/github/Ephrame-A/Drum-app/blob/main/Email_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages (if not already installed)
!pip install -q scikit-learn pandas numpy

# Import libraries
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib

# ===== STEP 1: MANUAL UPLOAD =====
# Prompt for the dataset, then collect the upload through google.colab's
# files.upload(). The result is kept in `uploaded`; later code checks it for
# emptiness and takes its first key as the name of the CSV to read.
print("Please upload your 'spam-email-dataset.csv' file:")
uploaded = files.upload()

Please upload your 'spam-email-dataset.csv' file:


In [2]:


# Stop right away when the upload step produced nothing
if not uploaded:
    raise ValueError("No file uploaded. Please upload a CSV file named 'spam-email-dataset.csv'")

# Take the first uploaded entry's key as the filename, whatever it is called
uploaded_filename = next(iter(uploaded))

# ===== STEP 2: LOAD DATASET =====
# Read the uploaded CSV, print a short preview, and confirm that the two
# columns used by the following steps exist. Any failure is reported and then
# re-raised so execution stops here.
try:
    data = pd.read_csv(uploaded_filename)
    print("\nDataset loaded successfully!")
    print(f"Total records: {data.shape[0]}")
    print("\nFirst 5 records:")
    print(data.head())

    # Both the text column and the label column are mandatory
    if not all(column in data.columns for column in ('text', 'spam_or_not')):
        raise ValueError("Dataset must contain 'text' and 'spam_or_not' columns")

except Exception as load_error:
    print(f"\nError loading dataset: {load_error}")
    raise

# ===== STEP 3: PREPROCESS DATA =====
print("\nPreprocessing data...")

# Drop only the rows that lack the email text or its label. A bare dropna()
# would also throw away usable rows that merely have a gap in some other,
# unrelated column of the CSV.
data = data.dropna(subset=['text', 'spam_or_not'])
data = data.rename(columns={'spam_or_not': 'label'})  # Rename for consistency

# Split into features (X) and target (y)
X = data['text']
y = data['label']

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split repeatable, and stratifying on y keeps the class ratio the same in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nData split complete:")
print(f"Training emails: {len(X_train)}")
print(f"Testing emails: {len(X_test)}")
# NOTE(review): .mean() is the spam share only if labels are numeric 0/1 with
# 1 meaning spam — confirm against the dataset.
print(f"Spam % in training: {y_train.mean():.2%}")
print(f"Spam % in testing: {y_test.mean():.2%}")

# ===== STEP 4: TRAIN MODEL =====
print("\nTraining Logistic Regression model...")

# Feature extraction: TF-IDF over single words and adjacent word pairs
tfidf_step = TfidfVectorizer(
    max_features=5000,        # Cap the vocabulary size
    stop_words='english',     # Drop common English words (the, and, etc.)
    lowercase=True,           # Case-fold the text first
    ngram_range=(1, 2),       # Unigrams + bigrams
    min_df=5,                 # Skip terms present in fewer than 5 documents
    max_df=0.7,               # Skip terms present in more than 70% of documents
)

# Classifier: L2-regularized logistic regression
logreg_step = LogisticRegression(
    penalty='l2',
    C=1.0,                    # Inverse regularization strength
    solver='liblinear',
    max_iter=1000,            # Upper bound on solver iterations
    class_weight='balanced',  # Compensate for imbalanced classes
)

# Pipeline: TF-IDF → Logistic Regression
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('classifier', logreg_step),
])

# Fit vectorizer and classifier together on the training split
pipeline.fit(X_train, y_train)
print("Training complete!")

# ===== STEP 5: EVALUATE MODEL =====
print("\nEvaluating model...")

# Label the held-out emails
y_pred = pipeline.predict(X_test)

# Compute every metric first, then print them together
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Not Spam', 'Spam'],
)
cm = confusion_matrix(y_test, y_pred)

print(f"📊 Accuracy: {accuracy:.2%}")
# sep="\n" puts the heading and the value on separate lines
print("\n📝 Classification Report:", report, sep="\n")
print("\n🔢 Confusion Matrix:", cm, sep="\n")

# ===== STEP 6: TEST WITH EXAMPLES =====
def predict_spam(email_text, model=None):
    """Classify one email and print the verdict with the model's confidence.

    Args:
        email_text: Raw text of the email to classify.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. When omitted, the module-level ``pipeline`` is used.

    Prints a preview of the email (at most its first 100 characters) followed
    by the verdict. A predicted label equal to ``1`` is reported as spam; any
    other label is reported as not spam. Returns ``None``.
    """
    if model is None:
        model = pipeline

    prediction = model.predict([email_text])[0]

    # Run predict_proba once and look the predicted label up in classes_,
    # instead of assuming which column belongs to which label.
    probabilities = model.predict_proba([email_text])[0]
    confidence = probabilities[list(model.classes_).index(prediction)]

    # Only show an ellipsis when the text was actually cut off
    preview = email_text[:100]
    if len(email_text) > 100:
        preview += "..."

    print(f"\n✉️ Email: {preview}")
    if prediction == 1:
        print(f"🔴 SPAM (confidence: {confidence:.2%})")
    else:
        print(f"🟢 NOT SPAM (confidence: {confidence:.2%})")

# Smoke-test the classifier on a few hand-written messages. The trailing
# comments record the label the author expects for each one; nothing here
# asserts it — the verdicts are only printed by predict_spam().
print("\n🔎 Testing with examples:")
test_emails = [
    "Win a free iPhone now! Click here!",  # Spam
    "Hi team, meeting at 3 PM tomorrow",   # Not spam
    "Your account is locked. Verify now!",  # Spam
    "Thanks for your order confirmation",  # Not spam
    "Earn $1000 daily from home!"          # Spam
]

# Classify each sample in turn
for email in test_emails:
    predict_spam(email)

# ===== STEP 7: SAVE MODEL =====
# Single place for the output filename used by the dump, the download and
# the final message
model_filename = 'spam_classifier.joblib'

print("\n💾 Saving model...")
joblib.dump(pipeline, model_filename)

# Hand the saved file to Colab's download helper
print("\n⬇️ Downloading model file...")
files.download(model_filename)
print(f"✅ Done! Model saved as '{model_filename}'")

Please upload your 'spam-email-dataset.csv' file:


KeyboardInterrupt: 