<a href="https://colab.research.google.com/github/Festuskipkoech/Festus_data-science/blob/main/SpamEmailDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:

# download required data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

class SpamDetector:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.classifier = LogisticRegression(random_state=42)
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize and remove stop words
        words = text.split()
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(words)

    def prepare_data(self, df, text_column, label_column):
        # Preprocess the text data
        df['processed_text'] = df[text_column].apply(self.preprocess_text)

        # Convert text to TF-IDF features
        X = self.vectorizer.fit_transform(df['processed_text'])
        y = df[label_column]

        # Split the data
        return train_test_split(X, y, test_size=0.2, random_state=42)

    def train(self, X_train, y_train):
        self.classifier.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        predictions = self.classifier.predict(X_test)

        print("\nClassification Report:")
        print(classification_report(y_test, predictions))

        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, predictions))

        return predictions

    def predict(self, text):
        processed_text = self.preprocess_text(text)
        vectorized_text = self.vectorizer.transform([processed_text])
        prediction = self.classifier.predict(vectorized_text)
        probability = self.classifier.predict_proba(vectorized_text)

        return {
            'is_spam': bool(prediction[0]),
            'spam_probability': probability[0][1]
        }

# Example usage
if __name__ == '__main__':
    # Load your dataset, assuming it has 'text' and 'label' columns
    # where 'label' is 1 for spam and 0 for ham

    df = pd.read_csv('/content/spam_ham_dataset.csv')

    # Initialize and train the spam detector
    detector = SpamDetector()
    X_train, X_test, y_train, y_test = detector.prepare_data(df, 'text', 'label')
    detector.train(X_train, y_train)

    # Evaluate the model
    predictions = detector.evaluate(X_test, y_test)

    # Example prediction
    sample_email = """Congratulations! You have won a $1000 gift card! Click here to claim your prize now!"""
    result = detector.predict(sample_email)
    print("\nSample email predictions:")
    print(f"Is spam: {result['is_spam']}")
    print(f"Spam probability: {result['spam_probability']:.2f}")



Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       742
        spam       0.97      0.98      0.97       293

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035


Confusion Matrix:
[[733   9]
 [  7 286]]

Sample email predictions:
Is spam: True
Spam probability: 0.77
