In [4]:
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Define the custom TextPreprocessor class first
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    A custom transformer to perform basic text cleaning.

    Every entry is lowercased and stripped of all characters that are
    neither regex word characters nor whitespace. Missing entries
    (``None`` / ``NaN``) are turned into empty strings and other
    non-string entries are converted with ``str`` first, so a single
    malformed row does not abort the whole transform.
    """

    # Compiled once for the class rather than looked up per message.
    _NON_WORD_OR_SPACE = re.compile(r'[^\w\s]')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the cleaning has no fitted state."""
        return self

    def transform(self, X):
        """
        Return a pandas Series holding the cleaned version of each entry.

        Parameters
        ----------
        X : pandas.Series or any sequence accepted by ``pd.Series``
            The raw messages.

        Returns
        -------
        pandas.Series
            One cleaned string per input entry, in the same order.
        """
        # Convert input to a pandas Series if it's not already
        if not isinstance(X, pd.Series):
            X = pd.Series(X)

        # Single pass: coerce, lowercase, then drop punctuation.
        return X.map(self._clean)

    @classmethod
    def _clean(cls, value):
        """Lowercase one entry and remove its punctuation."""
        if not isinstance(value, str):
            # Blank CSV cells arrive as NaN/None; treat them as empty text
            # instead of failing on the missing ``.lower`` attribute.
            value = "" if pd.isna(value) else str(value)
        return cls._NON_WORD_OR_SPACE.sub('', value.lower())

# --- Load the data ---------------------------------------------------------
# The CSV is read from a fixed local path. The code below relies on it having
# a 'text' column (the messages) and a 'label' column (the class to predict).
spam_data = pd.read_csv('D:/Learn with me (DS and AIML)/spam.csv')

# Features and target.
X, y = spam_data['text'], spam_data['label']

# --- Hold out 20% of the rows, with a fixed seed for repeatability ---------
# NOTE(review): X_test / y_test are not used anywhere in this cell, so the
# model is never scored on held-out data. The split is also not stratified;
# consider stratify=y if the labels are imbalanced -- confirm before changing,
# since it alters the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Build the imblearn pipeline -------------------------------------------
# Order matters: clean the raw text, vectorize it, oversample the resulting
# numeric features, then fit the classifier.
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('oversample', SMOTE(random_state=42)),
    ('classifier', MultinomialNB()),
])

# Train on the training portion only.
pipeline.fit(X_train, y_train)

# --- Classify a handful of hand-written messages ---------------------------
test_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim your prize.",
    "Hey, do you have time to chat later today?",
    "URGENT: Your account has been suspended. Please update your details now.",
    "Can you send me the report by end of day?",
    "WINNER!! As a valued network customer you have been selected to receive a £900 prize.",
    "Just confirming our meeting for tomorrow.",
]

detections = pipeline.predict(test_messages)

# --- Report each message next to its predicted label -----------------------
print("Spam Detection Results:")
for message, prediction in zip(test_messages, detections):
    print(f"Message: '{message}' -> Predicted: {prediction}")


Spam Detection Results:
Message: 'Congratulations! You've won a free iPhone. Click here to claim your prize.' -> Predicted: spam
Message: 'Hey, do you have time to chat later today?' -> Predicted: spam
Message: 'URGENT: Your account has been suspended. Please update your details now.' -> Predicted: spam
Message: 'Can you send me the report by end of day?' -> Predicted: ham
Message: 'WINNER!! As a valued network customer you have been selected to receive a £900 prize.' -> Predicted: spam
Message: 'Just confirming our meeting for tomorrow.' -> Predicted: ham
