# SMS Spam Detection System

## Problem Definition

The goal of this project is to develop a machine learning system that can accurately classify SMS messages as either spam or non-spam (ham). This system will help:
- Telecommunications companies filter out unwanted communications
- Messaging platforms protect users from potential scams
- Organizations maintain a clean and secure messaging environment

## 1. Data Collection and Setup

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import joblib
from wordcloud import WordCloud
import pickle
import nltk

# Download the NLTK stop-word corpus only when it is not already installed,
# so re-running this cell does not contact the download server needlessly.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

## 2. Data Preprocessing

In [None]:
# Read the raw SMS corpus, keeping only the label column (v1) and the
# message column (v2), and give both descriptive names.
df = (
    pd.read_csv('spam.csv', encoding="ISO-8859-1")[['v1', 'v2']]
    .rename(columns={'v1': 'target', 'v2': 'text'})
)

# Encode the labels numerically: ham -> 0, spam -> 1.
df['target'] = df['target'].map({'ham': 0, 'spam': 1})

# Compiled once: matches every run of characters that are not lowercase ASCII
# letters (the text is lowercased before this pattern is applied).
_NON_LETTER_RUN = re.compile(r'[^a-z]+')

# Lazily-filled cache for the NLTK resources used by preprocess_text, so the
# stop-word corpus is read and the stemmer is built once instead of once per
# message.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return ``(stop_words, stemmer)``, creating them on first use only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


# Text preprocessing function
def preprocess_text(text):
    """
    Normalize a raw message into a space-separated string of word stems.

    Steps:
    1. Convert to lowercase
    2. Replace every run of non-letter characters with a single space
    3. Split into words (this also drops leading/trailing whitespace)
    4. Remove English stopwords
    5. Stem the remaining words with the Porter stemmer

    Args:
        text (str): Raw message text.

    Returns:
        str: The stemmed, stop-word-free words joined by single spaces;
        an empty string when no words survive.
    """
    stop_words, stemmer = _text_resources()

    # A raw-string pattern avoids the invalid '\s' escape, and collapsing
    # non-letter runs directly makes a separate whitespace pass unnecessary
    # because split() already treats any whitespace run as one separator.
    words = _NON_LETTER_RUN.sub(' ', text.lower()).split()

    return ' '.join(
        stemmer.stem(word) for word in words if word not in stop_words
    )

# Store a cleaned version of every message next to the raw text
df['processed_text'] = df['text'].map(preprocess_text)

## 3. Data Visualization

In [None]:
def create_word_cloud(text, title):
    """Render a word cloud built from ``text`` and display it under ``title``.

    Args:
        text (str): Text whose words are drawn in the cloud.
        title (str): Heading shown above the figure.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    image = cloud.generate(text)

    plt.figure(figsize=(10, 6))
    plt.imshow(image, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.title(title)
    plt.show()

# Concatenate the cleaned messages of each class into one long string
spam_text = ' '.join(df.loc[df['target'] == 1, 'processed_text'])
ham_text = ' '.join(df.loc[df['target'] == 0, 'processed_text'])

# Draw one word cloud per class, spam first
for corpus, heading in (
    (spam_text, 'Most Common Words in Spam Messages'),
    (ham_text, 'Most Common Words in Ham Messages'),
):
    create_word_cloud(corpus, heading)

## 4. Model Selection and Training

In [None]:
# Split data
X = df['processed_text']
y = df['target']
# Stratify on the label so the train and test sets keep the same spam/ham
# proportions as the full dataset; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def _tfidf_pipeline(classifier):
    """Return a TF-IDF -> classifier pipeline with steps 'tfidf' and 'clf'."""
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', classifier),
    ])


# Define models and parameters.
# Each entry pairs a pipeline with the hyper-parameter grid to search; grid
# keys use the '<step>__<parameter>' form expected by GridSearchCV.
models = {
    'Naive Bayes': {
        'model': _tfidf_pipeline(MultinomialNB()),
        'params': {
            'tfidf__max_features': [1000, 2000, 5000],
            'tfidf__ngram_range': [(1, 1), (1, 2)],
        },
    },
    'SVM': {
        'model': _tfidf_pipeline(SVC()),
        'params': {
            'tfidf__max_features': [1000, 2000],
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'clf__C': [0.1, 1, 10],
            'clf__kernel': ['linear', 'rbf'],
        },
    },
    'Random Forest': {
        # Seeded so repeated runs grow the same forest and report the same
        # scores.
        'model': _tfidf_pipeline(RandomForestClassifier(random_state=42)),
        'params': {
            'tfidf__max_features': [1000, 2000],
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'clf__n_estimators': [50, 100, 200],
            'clf__max_depth': [None, 10, 20],
        },
    },
}

# Train and evaluate models
best_models = {}
for name, config in models.items():
    print(f'\nTraining {name}...')

    # Grid search for best parameters.
    # Candidates are ranked by F1 on the spam class (label 1): this is the
    # same metric later used to pick the final model, and unlike accuracy it
    # is not dominated by the majority class if the labels are imbalanced.
    grid_search = GridSearchCV(
        config['model'],
        config['params'],
        cv=5,
        scoring='f1',
        n_jobs=-1  # use every available CPU core
    )

    grid_search.fit(X_train, y_train)

    # Get best model (refit on the whole training set by GridSearchCV)
    best_model = grid_search.best_estimator_

    # Evaluate on the held-out test set
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Report the cross-validated score and the test accuracy separately so
    # neither figure is mistaken for the other.
    print(f'Best {name} CV F1 score: {grid_search.best_score_:.4f}')
    print(f'{name} test accuracy: {accuracy:.4f}')
    print(f'Best {name} parameters: {grid_search.best_params_}')

    # Store best model and its test-set metrics
    best_models[name] = {
        'model': best_model,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    }

## 5. Model Evaluation

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Plot a 2x2 ham/spam confusion matrix as an annotated heatmap.

    Args:
        y_true: Actual labels (0 = ham, 1 = spam).
        y_pred: Predicted labels, in the same encoding and order as y_true.
        title (str): Heading shown above the heatmap.
    """
    class_names = ['Ham', 'Spam']

    # Pin the label order explicitly so the matrix is always 2x2 and its
    # rows/columns line up with the tick labels, even when one class is
    # missing from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Print every model's test-set metrics and show its confusion matrix
print('\nModel Performance Comparison:')
for name, results in best_models.items():
    print(f'\n{name}:')
    metrics = results['metrics']
    for label, key in (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1'),
    ):
        print(f'{label}: {metrics[key]: .4f}')

    # Re-run the stored model on the test set to draw its confusion matrix
    y_pred = results['model'].predict(X_test)
    plot_confusion_matrix(y_test, y_pred, f'{name} Confusion Matrix')

## 6. Model Deployment

In [None]:
def save_model(model, filename):
    """Pickle ``model`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.

    Args:
        model: Object to serialize.
        filename: Path of the file to write.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)

def predict_spam(message, model):
    """
    Classify a single message as spam or not spam.

    The message is cleaned with ``preprocess_text`` and passed to the model
    as a one-element batch.

    Args:
        message (str): Input message to classify
        model: Trained model exposing ``predict``; a prediction of 1 means spam

    Returns:
        str: 'Spam' or 'Not Spam'
    """
    cleaned = preprocess_text(message)

    # predict() works on batches, so wrap the message and take the only result
    label = model.predict([cleaned])[0]

    if label == 1:
        return 'Spam'
    return 'Not Spam'

# Pick the entry with the highest test-set F1 score
best_model_name, best_entry = max(
    best_models.items(),
    key=lambda entry: entry[1]['metrics']['f1'],
)
best_model = best_entry['model']

# Persist the winning model to disk
save_model(best_model, 'spam_detector_model.pkl')

# Try the selected model on a few hand-picked messages
test_messages = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.",
    "Hey, how are you? I'm doing great!",
    "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward!"
]

print('\nTesting deployed model:')
for message in test_messages:
    print(f'\nMessage: {message}')
    verdict = predict_spam(message, best_model)
    print(f'Prediction: {verdict}')