In [2]:
import re
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Cache for the most recently loaded vectorizer. The path it was loaded from
# is remembered next to it so that asking for a different ``vectorizer_path``
# reloads instead of silently reusing the previous vectorizer.
_VECTORIZER_CACHE = None
_VECTORIZER_CACHE_PATH = None

# Becomes True once the NLTK resources were found (or fetched successfully),
# so the lookup/download is not repeated on every call.
_NLTK_DATA_READY = False

# NLTK English stop words merged with the email-specific junk words; built
# lazily on first use because it needs the NLTK corpora to be present.
_STOP_WORDS_CACHE = None

# (resource path checked with nltk.data.find, package name for nltk.download)
# NOTE(review): newer NLTK releases appear to look up 'tokenizers/punkt_tab'
# for word_tokenize -- confirm against the installed NLTK version.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
)

# Email header / markup words treated as stop words.
_EMAIL_JUNK_WORDS = frozenset({
    're', 'fw', 'fwd', 'subject', 'date', 'from', 'to', 'cc',
    'spama', 'spamassassin', 'razor', 'exmh', 'rpm-l',
    'nbsp', 'html', 'font', 'http', 'https',
})

# Placeholder tokens inserted by the normalisation step; they are not purely
# alphabetic, so they must be whitelisted explicitly when filtering tokens.
_PLACEHOLDER_TOKENS = frozenset({'_URL_', '_EMAIL_', '_NUM_'})

_REPLY_QUOTE_RE = re.compile(r'^\s*>\s?', flags=re.MULTILINE)
_SEPARATOR_RE = re.compile(r'[-_=]{4,}')
# 'https...' is already covered by the 'http\S+' alternative.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_NUMBER_RE = re.compile(r'\d+')


def _ensure_nltk_data():
    """Check the required NLTK resources once and download any that are missing."""
    global _NLTK_DATA_READY

    if _NLTK_DATA_READY:
        return

    missing = []
    for resource_path, package in _NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if not missing:
        _NLTK_DATA_READY = True
        return

    print("Downloading required NLTK data...")
    # Build a list (not a generator) so every package is attempted even if an
    # earlier download reports failure. If any download fails, the flag stays
    # False and the check is retried on the next call.
    outcomes = [nltk.download(package, quiet=True) for package in missing]
    _NLTK_DATA_READY = all(outcomes)


def _load_vectorizer(vectorizer_path):
    """Return the vectorizer for ``vectorizer_path``, loading it at most once per path."""
    global _VECTORIZER_CACHE, _VECTORIZER_CACHE_PATH

    # A cached vectorizer with no recorded path was assigned to the module
    # global directly by other code; keep honouring it as before.
    if _VECTORIZER_CACHE is not None and _VECTORIZER_CACHE_PATH in (None, vectorizer_path):
        return _VECTORIZER_CACHE

    try:
        # joblib.load unpickles the file: only point this at trusted files.
        vectorizer = joblib.load(vectorizer_path)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Vectorizer not found at {vectorizer_path}. "
            "Please ensure the model has been trained and saved."
        ) from err

    _VECTORIZER_CACHE = vectorizer
    _VECTORIZER_CACHE_PATH = vectorizer_path
    print(f"Loaded vectorizer from {vectorizer_path}")
    return vectorizer


def _get_stop_words():
    """Return the combined stop-word set, building it on first use."""
    global _STOP_WORDS_CACHE

    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | _EMAIL_JUNK_WORDS
    return _STOP_WORDS_CACHE


def _clean_email_text(email_text):
    """Run the cleaning pipeline and return the space-joined lemmatized tokens."""
    # Step 1: fix encoding & remove junk
    text = email_text.replace('\u00a0', ' ')   # non-breaking space
    text = _REPLY_QUOTE_RE.sub('', text)       # leading '>' reply quotes
    text = _SEPARATOR_RE.sub('', text)         # runs of 4+ '-', '_' or '='

    # Step 2: lowercase (done before the placeholders are inserted, so the
    # upper-case placeholder tokens survive)
    text = text.lower()

    # Step 3: replace URLs, email addresses and numbers with placeholder
    # tokens while the punctuation that identifies them is still present
    text = _URL_RE.sub(' _URL_ ', text)
    text = _EMAIL_RE.sub(' _EMAIL_ ', text)
    text = _NUMBER_RE.sub(' _NUM_ ', text)

    # Step 4: tokenize, falling back to whitespace splitting if the NLTK
    # tokenizer fails for any reason
    try:
        tokens = word_tokenize(text)
    except Exception:
        tokens = text.split()

    # Step 5: keep placeholders and purely alphabetic non-stop-words
    stop_words = _get_stop_words()
    kept_tokens = [
        token for token in tokens
        if token in _PLACEHOLDER_TOKENS
        or (token.isalpha() and token not in stop_words)
    ]

    # Steps 6-7: lemmatize and join
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)


def preprocess_email_for_inference(email_text, vectorizer_path='l1_vectorizer.pkl'):
    """
    Preprocess a single email text for phishing detection inference.

    The text is cleaned (reply quotes and separators removed, lowercased,
    URLs / email addresses / numbers replaced by ``_URL_`` / ``_EMAIL_`` /
    ``_NUM_``, tokenized, stop words and non-alphabetic tokens dropped,
    lemmatized) and then transformed with the saved vectorizer.

    The vectorizer is cached per path: it is loaded from disk on the first
    call and reloaded only when a different ``vectorizer_path`` is requested.

    Args:
        email_text (str): Raw email text to preprocess
        vectorizer_path (str): Path to the saved TF-IDF vectorizer

    Returns:
        tuple: (cleaned_text, vectorized_features)
            - cleaned_text (str): Preprocessed text string
            - vectorized_features: Output of ``vectorizer.transform([cleaned_text])``
              (a sparse TF-IDF matrix for the saved vectorizer), ready for
              model prediction

    Raises:
        ValueError: If email_text is not a string, is empty/whitespace-only,
            or contains no usable tokens after cleaning
        FileNotFoundError: If vectorizer file is not found
    """
    # Input validation
    if not isinstance(email_text, str):
        raise ValueError(f"Input must be a string, got {type(email_text).__name__}")

    if not email_text.strip():
        raise ValueError("Empty string - no content to process")

    # Ensure NLTK data is available, then load the vectorizer (cached)
    _ensure_nltk_data()
    vectorizer = _load_vectorizer(vectorizer_path)

    cleaned_text = _clean_email_text(email_text)

    # Check if cleaning resulted in empty text
    if not cleaned_text.strip():
        raise ValueError("Empty string after preprocessing - no meaningful content")

    # Vectorize for model prediction
    vectorized_features = vectorizer.transform([cleaned_text])

    return cleaned_text, vectorized_features


# Example usage function

# Models already loaded by predict_phishing, keyed by the path they were
# loaded from, so repeated predictions do not unpickle the model every time.
_MODEL_CACHE = {}


def predict_phishing(email_text, model_path='l1_nlp_model.pkl', vectorizer_path='l1_vectorizer.pkl'):
    """
    Complete pipeline: preprocess email and predict if it's phishing.

    The model is loaded from ``model_path`` on first use and then reused for
    later calls with the same path (a file overwritten on disk afterwards is
    not picked up until the process/cell is restarted).

    Args:
        email_text (str): Raw email text
        model_path (str): Path to saved model
        vectorizer_path (str): Path to saved vectorizer

    Returns:
        dict: On success, a dict with keys
            - 'prediction': label returned by ``model.predict``
            - 'is_phishing' (bool): True if the label equals 'Phishing Email'
            - 'confidence' (float): probability of the predicted label
            - 'probabilities' (dict): label -> probability for every class
              in ``model.classes_``
            - 'cleaned_text' (str): preprocessed text, truncated to 200
              characters plus '...' when longer
        On failure, a dict with a single 'error' key holding the message;
        no exception is propagated to the caller.
    """
    try:
        # Preprocess
        cleaned_text, features = preprocess_email_for_inference(email_text, vectorizer_path)

        # Load model (cached per path) and predict
        model = _MODEL_CACHE.get(model_path)
        if model is None:
            # joblib.load unpickles the file: only point this at trusted files.
            model = joblib.load(model_path)
            _MODEL_CACHE[model_path] = model

        prediction = model.predict(features)[0]
        probabilities = model.predict_proba(features)[0]

        # predict_proba columns follow the order of model.classes_, so zip
        # them together to get the correct label -> probability mapping.
        # Values are converted to plain floats rather than NumPy scalars.
        prob_dict = {
            label: float(prob)
            for label, prob in zip(model.classes_, probabilities)
        }

        if len(cleaned_text) > 200:
            preview = cleaned_text[:200] + '...'
        else:
            preview = cleaned_text

        return {
            'prediction': prediction,
            # bool() so callers get a plain Python bool, not a NumPy one
            'is_phishing': bool(prediction == 'Phishing Email'),
            'confidence': prob_dict[prediction],
            'probabilities': prob_dict,
            'cleaned_text': preview
        }

    except ValueError as e:
        # Validation problems are reported with their own message
        return {'error': str(e)}
    except Exception as e:
        # Deliberate catch-all: this entry point reports failures in the
        # result dict instead of raising
        return {'error': f"Prediction failed: {str(e)}"}


# Smoke test: exercise preprocessing on its own, then the full prediction pipeline
if __name__ == "__main__":
    # Sample message with an urgent tone, a link and a request for credentials
    test_email = """
    URGENT: Your account has been suspended!
    Click here http://suspicious-link.com to verify your identity immediately.
    We need your password and credit card information within 24 hours.
    """

    print("Testing preprocessing function...\n")

    try:
        # Stage 1: preprocessing only (exceptions propagate to the handler below)
        cleaned, features = preprocess_email_for_inference(test_email)
        print("✓ Preprocessing successful!")
        print(f"Cleaned text: {cleaned[:100]}...")
        print(f"Feature shape: {features.shape}")

        print("\n" + "=" * 50)
        print("Testing complete prediction pipeline...\n")

        # Stage 2: full pipeline; failures come back as an 'error' entry
        result = predict_phishing(test_email)
        if 'error' not in result:
            print(f"✓ Prediction: {result['prediction']}")
            print(f"  Is Phishing: {result['is_phishing']}")
            print(f"  Confidence: {result['confidence']:.2%}")
            print(f"  Probabilities:")
            for label, prob in result['probabilities'].items():
                print(f"    - {label}: {prob:.2%}")
        else:
            print(f"✗ Error: {result['error']}")

    except Exception as e:
        print(f"✗ Test failed: {str(e)}")

Testing preprocessing function...

Downloading required NLTK data...
Loaded vectorizer from l1_vectorizer.pkl
✓ Preprocessing successful!
Cleaned text: account click _URL_ verify identity need password credit card information within _NUM_...
Feature shape: (1, 5000)

Testing complete prediction pipeline...

Downloading required NLTK data...
✓ Prediction: Phishing Email
  Is Phishing: True
  Confidence: 94.54%
  Probabilities:
    - Phishing Email: 94.54%
    - Safe Email: 5.46%


Downloading required NLTK data...


('account click _URL_ verify identity need password credit card information within _NUM_',
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 12 stored elements and shape (1, 5000)>)