In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
# 1. Read the dataset
df = pd.read_csv('/kaggle/input/email-spam-classification-dataset-csv/emails.csv')

# 2. Load the dataset (view top rows & class balance)
print(df.head())
print("\nClass Distribution:\n", df['spam'].value_counts())


  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]

Class Distribution:
 spam
0    5104
1      57
2       5
4    

In [5]:
# 3. Preprocess the dataset
def clean_text(text):
    text = str(text)  # Ensure the input is a string
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove links
    text = re.sub(r'\W', ' ', text)                     # remove non-alphanumeric
    text = re.sub(r'\s+', ' ', text)                    # remove extra spaces
    return text

# Apply preprocessing
df['text'] = df['text'].apply(clean_text)


In [7]:
# Remove empty or whitespace-only strings after cleaning
df['text'] = df['text'].apply(clean_text)
df = df[df['text'].str.strip() != '']


In [12]:
# Preprocessing
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['text'] = df['text'].apply(clean_text)

# Remove empty texts
df = df[df['text'].str.strip() != '']
df.dropna(subset=['text', 'spam'], inplace=True)

print(df['text'].head())

0    0
1    0
2    0
3    0
4    0
Name: text, dtype: object


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [16]:
print("First 10 rows of text column:\n", df['text'].head(10))
print("Data type of text column:", df['text'].dtype)
print("Number of missing values:", df['text'].isna().sum())
print("Number of empty strings:", df['text'].str.strip().eq('').sum())
print("Sample of non-empty text:", df['text'][df['text'].str.strip() != ''].head())

First 10 rows of text column:
 0    0
1    0
2    0
3    0
4    0
5    1
6    0
7    0
8    0
9    0
Name: text, dtype: object
Data type of text column: object
Number of missing values: 0
Number of empty strings: 0
Sample of non-empty text: 0    0
1    0
2    0
3    0
4    0
Name: text, dtype: object


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace 'message' with the actual column name containing text
text_column = 'message'  # Update this based on df.columns output
if text_column not in df.columns:
    raise ValueError(f"Column '{text_column}' not found. Available columns: {df.columns}")

# Ensure text is string and handle missing values
df[text_column] = df[text_column].astype(str).fillna('')

# Check for empty text
if df[text_column].str.strip().eq('').all():
    raise ValueError(f"All entries in '{text_column}' are empty. Check your data:\n{df[text_column].head()}")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df[text_column])

# Labels (use the 'spam' column)
y = df['spam']

print("Vectorized data shape:", X.shape)
print("Sample features:", vectorizer.get_feature_names_out()[:10])

Vectorized data shape: (5172, 4)
Sample features: ['11' '13' '15' '16']


In [19]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes model

In [21]:
# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)


In [22]:
# Predictions
y_pred = model.predict(X_test)

In [23]:
# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[1024    0]
 [  11    0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1024
           1       0.00      0.00      0.00        11

    accuracy                           0.99      1035
   macro avg       0.49      0.50      0.50      1035
weighted avg       0.98      0.99      0.98      1035

Accuracy Score: 0.9893719806763285


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
import joblib

# Save the trained Naive Bayes model
joblib.dump(model, '/kaggle/working/email_spam_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, '/kaggle/working/tfidf_vectorizer.pkl')

print("Model and vectorizer saved to /kaggle/working/")


Model and vectorizer saved to /kaggle/working/


In [26]:
import pandas as pd

# Load your dataset (adjust path if needed)
df = pd.read_csv('/kaggle/input/email-spam-classification-dataset-csv/emails.csv', encoding='latin-1')
print("Column names:", df.columns)
print("First 5 rows:\n", df.head())

Column names: Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)
First 5 rows:
   Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0     

In [27]:
!pip install gradio --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.3/323.3 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [40]:
import pandas as pd
import joblib
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import gradio as gr
import re
import string

# Initialize model and vectorizer as None
model = None
vectorizer = None

def preprocess_text(text):
    """Clean and preprocess email text"""
    if not text:
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

def load_or_create_model():
    """Load existing model or create a new one"""
    global model, vectorizer
    
    try:
        # Try to load pre-trained model and vectorizer
        model = joblib.load('/kaggle/input/email-spam/keras/default/1/email_spam_model.pkl')
        vectorizer = joblib.load('/kaggle/input/email-spam/keras/default/1/tfidf_vectorizer.pkl')
        print("✅ Loaded pre-trained model and vectorizer")
        return True
    except:
        print("⚠️ Pre-trained model not found, creating new model...")
        
    try:
        # Load and prepare dataset
        df = pd.read_csv('/kaggle/input/email-spam/spam.csv', encoding='latin-1')
        
        # Assume the dataset has columns like 'v1' (label) and 'v2' (text)
        # Adjust column names based on your actual dataset structure
        if 'v1' in df.columns and 'v2' in df.columns:
            df = df[['v1', 'v2']].dropna()
            df.columns = ['label', 'text']
        elif 'Prediction' in df.columns:
            # If your dataset has different structure
            text_cols = [col for col in df.columns if col not in ['Email No.', 'Prediction']]
            if text_cols:
                df['text'] = df[text_cols].astype(str).agg(' '.join, axis=1)
                df['label'] = df['Prediction'].map({1: 'spam', 0: 'ham'})
                df = df[['label', 'text']].dropna()
        else:
            raise ValueError("Unable to identify text and label columns in dataset")
        
        # Preprocess text
        df['text'] = df['text'].apply(preprocess_text)
        
        # Create and train model
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            max_df=0.95,
            min_df=2
        )
        
        # Create pipeline
        model = Pipeline([
            ('tfidf', vectorizer),
            ('classifier', MultinomialNB(alpha=0.1))
        ])
        
        # Train model
        X_train, X_test, y_train, y_test = train_test_split(
            df['text'], df['label'], test_size=0.2, random_state=42
        )
        
        model.fit(X_train, y_train)
        
        # Get accuracy
        accuracy = model.score(X_test, y_test)
        print(f"✅ Model trained successfully! Accuracy: {accuracy:.3f}")
        
        return True
        
    except Exception as e:
        print(f"❌ Error creating model: {str(e)}")
        return False

def create_fallback_classifier():
    """Create a simple keyword-based classifier as fallback"""
    global model, vectorizer
    
    # High-confidence spam indicators
    high_spam_keywords = [
        'congratulations', 'winner', 'won', 'prize', 'lottery', 'jackpot',
        'urgent', 'act now', 'limited time', 'expires', 'hurry',
        'click here', 'click now', 'claim now', 'visit now',
        'free money', 'easy money', 'make money', 'earn money',
        'viagra', 'cialis', 'pharmacy', 'medication', 'pills',
        'hot singles', 'meet singles', 'dating', 'lonely', 'romance',
        'nigerian prince', 'inheritance', 'million dollars'
    ]
    
    # Medium spam indicators
    medium_spam_keywords = [
        'free', 'offer', 'deal', 'discount', 'sale', 'promotion',
        'limited', 'exclusive', 'special', 'bonus', 'gift',
        'money', 'cash', 'earn', 'income', 'profit', 'guarantee',
        'loan', 'credit', 'debt', 'refinance', 'mortgage',
        'weight loss', 'lose weight', 'diet', 'supplements',
        'casino', 'gambling', 'bet', 'lottery', 'scratch',
        'work from home', 'make money online', 'business opportunity'
    ]
    
    # Spam phrases (exact matches)
    spam_phrases = [
        'hot singles in your area', 'meet singles near you',
        'you have won', 'you are a winner', 'claim your prize',
        'act now', 'limited time offer', 'expires soon',
        'click here now', 'visit our website', 'call now',
        'no credit check', 'guaranteed approval', 'risk free',
        'work from home', 'make money fast', 'easy money'
    ]
    
    def fallback_predict(text):
        if not text:
            return "ham"
        
        text_lower = text.lower().strip()
        
        # Check for exact spam phrases first
        for phrase in spam_phrases:
            if phrase in text_lower:
                return "spam"
        
        # Count high-confidence spam keywords (weight = 3)
        high_spam_score = sum(3 for keyword in high_spam_keywords if keyword in text_lower)
        
        # Count medium-confidence spam keywords (weight = 1)
        medium_spam_score = sum(1 for keyword in medium_spam_keywords if keyword in text_lower)
        
        total_spam_score = high_spam_score + medium_spam_score
        
        # Classification logic
        if high_spam_score >= 3:  # Any high-confidence keyword = spam
            return "spam"
        elif total_spam_score >= 4:  # Multiple medium keywords = spam
            return "spam"
        elif total_spam_score >= 2 and any(urgent in text_lower for urgent in ['urgent', '!', 'now', 'immediately']):
            return "spam"  # Medium score + urgency = spam
        else:
            return "ham"
    
    model = fallback_predict
    vectorizer = None
    print("✅ Created improved fallback keyword-based classifier")

def predict_email(email_text):
    """Predict if email is spam or ham"""
    global model, vectorizer
    
    # Initialize model if not done already
    if model is None:
        success = load_or_create_model()
        if not success:
            create_fallback_classifier()
    
    # Handle empty input
    if not email_text or email_text.strip() == "":
        return "📝 Please enter an email or message to classify."
    
    try:
        # Preprocess input
        cleaned_text = preprocess_text(email_text)
        
        if hasattr(model, 'predict'):
            # Scikit-learn pipeline model
            prediction = model.predict([cleaned_text])[0]
            
            # Get probability if available
            if hasattr(model, 'predict_proba'):
                proba = model.predict_proba([cleaned_text])[0]
                if prediction == 'spam':
                    confidence = max(proba)
                    return f"🚨 **SPAM** (ML Confidence: {confidence:.1%})"
                else:
                    confidence = max(proba)
                    return f"✅ **HAM** (ML Confidence: {confidence:.1%})"
            else:
                return f"🚨 **SPAM** (ML Model)" if prediction == 'spam' else "✅ **HAM** (ML Model)"
        else:
            # Fallback classifier with detailed reasoning
            prediction = model(email_text)
            
            # Provide reasoning for keyword-based classification
            text_lower = email_text.lower()
            detected_keywords = []
            
            # Check what triggered the classification
            high_spam_keywords = [
                'congratulations', 'winner', 'won', 'prize', 'hot singles',
                'urgent', 'act now', 'click here', 'free money', 'viagra'
            ]
            medium_spam_keywords = [
                'free', 'offer', 'money', 'deal', 'limited', 'exclusive'
            ]
            
            for keyword in high_spam_keywords:
                if keyword in text_lower:
                    detected_keywords.append(f"'{keyword}' (high)")
            
            for keyword in medium_spam_keywords:
                if keyword in text_lower and keyword not in ' '.join(detected_keywords):
                    detected_keywords.append(f"'{keyword}' (med)")
                    
            if prediction == 'spam':
                if detected_keywords:
                    keywords_text = ', '.join(detected_keywords[:3])  # Show max 3
                    return f"🚨 **SPAM** (Keywords: {keywords_text})"
                else:
                    return f"🚨 **SPAM** (Pattern-based)"
            else:
                return f"✅ **HAM** (No spam indicators found)"
            
    except Exception as e:
        # Last resort: simple keyword check with error handling
        spam_indicators = ['win', 'free', 'prize', 'click', 'offer', 'urgent', 'congratulations', 'hot singles']
        detected = [word for word in spam_indicators if word in email_text.lower()]
        
        if detected:
            return f"🚨 **SPAM** (Detected: {', '.join(detected[:2])} - Backup mode)"
        else:
            return f"✅ **HAM** (Backup mode - Error: {str(e)[:30]}...)"

# Example texts for testing
example_texts = [
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now!",
    "Hey, how are you doing? Want to meet for coffee later?",
    "URGENT: Your account will be suspended. Click here immediately!",
    "Hi mom, just wanted to let you know I arrived safely.",
    "FREE VIAGRA! Limited time offer! Act now!",
    "Meeting scheduled for tomorrow at 2 PM in conference room A."
]

# Create Gradio interface
iface = gr.Interface(
    fn=predict_email,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter your email content here...\n\nExample: 'Congratulations! You've won a prize!' or 'Hi, how are you?'",
        label="Email Content"
    ),
    outputs=gr.Textbox(label="Classification Result"),
    title="📧 Email Spam Classifier",
    description="""
    **Enter any email content to check if it's Spam or Ham (legitimate email).**
    
    The classifier uses machine learning to analyze text patterns and identify spam characteristics.
    
    🚨 **Spam**: Promotional, suspicious, or unwanted emails  
    ✅ **Ham**: Legitimate, normal emails
    """,
    examples=example_texts,
    theme=gr.themes.Soft(),
    allow_flagging="never"
)

# Launch the interface
if __name__ == "__main__":
    print("🚀 Starting Email Spam Classifier...")
    iface.launch(share=True)



🚀 Starting Email Spam Classifier...
* Running on local URL:  http://127.0.0.1:7867
* Running on public URL: https://570d7c2fba26e91386.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


⚠️ Pre-trained model not found, creating new model...
❌ Error creating model: [Errno 2] No such file or directory: '/kaggle/input/email-spam/spam.csv'
✅ Created improved fallback keyword-based classifier


In [41]:
import pandas as pd
import joblib
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import gradio as gr
import re
import string
import random

# Initialize model and vectorizer as None
model = None
vectorizer = None

def preprocess_text(text):
    """Clean and preprocess text"""
    if not text:
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits but keep basic punctuation
    text = re.sub(r'[^\w\s!?.,]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

def create_smart_classifier():
    """Create an enhanced keyword and pattern-based classifier"""
    global model, vectorizer
    
    # Enhanced spam indicators with weights
    spam_patterns = {
        # High confidence spam indicators (weight: 5)
        'high_confidence': [
            'congratulations', 'winner', 'won', 'prize', 'lottery', 'jackpot',
            'nigerian prince', 'inheritance', 'million dollars', 'bank transfer',
            'hot singles', 'meet singles', 'lonely tonight', 'sexy',
            'viagra', 'cialis', 'pharmacy', 'medication', 'pills',
            'click here now', 'claim now', 'act immediately', 'limited time',
            'you have been selected', 'final notice', 'urgent response required'
        ],
        
        # Medium confidence spam indicators (weight: 3)
        'medium_confidence': [
            'free money', 'easy money', 'make money', 'earn money', 'quick cash',
            'work from home', 'business opportunity', 'guaranteed income',
            'lose weight fast', 'weight loss', 'diet pills', 'supplements',
            'casino', 'gambling', 'bet now', 'lottery ticket',
            'no credit check', 'guaranteed approval', 'loan approved',
            'refinance', 'mortgage', 'debt relief', 'consolidation',
            'risk free', 'money back guarantee', 'no obligation'
        ],
        
        # Lower confidence spam indicators (weight: 2)
        'low_confidence': [
            'free', 'offer', 'deal', 'discount', 'sale', 'promotion',
            'limited', 'exclusive', 'special', 'bonus', 'gift',
            'urgent', 'hurry', 'expires', 'deadline', 'now',
            'click', 'visit', 'call', 'order', 'buy',
            'money', 'cash', 'earn', 'profit', 'save'
        ]
    }
    
    # Spam phrases that are strong indicators
    spam_phrases = [
        'hot singles in your area', 'meet singles near you',
        'you have won', 'you are a winner', 'claim your prize',
        'act now or lose', 'limited time offer', 'expires today',
        'click here now', 'visit our website now', 'call now',
        'no credit check required', 'guaranteed approval',
        'work from home', 'make money fast', 'easy money online',
        'lose weight without', 'diet pills that work',
        'nigerian prince needs help', 'millions of dollars'
    ]
    
    # Legitimate patterns (negative indicators for spam)
    legitimate_patterns = [
        'meeting', 'conference', 'schedule', 'appointment',
        'family', 'friend', 'mom', 'dad', 'brother', 'sister',
        'work', 'office', 'project', 'team', 'colleague',
        'thank you', 'please', 'regards', 'sincerely',
        'birthday', 'anniversary', 'celebration', 'holiday'
    ]
    
    def enhanced_predict(text):
        if not text or len(text.strip()) < 3:
            return "ham", 0, []
        
        text_lower = text.lower().strip()
        detected_patterns = []
        spam_score = 0
        
        # Check for exact spam phrases first (highest weight)
        for phrase in spam_phrases:
            if phrase in text_lower:
                spam_score += 8
                detected_patterns.append(f"'{phrase}' (phrase)")
        
        # Check high confidence keywords
        for keyword in spam_patterns['high_confidence']:
            if keyword in text_lower:
                spam_score += 5
                detected_patterns.append(f"'{keyword}' (high)")
        
        # Check medium confidence keywords
        for keyword in spam_patterns['medium_confidence']:
            if keyword in text_lower:
                spam_score += 3
                detected_patterns.append(f"'{keyword}' (medium)")
        
        # Check low confidence keywords
        for keyword in spam_patterns['low_confidence']:
            if keyword in text_lower:
                spam_score += 2
                detected_patterns.append(f"'{keyword}' (low)")
        
        # Check for legitimate patterns (reduce spam score)
        legitimate_count = 0
        for pattern in legitimate_patterns:
            if pattern in text_lower:
                legitimate_count += 1
        
        # Reduce spam score for legitimate patterns
        spam_score -= legitimate_count * 1.5
        
        # Additional pattern checks
        exclamation_count = text.count('!')
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        
        # Excessive exclamation marks or caps
        if exclamation_count >= 3:
            spam_score += 2
            detected_patterns.append("multiple exclamations")
        
        if caps_ratio > 0.3 and len(text) > 10:
            spam_score += 2
            detected_patterns.append("excessive caps")
        
        # URL detection
        if 'http' in text_lower or 'www.' in text_lower or '.com' in text_lower:
            spam_score += 1
            detected_patterns.append("contains URL")
        
        # Determine classification
        if spam_score >= 8:
            return "spam", min(spam_score * 10, 95), detected_patterns
        elif spam_score >= 5:
            return "spam", min(spam_score * 8, 85), detected_patterns
        elif spam_score >= 3:
            return "spam", min(spam_score * 12, 75), detected_patterns
        else:
            return "ham", max(10, 100 - spam_score * 5), detected_patterns
    
    model = enhanced_predict
    vectorizer = None
    print("✅ Created enhanced pattern-based classifier")

def predict_text(input_text):
    """Predict if any text is spam-like or legitimate"""
    global model
    
    # Initialize model if not done already
    if model is None:
        create_smart_classifier()
    
    # Handle empty input
    if not input_text or input_text.strip() == "":
        return "📝 Please enter some text to classify."
    
    try:
        # Get prediction with confidence and detected patterns
        prediction, confidence, detected_patterns = model(input_text)
        
        # Format the result
        if prediction == 'spam':
            result = f"🚨 **SPAM-LIKE** (Confidence: {confidence}%)"
        else:
            result = f"✅ **LEGITIMATE** (Confidence: {confidence}%)"
        
        # Add detected patterns if any
        if detected_patterns:
            patterns_text = ", ".join(detected_patterns[:4])  # Show max 4 patterns
            result += f"\n\n🔍 **Detected patterns:** {patterns_text}"
            if len(detected_patterns) > 4:
                result += f" (+{len(detected_patterns)-4} more)"
        
        # Add explanation
        if prediction == 'spam':
            result += "\n\n⚠️ This text contains characteristics commonly found in spam or promotional content."
        else:
            result += "\n\n✨ This text appears to be legitimate and natural."
            
        return result
        
    except Exception as e:
        return f"❌ Error analyzing text: {str(e)}"

def generate_random_sentence():
    """Generate a random sentence for testing"""
    sentences = [
        # Legitimate sentences
        "Hey, how are you doing today?",
        "The meeting is scheduled for tomorrow at 3 PM.",
        "Thanks for your help with the project yesterday.",
        "Happy birthday! Hope you have a wonderful day.",
        "Can you please send me the report by Friday?",
        "I'll be working from home tomorrow.",
        "Let's grab coffee sometime this week.",
        "The weather is really nice today, isn't it?",
        "Please review the attached document when you have time.",
        "Looking forward to seeing you at the conference.",
        
        # Spam-like sentences
        "Congratulations! You've won a $1000 gift card!",
        "URGENT: Click here to claim your prize now!",
        "Make money fast with this amazing opportunity!",
        "Hot singles in your area want to meet you!",
        "FREE VIAGRA! Limited time offer, act now!",
        "You have been selected for a special promotion!",
        "Lose weight fast with these miracle pills!",
        "FINAL NOTICE: Your account will be suspended!",
        "Nigerian prince needs your help with millions!",
        "Guaranteed loan approval, no credit check required!",
        
        # Mixed/borderline sentences
        "Special offer just for you - 50% off everything!",
        "Don't miss out on this limited time deal!",
        "Free shipping on your next order!",
        "Urgent: Please update your account information.",
        "Click here to unsubscribe from our mailing list."
    ]
    
    return random.choice(sentences)

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Text Spam Classifier") as iface:
    gr.Markdown("""
    # 🤖 Universal Text Spam Classifier
    
    **Analyze any text to detect spam-like characteristics!**
    
    This classifier can analyze emails, messages, social media posts, or any text content to identify:
    - 🚨 **Spam-like content**: Promotional, suspicious, or unwanted text
    - ✅ **Legitimate content**: Natural, normal communication
    
    Enter any text below or try the random sentence generator!
    """)
    
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                lines=4,
                placeholder="Enter any text here...\n\nExamples:\n• 'Hey, how are you?'\n• 'Congratulations! You won!'\n• 'Meeting at 3 PM tomorrow'",
                label="Text to Analyze"
            )
        
        with gr.Column(scale=1):
            random_btn = gr.Button("🎲 Generate Random Sentence", variant="secondary")
    
    classify_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
    
    result_output = gr.Textbox(
        label="Analysis Result",
        lines=6,
        interactive=False
    )
    
    gr.Markdown("""
    ### 📚 How it works:
    - **Pattern Recognition**: Identifies common spam keywords and phrases
    - **Context Analysis**: Considers legitimate communication patterns
    - **Confidence Scoring**: Provides confidence levels for classifications
    - **Multi-factor Detection**: Analyzes capitalization, punctuation, and URL patterns
    
    ### 🎯 Use Cases:
    - Email filtering and security
    - Social media content moderation
    - Message screening
    - Content quality assessment
    """)
    
    # Event handlers
    classify_btn.click(
        fn=predict_text,
        inputs=text_input,
        outputs=result_output
    )
    
    random_btn.click(
        fn=generate_random_sentence,
        outputs=text_input
    )
    
    text_input.submit(
        fn=predict_text,
        inputs=text_input,
        outputs=result_output
    )

# Launch the interface
if __name__ == "__main__":
    print("🚀 Starting Universal Text Spam Classifier...")
    iface.launch(share=True)

🚀 Starting Universal Text Spam Classifier...
* Running on local URL:  http://127.0.0.1:7868
* Running on public URL: https://368b907736658570da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


✅ Created enhanced pattern-based classifier
