In [None]:
import os

# Locate the SMS spam dataset and expose its location as `file_path` for the cells below.
# Candidates are tried in order and the first one that is a regular file wins.
# NOTE(review): the two absolute C:/Users/USER/... entries are machine-specific; they are
# kept only so existing setups keep working.
possible_paths = [
    'SMS Spam',
    './SMS Spam',
    '../SMS Spam',
    r'C:/Users/USER/Desktop/Newtopy/SMS Spam',
    'sms.csv',
    './sms.csv',
    '../sms.csv',
    './main/sms.csv',
    r'C:/Users/USER/Desktop/Newtopy/sms.csv'
]
print('Kernel working directory:', os.getcwd())
found = None
for p in possible_paths:
    # isfile() rather than exists(): a *directory* that happens to be named 'SMS Spam'
    # would pass an existence check but cannot be loaded as a dataset later on.
    if os.path.isfile(p):
        found = p
        print(f'  Found: {p}')
        break
if found is None:
    # Fail early with the full list of attempted locations so the fix is obvious.
    raise FileNotFoundError(f"SMS dataset not found. Checked: {possible_paths}\nPlace 'SMS Spam' or 'sms.csv' in the project root or the main/ folder, or update the path.")
file_path = found
print(f'Using file_path = {file_path}')


Kernel working directory: c:\Users\USER\Desktop\SPAM DETECTION\main
  Found: ../SMS Spam
Using file_path = ../SMS Spam


In [None]:
# Sanity check: report whether the path stored in `file_path` exists and, if so, its size in MB.
import os
if not os.path.exists(file_path):
    print(f'✗ File not found: {file_path}')
else:
    # getsize() returns bytes; divide by 1024 * 1024 to express the size in megabytes.
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f'✓ File found: {file_path}')
    print(f'  Size: {size_mb:.2f} MB')


✓ File found: ../SMS Spam
  Size: 0.46 MB


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import joblib

# --- 1. Define Preprocessing Function (Must match the one in app.py) ---
def preprocess_text(text):
    """Normalize a raw message into lowercase letters separated by single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` is treated as empty.

    Returns:
        The lowercased text with every character other than ``a``-``z`` and
        whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped. Returns ``""`` for non-string input.
    """
    if isinstance(text, str):
        # Lowercase first so the character filter only needs to whitelist a-z.
        letters_and_spaces = re.sub(r'[^a-z\s]', '', text.lower())
        # Collapse tabs, newlines and repeated spaces, then trim both ends.
        return re.sub(r'\s+', ' ', letters_and_spaces).strip()
    # Non-string values are mapped to an empty document.
    return ""

# --- 2. Load the Dataset ---
# The file is tab-separated and has no header. Read it from `file_path`, which the
# dataset-detection cell at the top of the notebook resolves, instead of repeating a
# hard-coded relative path that is only valid from one working directory.
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
    encoding='latin-1',
    on_bad_lines='skip',
)

# Initial inspection and cleanup
print("Initial Data Shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head())
print("\nData Info:")
data.info()

# --- 3. Prepare Data ---
# Keep only rows with a recognised label and a present message. Without this check,
# any unexpected or missing label would silently be encoded as ham (0) below.
is_valid_row = data['label'].isin(['ham', 'spam']) & data['message'].notna()
if not is_valid_row.all():
    print(f"\nDropping {int((~is_valid_row).sum())} malformed row(s) with unknown label or missing message")
    data = data.loc[is_valid_row].reset_index(drop=True)

# Convert labels to numerical format: 'ham' -> 0, 'spam' -> 1
data['label_encoded'] = (data['label'] == 'spam').astype(int)

# Apply text cleaning
data['cleaned_message'] = data['message'].apply(preprocess_text)

# Check label distribution to confirm stratification necessity
print("\nLabel Distribution:")
print(data['label'].value_counts())

# --- 4. Split Data ---
X_train_msg, X_test_msg, y_train, y_test = train_test_split(
    data['cleaned_message'],
    data['label_encoded'],
    test_size=0.2,
    random_state=42,
    # Stratify to maintain the same spam/ham ratio in train/test sets
    stratify=data['label_encoded']
)

# --- 5. Vectorization (TF-IDF) ---
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer only on the training data and transform both train and test sets,
# so no vocabulary/IDF information leaks from the test split into training.
X_train = vectorizer.fit_transform(X_train_msg)
X_test = vectorizer.transform(X_test_msg)

# --- 6. Model Training (Multinomial Naive Bayes) ---
model = MultinomialNB()
model.fit(X_train, y_train)

# --- 7. Evaluation ---
y_pred = model.predict(X_test)

# Calculate metrics (using zero_division=0 to handle potential division by zero if a class has no predictions)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# --- 8. Save Final Model and Vectorizer ---
# Both artifacts are written to the kernel's current working directory.
joblib.dump(model, 'sms_spam_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

print("\nModel Training Complete")
print("Accuracy:", round(accuracy, 4))
print("Precision:", round(precision, 4))
print("Recall:", round(recall, 4))
print("F1-Score:", round(f1, 4))
print("\nFiles saved: sms_spam_model.joblib, tfidf_vectorizer.joblib")

ModuleNotFoundError: No module named 'pandas'