In [None]:
### Email Spam Detection Preprocessing.ipynb

# 1. Import libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 2. Load the dataset
file_path = '/mnt/data/combined_data.csv'
data = pd.read_csv(file_path)

# 3. Quick look at the data
print("First few rows:")
print(data.head())
print("\nData Info:")
print(data.info())

# 4. Check columns
print("\nColumns:", data.columns.tolist())

# Assuming the columns are ['text', 'label']
# If different, you can rename like this:
# data.rename(columns={'your_text_column': 'text', 'your_label_column': 'label'}, inplace=True)

# 5. Basic Cleaning Function
def clean_text(text):
    text = str(text).lower()                            # Lowercase
    text = re.sub(r'<.*?>', '', text)                   # Remove HTML tags
    text = re.sub(r'http\S+', '', text)                 # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)                # Remove punctuation
    text = re.sub(r'\d+', '', text)                    # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()             # Remove extra spaces
    return text

# 6. Apply cleaning to the text data
data['clean_text'] = data['text'].apply(clean_text)

# 7. Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer(max_features=3000)  # Limit to top 3000 features
X = vectorizer.fit_transform(data['clean_text'])

# 8. Target Variable
y = data['label']

# 9. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 10. Save processed data (Optional)
# from scipy import sparse
# sparse.save_npz('/mnt/data/X_train.npz', X_train)
# np.save('/mnt/data/y_train.npy', y_train)

print("\nPreprocessing Complete!")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


First few rows:
   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None

Columns: ['label', 'text']

Preprocessing Complete!
Training samples: 66758
Testing samples: 16690


In [3]:
print(data['label'].value_counts())


label
1    43910
0    39538
Name: count, dtype: int64


In [4]:
# 1. Import classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# 2. Initialize and train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# 3. Predict on test data
y_pred = model.predict(X_test)

# 4. Evaluate
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.9590173756740563
Precision: 0.9641049240681087
Recall: 0.9574954296160878
F1 Score: 0.9607888099059849

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96      7938
           1       0.96      0.96      0.96      8752

    accuracy                           0.96     16690
   macro avg       0.96      0.96      0.96     16690
weighted avg       0.96      0.96      0.96     16690


Confusion Matrix:
 [[7626  312]
 [ 372 8380]]


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Linear SVM": LinearSVC()
}

# Train, Predict and Evaluate
for name, clf in models.items():
    print(f"\n=== {name} ===")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))



=== Logistic Regression ===
Accuracy: 0.9802875973636909
Precision: 0.9758219410236132
Recall: 0.9868601462522852
F1 Score: 0.9813100039765948

=== Random Forest ===
Accuracy: 0.9846015578190533
Precision: 0.978375943236851
Recall: 0.992573126142596
F1 Score: 0.9854234019624525

=== Linear SVM ===
Accuracy: 0.9828639904134212
Precision: 0.9812414733969986
Recall: 0.9861745886654479
F1 Score: 0.9837018463642581


In [None]:
import pickle
# Save the model
with open('spam_classifier_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model and vectorizer saved!")


Model and vectorizer saved!
