## Spam Email Detector

Import Libraries

In [23]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM

Load dataset

In [24]:
# Load the email spam dataset; the columns used below are "title", "text"
# and "type".
df = pd.read_csv(
    "hf://datasets/TrainingDataPro/email-spam-classification/email_spam.csv"
)
# Preview the first rows. A bare `df.head()` is only rendered when it is the
# last expression of a cell, so print it explicitly here.
print(df.head())

# assign X and y
# X: title and body joined into one document per email. Missing values are
# replaced with "" first, because NaN + str yields NaN and would turn the
# whole document into NaN.
X = df["title"].fillna("") + " " + df["text"].fillna("")
# y: the label column (the recorded crosstabs show "spam" / "not spam").
y = df["type"]

# split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# view shape
print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (67,), (67,)
Testing data shape: (17,), (17,)


Preprocess the text

In [None]:
# NOTE: this cell rebinds X_train / X_test from text Series to TF-IDF
# matrices, so re-run the train/test split cell before re-running this one
# (the `.str` accessor used below only exists on the text Series).

# convert all text to lowercase
X_train = X_train.str.lower()
X_test = X_test.str.lower()

# vectorize text: unigrams and bigrams, English stop words removed, terms
# must appear in at least 2 training documents, vocabulary capped at 1000
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=1000,
    min_df=2,
    ngram_range=(1, 2),
)

# learn the vocabulary from the training split only, then apply the same
# vocabulary to the test split
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# view shape
print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (67, 2065), (67,)
Testing data shape: (17, 2065), (17,)


Initialise and train the Isolation Forest model

In [26]:
# Build and fit the detector in one step; the labels (y_train) are not used.
# contamination=0.31 presumably mirrors the expected spam share — TODO confirm.
model = IsolationForest(
    random_state=42,
    contamination=0.31,
).fit(X_train)

# Score the held-out emails: a per-sample anomaly score plus the 1 / -1
# labels shown in the printed output.
anomaly_scores = model.decision_function(X_test)
predictions = model.predict(X_test)

# Show the raw results.
print(f"Predictions: {predictions}")
print(f"Anomaly scores: {anomaly_scores}")

# Cross-tabulate predicted labels (rows) against the true labels (columns),
# including row/column totals.
print(
    pd.crosstab(
        index=predictions,
        columns=y_test,
        margins=True,
    )
)

Predictions: [ 1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1 -1]
Anomaly scores: [ 0.00805365  0.02265909  0.02068973  0.04306854  0.01739172  0.01613712
  0.02735531  0.01925921 -0.0193898   0.03409306  0.02759595  0.01767208
  0.00807727  0.00712183  0.00889083  0.00085627 -0.01871754]
type   not spam  spam  All
row_0                     
-1            1     1    2
1            10     5   15
All          11     6   17


Try One-Class SVM

In [27]:
# create model
# NOTE(review): in the recorded run 15 of the 17 test emails are labelled -1
# and every decision score is within ~1e-3 of zero, so the model barely
# separates anything. gamma="auto" may be a poor kernel width for TF-IDF
# features; consider comparing against gamma="scale" — TODO confirm.
model = OneClassSVM(gamma="auto", nu=0.1)
# train model (unsupervised: y_train is not used)
model.fit(X_train)
# predict on test data
predictions = model.predict(X_test)
anomaly_scores = model.decision_function(X_test)
# view predictions
print(f"Predictions: {predictions}")
print(f"Anomaly scores: {anomaly_scores}")
# crosstab: predicted labels (rows) vs. true labels (columns), with totals
print(pd.crosstab(predictions, y_test, margins=True))

Predictions: [-1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1  1 -1 -1 -1]
Anomaly scores: [-2.66971446e-04 -2.70332428e-04 -5.39767325e-04 -8.23377061e-05
 -2.62834284e-04 -2.66861474e-04 -3.10403397e-04 -5.43192794e-04
  1.31713227e-04 -3.82575149e-04 -5.86165965e-04 -4.47082052e-04
 -3.12369359e-04  1.35044752e-06 -1.38845529e-04 -2.14232130e-04
 -1.05998370e-04]
type   not spam  spam  All
row_0                     
-1           10     5   15
1             1     1    2
All          11     6   17


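Neither model separates spam from non-spam on the test set: in both crosstabs the flagged and unflagged groups contain a similar mix of the two classes. Is the dataset (67 training emails) too small for this unsupervised approach?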