In [None]:
import pandas as pd

data = pd.read_csv("enron_spam_data.csv", engine="python", on_bad_lines="skip")
data = data[['Message', 'Spam/Ham']]
data.columns = ['text', 'label']
data.dropna(inplace=True)


In [None]:
def remap_label(row):
    text = row['text']
    if row['label'] == 'spam':
        return 'Spam'
    if 'offer' in text or 'discount' in text or 'sale' in text:
        return 'Promotions'
    if 'support' in text or 'issue' in text or 'help' in text:
        return 'Support'
    return 'Personal'

data['label'] = data.apply(remap_label, axis=1)


In [None]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text

data['text'] = data['text'].apply(clean_text)


In [None]:
data['text'] = data['text'].astype(str)
data = data[data['text'].str.strip() != '']
data.dropna(inplace=True)


In [None]:
X = data['text']
y = data['label']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

tfidf_model = LogisticRegression(max_iter=1000)
tfidf_model.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred_tfidf = tfidf_model.predict(X_test_tfidf)
confusion_matrix(y_test, y_pred_tfidf)
classification_report(y_test, y_pred_tfidf)


'              precision    recall  f1-score   support\n\n    Personal       0.86      0.99      0.92      1175\n  Promotions       0.93      0.54      0.68       236\n        Spam       1.00      1.00      1.00       596\n     Support       0.85      0.65      0.74       339\n\n    accuracy                           0.90      2346\n   macro avg       0.91      0.79      0.83      2346\nweighted avg       0.90      0.90      0.89      2346\n'

In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.1-py3-none-any.whl (293 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=4498211 sha256=e0cb53dc3ccd1c1

In [None]:
import fasttext

with open("emails.txt", "w") as f:
    for t in X:
        f.write(t + "\n")

ft_model = fasttext.train_unsupervised("emails.txt", model="skipgram")


In [None]:
import numpy as np

def ft_vector(text):
    return ft_model.get_sentence_vector(text)

X_train_ft = np.array([ft_vector(t) for t in X_train])
X_test_ft = np.array([ft_vector(t) for t in X_test])


In [None]:
from sklearn.ensemble import RandomForestClassifier

ft_model_clf = RandomForestClassifier(n_estimators=200)
ft_model_clf.fit(X_train_ft, y_train)


In [None]:
y_pred_ft = ft_model_clf.predict(X_test_ft)
confusion_matrix(y_test, y_pred_ft)
classification_report(y_test, y_pred_ft)


'              precision    recall  f1-score   support\n\n    Personal       0.78      0.98      0.87      1175\n  Promotions       0.84      0.39      0.53       236\n        Spam       1.00      1.00      1.00       596\n     Support       0.78      0.37      0.51       339\n\n    accuracy                           0.84      2346\n   macro avg       0.85      0.68      0.73      2346\nweighted avg       0.84      0.84      0.81      2346\n'

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
def transformer_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()


In [None]:
X_train_emb = np.array([transformer_embedding(t) for t in X_train])
X_test_emb = np.array([transformer_embedding(t) for t in X_test])


In [None]:
genai_model = LogisticRegression(max_iter=1000)
genai_model.fit(X_train_emb, y_train)


In [None]:
y_pred_genai = genai_model.predict(X_test_emb)
confusion_matrix(y_test, y_pred_genai)
classification_report(y_test, y_pred_genai)


'              precision    recall  f1-score   support\n\n    Personal       0.81      0.90      0.85      1175\n  Promotions       0.70      0.52      0.59       236\n        Spam       1.00      1.00      1.00       596\n     Support       0.58      0.47      0.52       339\n\n    accuracy                           0.82      2346\n   macro avg       0.77      0.72      0.74      2346\nweighted avg       0.81      0.82      0.82      2346\n'

In [None]:
def predict_email(email):
    email = clean_text(email)
    emb = transformer_embedding(email).reshape(1, -1)
    return genai_model.predict(emb)[0]

predict_email("Get 50 percent discount now")


'Promotions'