In [215]:
import numpy as np
import pandas as pd

In [216]:
# !pip install datasets

In [217]:
from datasets import load_dataset

ds = load_dataset("Deysi/spam-detection-dataset")

In [218]:
ds['train'][0]

{'text': 'hey I am looking for Xray baggage datasets can you provide me with the same ',
 'label': 'not_spam'}

In [219]:
train_df = pd.DataFrame(ds['train'])
test_df = pd.DataFrame(ds['test'])

In [220]:
train_df.head()

Unnamed: 0,text,label
0,hey I am looking for Xray baggage datasets can...,not_spam
1,"""Get rich quick! Make millions in just days wi...",spam
2,URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE...,spam
3,[Google AI Blog: Contributing Data to Deepfake...,not_spam
4,Trying to see if anyone already has timestamps...,not_spam


In [221]:
test_df.head()

Unnamed: 0,text,label
0,"Deezer.com 10,406,168 Artist DB\n\nWe have sc...",not_spam
1,🚨 ATTENTION ALL USERS! 🚨\n\n🆘 Are you looking ...,spam
2,I'm working on a stats project to test some of...,not_spam
3,"[[Sorry, I cannot generate inappropriate or sp...",spam
4,L@@k at these Unbelievable diet pills that can...,spam


In [222]:
train_df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [223]:
test_df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [224]:
train_df.duplicated().sum()

133

In [225]:
test_df.duplicated().sum()

26

In [226]:
train_df = train_df.drop_duplicates()

In [227]:
test_df = test_df.drop_duplicates()

In [228]:
print(f"null in train : {train_df.isnull().sum()}")
print(f"null in test : {test_df.isnull().sum()}")
print(f"duplicates in train : {train_df.duplicated().sum()}")
print(f"duplicates in test : {test_df.duplicated().sum()}")

null in train : text     0
label    0
dtype: int64
null in test : text     0
label    0
dtype: int64
duplicates in train : 0
duplicates in test : 0


In [229]:
print(f"shape of train : {train_df.shape}")
print(f"shape of test : {test_df.shape}")

shape of train : (8042, 2)
shape of test : (2699, 2)


In [230]:
import re
import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [231]:
# !pip install emoji

In [232]:
import string
import emoji
import nltk
from nltk.corpus import stopwords

In [233]:
#  nltk.download('stopwords')

In [234]:
stop_words = set(stopwords.words('english'))

In [235]:
def replace_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

In [236]:
# Text Cleaning Function
def clean_text(text):
    text = replace_emojis(text)
    text = text.lower().strip()  # Convert to lowercase & trim spaces
    text = re.sub(r'&amp;', '&', text)  # Decode HTML entities
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    # text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove text inside brackets
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'http\S+|www\S+', '[LINK]', text)  # Replace URLs with placeholder
    words = text.split()
    text = ' '.join([word for word in words if word not in stop_words])  # Remove stopwords
    return text

In [237]:
clean_text(train_df['text'][0])

'hey looking xray baggage datasets provide'

In [238]:
test_df['text'][0]

' Deezer.com 10,406,168 Artist DB\n\nWe have scraped the Deezer Artist DB, right now there are 10,406,168 listings according to Deezer.com\n\nPlease note in going through part of the list, it is obvious there are mistakes inside their system.\n\nExamples include and Artist with &amp; in its name might also be found with "and" but the Albums for each have different totals etc. Have no clue if there are duplicate albums etc do this error in their system. Even a comma in a name could mean the Artist shows up more than once, I saw in 1 instance that 1 Artist had 6 different ArtistIDs due to spelling errors.\n\nSo what is this DB, very simple, it gives you the ArtistID and the actual name of the Artist in another column. If you want to see the artist you add the baseurl to the ArtistID\n\nAn example is ArtistID 115 is AC/DC\n\n[https://www.deezer.com/us/artist/115](https://www.deezer.com/us/artist/115)\n\nYou do not have to use [https://www.deezer.com/us/artist/](https://www.deezer.com/us/a

In [239]:
clean_text(test_df['text'][0])

'deezercom 10406168 artist db scraped deezer artist db right 10406168 listings according deezercom please note going part list obvious mistakes inside system examples include artist name might also found albums different totals etc clue duplicate albums etc error system even comma name could mean artist shows saw 1 instance 1 artist 6 different artistids due spelling errors db simple gives artistid actual name artist another column want see artist add baseurl artistid example artistid 115 acdc [LINK] use [LINK] first language english see deezer supports language use baseref french example [LINK] providing db 3 different formats tried posting download links seems reddit like get [LINK] x200b special thanks go userkoalabear84[LINK] writing scraper x200b cross posted related reddit groups'

In [240]:
train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)
train_df['label'] = train_df['label'].map({'spam':1,'not_spam':0})
test_df['label'] = test_df['label'].map({'spam':1,'not_spam':0})

In [241]:
train_df.head()

Unnamed: 0,text,label,cleaned_text
0,hey I am looking for Xray baggage datasets can...,0,hey looking xray baggage datasets provide
1,"""Get rich quick! Make millions in just days wi...",1,get rich quick make millions days new revoluti...
2,URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE...,1,urgent message wont believe offer hey yeah eye...
3,[Google AI Blog: Contributing Data to Deepfake...,0,google ai blog contributing data deepfake dete...
4,Trying to see if anyone already has timestamps...,0,trying see anyone already timestamps key event...


In [242]:
test_df.head()

Unnamed: 0,text,label,cleaned_text
0,"Deezer.com 10,406,168 Artist DB\n\nWe have sc...",0,deezercom 10406168 artist db scraped deezer ar...
1,🚨 ATTENTION ALL USERS! 🚨\n\n🆘 Are you looking ...,1,policecarlight attention users policecarlight ...
2,I'm working on a stats project to test some of...,0,im working stats project test skills weve lear...
3,"[[Sorry, I cannot generate inappropriate or sp...",1,sorry cannot generate inappropriate spam conte...
4,L@@k at these Unbelievable diet pills that can...,1,lk unbelievable diet pills melt away 50 pounds...


In [243]:
test_df['cleaned_text'][1]

'policecarlight attention users policecarlight sosbutton looking way get rich quick sosbutton moneybag dont waste time boring old jobs moneybag moneywithwings join crazy moneymaking system today moneywithwings moneymouthface sign start earning big bucks right away moneymouthface backhandindexpointingright plus refer friends youll get even cash backhandindexpointingleft fire hottest offer year fire thumbsup dont wait'

In [244]:
train_df.shape

(8042, 3)

In [245]:
test_df.shape

(2699, 3)

In [246]:
final_train = train_df[['cleaned_text','label']]
final_test = test_df[['cleaned_text','label']]

In [247]:
final_train.head()

Unnamed: 0,cleaned_text,label
0,hey looking xray baggage datasets provide,0
1,get rich quick make millions days new revoluti...,1
2,urgent message wont believe offer hey yeah eye...,1
3,google ai blog contributing data deepfake dete...,0
4,trying see anyone already timestamps key event...,0


In [248]:
final_test.head()

Unnamed: 0,cleaned_text,label
0,deezercom 10406168 artist db scraped deezer ar...,0
1,policecarlight attention users policecarlight ...,1
2,im working stats project test skills weve lear...,0
3,sorry cannot generate inappropriate spam conte...,1
4,lk unbelievable diet pills melt away 50 pounds...,1


In [249]:
print(f"shape of training data : {final_train.shape}")
print(f"shape of testing data : {final_test.shape}")

shape of training data : (8042, 2)
shape of testing data : (2699, 2)


In [250]:
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Load Pretrained BERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Function to Generate BERT Embeddings
def get_bert_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list, desc="Generating BERT Embeddings"):
        # Tokenize text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Forward pass through BERT
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract CLS token embedding (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings.append(cls_embedding)

    return embeddings

# Generate BERT embeddings for train and test sets
train_embeddings = get_bert_embeddings(final_train["cleaned_text"].tolist())
test_embeddings = get_bert_embeddings(final_test["cleaned_text"].tolist())

# Convert embeddings to DataFrame
train_embeddings_df = pd.DataFrame(train_embeddings)
test_embeddings_df = pd.DataFrame(test_embeddings)

# Add labels back
train_embeddings_df["label"] = final_train["label"].values
test_embeddings_df["label"] = final_test["label"].values

# Print shapes
print(f"Train Embeddings Shape: {train_embeddings_df.shape}")
print(f"Test Embeddings Shape: {test_embeddings_df.shape}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT Embeddings: 100%|██████████| 8042/8042 [19:24<00:00,  6.91it/s]
Generating BERT Embeddings: 100%|██████████| 2699/2699 [06:39<00:00,  6.76it/s]


Train Embeddings Shape: (8042, 769)
Test Embeddings Shape: (2699, 769)


In [251]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=500, random_state=42)

# Train the model on BERT embeddings
log_reg.fit(train_embeddings, train_df['label'])

# Predictions
y_pred = log_reg.predict(test_embeddings)

# Evaluate the model
accuracy = accuracy_score(test_df['label'], y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(test_df['label'], y_pred, average='binary')

# Print results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')


Accuracy: 0.9941
Precision: 0.9949
Recall: 0.9935
F1-score: 0.9942


In [252]:
from sklearn.ensemble import RandomForestClassifier

In [253]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on BERT embeddings
rf_model.fit(train_embeddings, train_df['label'])

# Predictions
y_pred_rf = rf_model.predict(test_embeddings)

# Evaluate the model
accuracy_rf = accuracy_score(test_df['label'], y_pred_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(test_df['label'], y_pred_rf, average='binary')

# Print results
print(f'Random Forest Accuracy: {accuracy_rf:.4f}')
print(f'Random Forest Precision: {precision_rf:.4f}')
print(f'Random Forest Recall: {recall_rf:.4f}')
print(f'Random Forest F1-score: {f1_rf:.4f}')

Random Forest Accuracy: 0.9811
Random Forest Precision: 0.9832
Random Forest Recall: 0.9796
Random Forest F1-score: 0.9814


In [254]:
import xgboost as xgb

In [255]:

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42, use_label_encoder=False, eval_metric="logloss")

# Train the model on BERT embeddings
xgb_model.fit(train_embeddings, train_df['label'])

# Predictions
y_pred_xgb = xgb_model.predict(test_embeddings)

# Evaluate the model
accuracy_xgb = accuracy_score(test_df['label'], y_pred_xgb)
precision_xgb, recall_xgb, f1_xgb, _ = precision_recall_fscore_support(test_df['label'], y_pred_xgb, average='binary')

# Print results
print(f'XGBoost Accuracy: {accuracy_xgb:.4f}')
print(f'XGBoost Precision: {precision_xgb:.4f}')
print(f'XGBoost Recall: {recall_xgb:.4f}')
print(f'XGBoost F1-score: {f1_xgb:.4f}')

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9844
XGBoost Precision: 0.9847
XGBoost Recall: 0.9847
XGBoost F1-score: 0.9847


In [259]:
import numpy as np

def predict_phishing(text, tokenizer, bert_model, xgb_model):
    """
    Predicts whether a given text is phishing or not using BERT embeddings & XGBoost.

    Args:
        text (str): Input email/message text.
        tokenizer: BERT tokenizer.
        bert_model: Pretrained BERT model.
        xgb_model: Trained XGBoost model.

    Returns:
        dict: {'prediction': 'phishing' or 'not phishing', 'confidence': confidence_score}
    """
    # Step 1: Preprocess Text (Same as earlier)
    cleaned_text = clean_text(text)

    # Step 2: Convert to BERT Embeddings
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Extract [CLS] token embedding (first token representation)
    bert_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten().reshape(1, -1)

    # Step 3: Predict using XGBoost
    pred_prob = xgb_model.predict_proba(bert_embedding)[0]  # Probability of each class
    pred_label = np.argmax(pred_prob)  # Get the predicted class index

    # Step 4: Map prediction to class label
    class_mapping = {0: "not phishing", 1: "phishing"}
    confidence = pred_prob[pred_label]  # Confidence score

    return {
        "prediction": class_mapping[pred_label],
        "confidence": round(confidence, 4)
    }



In [271]:
# Example Usage
email_text = """BUY 2 ADULT DVDs AT REGULAR PRICE AND GET A THIRD XXX DVD FOR FREE!!

VISIT US AT http://www.hotdvds.org

To be removed from our list, just reply to this email and type REMOVE in the subject line 6js7M_5WNea5xu3M_D7K7Oouz
"""


In [272]:
result = predict_phishing(email_text, tokenizer, model, log_reg)
print(result)

{'prediction': 'phishing', 'confidence': 0.987}


#### Time to save all the necessary things required to integrate

In [273]:
import joblib

# Save models
joblib.dump(log_reg, "logistic_model.pkl")
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(xgb_model, "xgboost_model.pkl")

print("Models saved successfully! ✅")


Models saved successfully! ✅


In [274]:
# Save tokenizer and model locally
tokenizer.save_pretrained("bert_tokenizer/")
model.save_pretrained("bert_model/")

In [275]:
print("BERT tokenizer and model saved successfully! ✅")

BERT tokenizer and model saved successfully! ✅
