<a href="https://colab.research.google.com/github/Chxrls/ADET-Laravel/blob/main/PhisNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install datasets




In [17]:
from datasets import load_dataset
dataset = load_dataset("zefang-liu/phishing-email-dataset")
print(dataset)
print(dataset['train'][0]['Email Type'])
print(dataset['train'].features['Email Type'])

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 18650
    })
})
Safe Email
Value(dtype='string', id=None)


In [18]:
# Splitting the dataset for test/train
from datasets import ClassLabel, DatasetDict
import pandas as pd

print("Casting 'Email Type' column to ClassLabel...")
# Get the unique string labels from the column
unique_labels = dataset['train'].unique('Email Type')
# Create the ClassLabel feature using these unique labels
class_label_feature = ClassLabel(names=unique_labels)
# Apply the casting to the 'train' split
dataset['train'] = dataset['train'].cast_column('Email Type', class_label_feature)

# Verify the change (optional but recommended)
print("\n--- Feature after casting 'Email Type' ---")
print(dataset['train'].features)
# Check the integer label for the first example
print("\nExample label after casting (now an integer):")
print(dataset['train'][0]['Email Type'])


# --- Now, re-run the splitting code ---
print("\nSplitting the dataset with stratification...")

# 1. Split the original 'train' data into training (80%) and test (20%)
# Stratify by 'Email Type' should work now because it's a ClassLabel
train_test_split = dataset['train'].train_test_split(test_size=0.2, stratify_by_column='Email Type')

test_dataset = train_test_split['test']         # This is 20% of the original data

# 2. Split the intermediate training set (80% of original) into final train and validation
# Validation = 10% of original => 12.5% of this intermediate training set (0.1 / 0.8 = 0.125)
train_val_split = train_test_split['train'].train_test_split(test_size=0.125, stratify_by_column='Email Type')

train_dataset = train_val_split['train']         # This is the final training set (70% of original)
validation_dataset = train_val_split['test']     # This is the final validation set (10% of original)

# 3. Combine them back into a single DatasetDict
final_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

# Print the final dataset structure to verify
print("\n--- Final Dataset Structure ---")
print(final_dataset)

# Optional: Check the class distribution in each split
print("\n--- Class Distribution Check ---")
for split_name, split_data in final_dataset.items():
    print(f"\nSplit: {split_name}")
    try:
        df = split_data.to_pandas()
        # Labels are now integers (0, 1, etc.)
        print("Label distribution (normalized):")
        print(df['Email Type'].value_counts(normalize=True))
        print("\nLabel distribution (raw counts):")
        print(df['Email Type'].value_counts())
    except ImportError:
        print("Pandas library not found. Skipping distribution check.")
    except Exception as e:
        print(f"Could not check distribution for {split_name}: {e}")

Casting 'Email Type' column to ClassLabel...

--- Feature after casting 'Email Type' ---
{'Unnamed: 0': Value(dtype='int64', id=None), 'Email Text': Value(dtype='string', id=None), 'Email Type': ClassLabel(names=['Safe Email', 'Phishing Email'], id=None)}

Example label after casting (now an integer):
0

Splitting the dataset with stratification...

--- Final Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 13055
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 1865
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 3730
    })
})

--- Class Distribution Check ---

Split: train
Label distribution (normalized):
Email Type
0    0.607124
1    0.392876
Name: proportion, dtype: float64

Label distribution (raw counts):
Email Type
0    7926
1    5129
Name: count, dtype: int64

Split: val

In [19]:
# Tokenizing - crucial step
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"

# 2. Load the tokenizer associated with the chosen checkpoint
print(f"Loading tokenizer for '{model_checkpoint}'...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print("Tokenizer loaded.")

# 3. Define a function to tokenize the email text
#    This function takes a batch of examples and applies the tokenizer to the 'Email Text' field.
def tokenize_function(examples):
    # padding='max_length' pads shorter sequences to the model's max length.
    # truncation=True cuts off sequences longer than the model's max length.
    # Convert 'Email Text' values to a list of strings if necessary
    email_texts = examples["Email Text"]
    # Ensure email_texts is always a list of strings
    email_texts = [str(text) if not isinstance(text, str) else text for text in email_texts]
    return tokenizer(email_texts, padding="max_length", truncation=True)

# 4. Apply the tokenization function to all splits in your dataset
#    We use batched=True for faster processing.
print("Applying tokenizer to the dataset (train, validation, test)...")
# This assumes your dataset dict from the previous step is named 'final_dataset'
tokenized_datasets = final_dataset.map(tokenize_function, batched=True)
print("Tokenization complete.")

# 5. Inspect results
print("\n--- Structure after tokenization ---")
print(tokenized_datasets)
print("\n--- Example of tokenized features (first train example) ---")
print(tokenized_datasets["train"][0])

Loading tokenizer for 'distilbert-base-uncased'...
Tokenizer loaded.
Applying tokenizer to the dataset (train, validation, test)...


Map:   0%|          | 0/13055 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/3730 [00:00<?, ? examples/s]

Tokenization complete.

--- Structure after tokenization ---
DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type', 'input_ids', 'attention_mask'],
        num_rows: 13055
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type', 'input_ids', 'attention_mask'],
        num_rows: 1865
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type', 'input_ids', 'attention_mask'],
        num_rows: 3730
    })
})

--- Example of tokenized features (first train example) ---
{'Unnamed: 0': 10646, 'Email Text': "re : integration meeting the meeting has been confirmed . they told me that they want to essentially gain an understanding of what our plan is to get up and running as quickly as possible which does i think differ slightly from integration . have i missed any critical group ? 9 am introduction 9 : 15 am infrastructure rub 9 : 45 am it - development hodges , georgeanne ; webb , jay ; m

In [None]:
# Training - original
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoTokenizer # Import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 1. Data Collation
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 2. Model Loading and Setup
num_labels = len(final_dataset['train'].unique('Email Type'))  # Get number of labels
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# 3. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, # 1 training pass, (you can have more passes but takes longer training)
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

#Rename the 'Email Type' column to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("Email Type", "labels")

# 4. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # Add metrics if needed
)

# 5. Training and Evaluation
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

# 6. Save the Model (Optional)
trainer.save_model("./my_phishing_model")

Flattening the indices:   0%|          | 0/13055 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmelindocharls[0m ([33mmelindocharls-university-of-science-and-technology-of-so[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.1472,0.070033


{'eval_loss': 0.07003255933523178, 'eval_runtime': 1174.4599, 'eval_samples_per_second': 1.588, 'eval_steps_per_second': 0.1, 'epoch': 1.0}


In [86]:
# Testing: 2

import torch # Assuming you're using PyTorch backend
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import numpy as np

# --- 1. Define Model Path and Load Model/Tokenizer ---
# This version ALWAYS loads the model and tokenizer from the specified path.
model_save_path = "./my_phishing_model" # Directory containing your saved model files

# --- Basic Checks ---
if not os.path.exists(model_save_path):
    raise FileNotFoundError(f"Model directory not found at {model_save_path}. Please ensure you uploaded the folder correctly.")
if not os.path.exists(os.path.join(model_save_path, 'config.json')):
     raise FileNotFoundError(f"config.json not found in {model_save_path}. Model files might be incomplete or in the wrong directory.")

print(f"Loaded Tokenizer: {model_save_path}")

try:
    # Load the model from the specified path
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
    print("Model Loaded: TRUE")
except Exception as e:
    print(f"Error Loading Model: {e}")
    raise # Halt the execution

try:
    # Load the tokenizer from the specified path
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    print("Tokenizer Loaded: TRUE")
except Exception as e:
    print(f"Error Loading Tokenizer: {e}")
    raise # Halt the execution


# --- 2. Determine Device (GPU or CPU) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device) # Move model to the determined device
loaded_model.eval() # Set the model to evaluation mode (important for consistent predictions)
print(f"Compute Device: {device}\n")

# --- 3. Define Sample Email Text ---
sample_email_text = """

Jocelyn Garrido <jocelyn.garrido@ustp.edu.ph>
Mar 23, 2025, 1:03 PM
to Hayah, bcc: me

Good day CS325 Students.

Please be advise that the schedule for the Research Methods monthly meeting is set on the following details:

Date: March 24, 2025 (Monday)
Time : 9:00 - 11:00 AM
Venue: ICT AVR (4th flr ICT Bldg)
Topic: Writing Research Proposal and Research Methods in CS

The change of time is due to the availability of the venue as well as our session speakers Dr. Junar Landicho and Ms. Cheryll Pagal.

Please be guided accordingly.
Thank you and stay safe always.

--
Best Regards,

Jocelyn L. Garrido
CITC- IT Department
"""
print("-" * 70)
print(f"{'EMAIL TO ANALYZE':^30}")
print(f"{sample_email_text [:len(sample_email_text) // 2]}...")
print("-" * 70)

# --- 4. Tokenize the Sample Email ---
# Use the loaded_tokenizer. Ensure tensors match your backend ('pt' for PyTorch)
inputs = loaded_tokenizer(sample_email_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Move tokenized inputs to the same device as the model
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# --- 5. Make Prediction ---
# Use torch.no_grad() to disable gradient calculations during inference
with torch.no_grad():
    outputs = loaded_model(**inputs)
    logits = outputs.logits

# --- 6. Process Output ---
# Apply softmax to get probabilities
probabilities = torch.softmax(logits, dim=-1).squeeze() # Use squeeze() for single input

# Get the predicted class index (the one with the highest probability)
predicted_class_id = torch.argmax(probabilities).item()

# Get the confidence score (probability) for the predicted class
predicted_class_prob = probabilities[predicted_class_id].item()

# --- 7. Map Prediction to Label String ---
# Use the model's configuration to get the human-readable label
phishing_index = 1
safe_index = 0

try:
    id2label = loaded_model.config.id2label
    predicted_label_str = id2label.get(predicted_class_id, f"Unknown Index {predicted_class_id}") # Use .get for safety
except AttributeError:
    print("Warning: Could not access id2label mapping in model config.")
    predicted_label_str = f"Predicted Class {predicted_class_id}" # Fallback

# --- 8. Display Result ---
print(f"{'PREDICTION RESULTS':^30}\n")
print(f"Predicted Index: {predicted_class_id}")  # Explicitly show the index
print(f"Predicted Label from Config: {predicted_label_str}") # Show the label from config (might be generic)
print(f"Confidence: {round(predicted_class_prob, 0)*100}% ({predicted_class_prob:.5f})")
print("-" * 70)

if predicted_class_id == phishing_index:
    print(f"Recommendation: This email seems suspicious (PHISHING)")
elif predicted_class_id == safe_index:
    print("Recommendation: This email seems safe")
else:
    print(f"Recommendation: Unknown prediction index ({predicted_class_id}). Cannot provide recommendation.")


Loaded Tokenizer: ./my_phishing_model
Model Loaded: TRUE
Tokenizer Loaded: TRUE
Compute Device: cpu

----------------------------------------------------------------------
       EMAIL TO ANALYZE       


Jocelyn Garrido <jocelyn.garrido@ustp.edu.ph>
Mar 23, 2025, 1:03 PM
to Hayah, bcc: me

Good day CS325 Students.

Please be advise that the schedule for the Research Methods monthly meeting is set on the following details:

Date: March 24, 2025 (Monday)
Time : 9:00 - 11:00 AM
Venue: ICT AVR (4th flr ICT Bldg)
T...
----------------------------------------------------------------------
      PREDICTION RESULTS      

Predicted Index: 0
Predicted Label from Config: LABEL_0
Confidence: 100.0% (0.99942)
----------------------------------------------------------------------
Recommendation: This email seems safe
