}# BERT Model - Prediction Notebook

Notebook này dùng để load model BERT đã được train và dự đoán với các emails mới.

## Chức năng:
1. Load model BERT đã được train
2. Load tokenizer
3. Dự đoán với mảng emails mới
4. Hiển thị kết quả dự đoán


## 1. Setup và Mount Google Drive


In [1]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google.colab'

## 2. Import Thư Viện


In [None]:
# IMPORTS

import os
import numpy as np
import torch

# Transformers
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification
)

print("✓ Tất cả thư viện đã được import thành công!")


## 3. Cấu Hình Đường Dẫn


In [None]:
# CONFIGURATION

BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Graduation-Project'
MODEL_PATH = f"{BASE_PATH}/outputs/models/BERT/bert_base_email_model"

print(f"✓ Working directory: {BASE_PATH}")
print(f"✓ Model path: {MODEL_PATH}")


## 4. Load Model và Tokenizer


In [None]:
# LOAD MODEL AND TOKENIZER

print("="*70)
print("ĐANG LOAD MODEL VÀ TOKENIZER")
print("="*70)

# Load tokenizer
print(f"Đang load tokenizer từ: {MODEL_PATH}...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
print("✓ Tokenizer loaded")

# Load model
print(f"Đang load model từ: {MODEL_PATH}...")
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
print("✓ Model loaded")

# Set model to evaluation mode
model.eval()
print("✓ Model set to evaluation mode")

# Determine device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"✓ Model moved to device: {device}")


## 5. Định Nghĩa Hàm Dự Đoán


In [None]:
def predict_emails(model, tokenizer, emails, max_length=512, device=None):
    """
    Dự đoán labels cho một mảng emails.

    Args:
        model: BERT model đã được load
        tokenizer: BERT tokenizer đã được load
        emails: List hoặc array chứa các email texts (strings)
        max_length: Độ dài tối đa của sequence (default: 512)
        device: Device để chạy model (None = tự động detect)

    Returns:
        tuple: (predicted_labels, probabilities)
            - predicted_labels: numpy array chứa labels (0 = Benign, 1 = Phishing)
            - probabilities: numpy array chứa probabilities cho class 1 (Phishing)
    """
    # Set model to evaluation mode
    model.eval()
    
    # Auto-detect device if not provided
    if device is None:
        device = next(model.parameters()).device
    
    # Convert emails to list if it's not already
    if not isinstance(emails, list):
        emails = list(emails)
    
    # Tokenize emails
    encodings = tokenizer(
        emails,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_tensors="pt",
    )
    
    # Move input tensors to the same device as the model
    encodings = {key: val.to(device) for key, val in encodings.items()}
    
    # Make predictions
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        
        # Get predicted labels (0 or 1)
        predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
        
        # Get probabilities (softmax)
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        
        # Extract probability of class 1 (Phishing)
        phishing_probs = probabilities[:, 1]
    
    return predicted_labels, phishing_probs
ss

def predict_single_email(model, tokenizer, email, max_length=512, device=None):
    """
    Dự đoán label cho một email đơn lẻ.

    Args:
        model: BERT model đã được load
        tokenizer: BERT tokenizer đã được load
        email: Email text (string)
        max_length: Độ dài tối đa của sequence (default: 512)
        device: Device để chạy model (None = tự động detect)

    Returns:
        tuple: (label, probability)
            - label: 0 (Benign) hoặc 1 (Phishing)
            - probability: Probability của class Phishing
    """
    labels, probs = predict_emails(model, tokenizer, [email], max_length, device)
    return labels[0], probs[0]

print("✓ Hàm dự đoán đã được định nghĩa")


## 6. Dự Đoán với Emails Mới


In [None]:
# TẠO MẢNG EMAILS MỚI ĐỂ DỰ ĐOÁN

new_emails = [
    "Internal HR Data Alignment Notice. This message is part of our quarterly internal alignment initiative focused on ensuring consistency across employee records following recent system integrations. As several tools were consolidated (time tracking, internal directory, notification preferences), we identified that a subset of profiles may show partial mismatches due to legacy formatting or delayed synchronization. This is not a corrective action and does not imply an issue with your account. However, to prevent potential delays in access provisioning or automated approvals, employees are asked to complete a brief self-review. The review confirms that basic profile attributes—such as display name, team affiliation, internal contact reference, and notification routing—are still accurate. The process is intentionally lightweight and should take no more than a few minutes. If no changes are required, you can simply confirm the current state. If updates are needed, you may annotate them directly so the system can reconcile differences without creating a support ticket. To proceed, please access the internal portal at [PORTAL], navigate to Profile Review, and complete the highlighted sections. Once reviewed, select [ACTION] to submit confirmation. If your profile has already been verified recently, the system will automatically mark it as complete and no further steps will be required. Please note that unconfirmed profiles may temporarily be placed into a pending state as part of automated compliance checks. This does not restrict general access but may delay role-specific permissions until reconciliation is complete. If you encounter any difficulties or believe this notice was sent in error, feel free to reply to this message so we can assist promptly. Thank you for helping us maintain accurate internal records.",
    "Logistics Intake Confirmation Hello, The office intake desk has received an item that appears to be associated with an internal shipment but lacks a complete recipient record. The label includes an abbreviated name and internal reference code, which matches multiple profiles. To prevent misrouting or automatic return, we are contacting potential recipients to confirm ownership. This is a standard procedure for unidentified deliveries and does not indicate any issue. If you are expecting equipment, documents, or materials, please confirm whether this item belongs to you. Confirmation requires only minimal information to route the package correctly. Visit [OFFICE_PORTAL], navigate to Deliveries, enter reference [ID], and select [ACTION]. If the item is not yours, you may select “Not my delivery,” and the system will continue matching. Items without confirmation are held for a limited time per policy before being returned to the sender. We apologize for the interruption and appreciate your quick response to help resolve the intake efficiently.",
    "IT Security Verification (Policy-Compliant Language) Hello, As part of our ongoing security posture review, the IT team is conducting a controlled verification of account metadata following recent environment updates. This review focuses on identifying benign anomalies such as new browser fingerprints, network changes, or device reconfigurations that can trigger precautionary flags. Your account was included in this review due to a non-critical change pattern detected last week. This does not indicate unauthorized activity. The verification process simply ensures the system correctly associates recent activity with a trusted user context. No information is collected via email. The only required step is to access the internal security console and confirm the listed device and session attributes. If the information matches your usage, the verification will complete immediately. Please visit [SECURITY_PORTAL], open Session Review, and follow the on-screen instructions. Once complete, select [ACTION] to finalize. If the review is not completed within the verification window, the system may temporarily limit certain sensitive operations (such as privilege elevation or environment exports) until confirmation is received. If you have already completed a similar check recently, the portal will reflect a completed status and no further action is required. For questions or access issues, reply to this message so we can assist. Thank you for helping maintain a secure and reliable environment.",
    "Shared Documentation Access Normalization Hello, We are in the process of consolidating shared project documentation into a standardized structure to reduce duplication and ensure everyone is referencing the most current materials. As part of this effort, legacy permissions are being reviewed and normalized. During this transition, some users may notice that previously accessible documents now appear with a “request access” status. This is expected behavior while roles are revalidated against the updated access model. To streamline this process and avoid manual requests, we are asking users to confirm their current role and access level. This allows the system to automatically assign appropriate permissions without over-provisioning, which is required for audit compliance. Please go to [PORTAL], select Access Review, verify your role within the relevant project space, and submit using [ACTION]. If your role does not require edit permissions, selecting view-only will still ensure uninterrupted access. If you believe your access has been incorrectly restricted or if you require temporary elevated access, respond to this email and we will review it manually. Our goal is to complete this normalization quickly so everyone can continue working without disruption.",
    "Billing Clarification and Cost Attribution Review Hello, We are reaching out as part of the routine end-of-cycle reconciliation for operational expenses. During aggregation, a small number of entries were flagged due to missing or ambiguous cost attribution tags. This is common when services are renewed, reclassified, or split across initiatives. Your name appears as the reference contact for one of the affected entries. This does not indicate an error; it simply means the system requires confirmation to finalize allocation and prevent the item from being deferred to the next reporting period. To simplify the process, we have prepared a summarized view that excludes sensitive payment details and focuses only on descriptive fields: service category, usage window, project reference, and cost center. If the information appears correct, you may approve it directly. If adjustments are required, selecting the adjustment option will allow you to leave a brief clarification note. Please log into [PORTAL], open Billing Review, locate reference [ID], and select [ACTION] to complete the confirmation. If no action is taken before the cycle closes, the item may temporarily remain pending, which could affect monthly summaries but will not impact approvals already granted. We understand end-of-period workloads can be heavy, so this message is intended to provide context and reduce follow-up. Let us know if you need assistance or if this reference should be reassigned.",
    "Security Training Simulation Announcement Hello PTITer, This week we are running a scheduled security awareness simulation designed to improve recognition of suspicious messages and reduce response time to potential threats. As part of the exercise, participants may receive simulated emails that resemble phishing attempts, including urgent language or verification prompts. Please be aware that these messages are part of a controlled training environment. They will not contain real links, credential requests, or data collection mechanisms. The purpose is to evaluate decision-making and reporting behavior, not to test compliance through deception. If you receive a simulation message, do not follow the instructions in the ",
    "Research Consent Email (Contains Sensitive Vocabulary) Hello, You are invited to participate in an internal research study examining how users interpret system notifications that reference identity verification, account confirmation, or credential-related terminology. These terms are used descriptively within hypothetical scenarios and do not require you to provide real credentials at any point.The study has been reviewed and approved through the appropriate ethics process. Participation is voluntary, and all responses are anonymized. You may stop at any time without penalty.During the study, you will review example messages and indicate how you would respond. Some messages intentionally include language often associated with phishing to evaluate perception and awareness. If you agree to participate, please visit [RESEARCH_PORTAL] and indicate consent. If you prefer not to participate, you may ignore this message.",
    "Password Reset Training Module Hello, As part of mandatory onboarding training, you will complete a module that demonstrates common password reset workflows. The module includes simulated login screens and reset prompts to illustrate how legitimate systems differ from malicious ones. Do not enter real credentials during training. All inputs use mock data and exist solely within the learning environment. To begin, visit [TRAINING_PORTAL] and open the Account Safety module. Completion will be automatically recorded.",
    "Finance Reminder with “Urgent” Terminology Hello, This is a required reminder regarding an outstanding receipt confirmation associated with a previously approved expense. System templates label this notice as “urgent” to ensure timely processing before the reporting window closes. No payment details or login information are requested. The confirmation simply verifies that services or goods were received as recorded. Please access [BILLING_PORTAL], locate the pending item, and submit confirmation. If you are not the correct contact, forwarding this message to the appropriate party will help avoid delays. We appreciate your attention to this administrative requirement.",
    "QA Report Discussing Phishing Data Hello team, Attached is this week’s QA summary covering dataset quality for email classification tasks. The report includes analysis of phishing indicators, credential language, and attack-style phrasing found in sample data. These references describe dataset content only and do not represent real incidents. The goal is to improve model robustness by understanding why certain benign emails are misclassified and vice versa. Please review and share any observations that could help refine preprocessing or labeling standards.",
]

print(f"✓ Đã tạo {len(new_emails)} emails để dự đoán")


In [None]:
# DỰ ĐOÁN VỚI MẢNG EMAILS

print("="*70)
print("ĐANG DỰ ĐOÁN VỚI MẢNG EMAILS")
print("="*70)

# Make predictions
predicted_labels, probabilities = predict_emails(model, tokenizer, new_emails)

print(f"\n✓ Đã dự đoán {len(predicted_labels)} emails")
print(f"✓ Labels: {predicted_labels}")
print(f"✓ Probabilities: {probabilities}")


In [None]:
# HIỂN THỊ KẾT QUẢ CHI TIẾT

print("="*70)
print("KẾT QUẢ DỰ ĐOÁN CHI TIẾT")
print("="*70)

for i, email in enumerate(new_emails, 1):
    label = 'Phishing' if predicted_labels[i-1] == 1 else 'Benign'
    prob = probabilities[i-1]
    
    print(f"\n[Email {i}]")
    print(f"  Text: {email[:80]}{'...' if len(email) > 80 else ''}")
    print(f"  Predicted Label: {label}")
    print(f"  Probability (Phishing): {prob:.4f}")
    print(f"  Confidence: {'High' if prob > 0.9 or prob < 0.1 else 'Medium' if prob > 0.7 or prob < 0.3 else 'Low'}")


## 7. Dự Đoán với Email Đơn Lẻ (Ví dụ)


In [None]:
# DỰ ĐOÁN VỚI MỘT EMAIL ĐƠN LẺ

single_email = "Your account has been suspended. Please click here to verify your identity: http://verify-account.com"

label, prob = predict_single_email(model, tokenizer, single_email)

print("="*70)
print("DỰ ĐOÁN EMAIL ĐƠN LẺ")
print("="*70)
print(f"Email: {single_email}")
print(f"Predicted Label: {'Phishing' if label == 1 else 'Benign'}")
print(f"Probability (Phishing): {prob:.4f}")


## 8. Tóm Tắt Kết Quả


In [None]:
# TÓM TẮT KẾT QUẢ

benign_count = np.sum(predicted_labels == 0)
phishing_count = np.sum(predicted_labels == 1)

print("="*70)
print("TÓM TẮT KẾT QUẢ")
print("="*70)
print(f"Tổng số emails: {len(new_emails)}")
print(f"Benign: {benign_count} ({benign_count/len(new_emails)*100:.1f}%)")
print(f"Phishing: {phishing_count} ({phishing_count/len(new_emails)*100:.1f}%)")
print(f"\nAverage probability (Phishing): {np.mean(probabilities):.4f}")
print(f"Min probability: {np.min(probabilities):.4f}")
print(f"Max probability: {np.max(probabilities):.4f}")
