In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

In [None]:
def load_dataset(repeats=100):
    """Build a small synthetic phishing-vs-legitimate email dataset.

    Five hand-written phishing emails and five legitimate emails are each
    repeated ``repeats`` times. Phishing rows come first (label ``1``),
    followed by legitimate rows (label ``0``).

    Note that the corpus consists of only ten distinct sentences, so any
    random train/test split of the result contains the same texts on both
    sides; scores measured on it are only a smoke test.

    Args:
        repeats: How many times each block of five emails is duplicated.

    Returns:
        pandas.DataFrame with columns ``'text'`` and ``'label'`` holding
        ``10 * repeats`` rows.
    """
    phishing_emails = [
        "Your account has been compromised. Click here to reset your password immediately.",
        "You've won a free iPhone! Visit this link to claim your prize.",
        "Update your billing information to avoid service suspension.",
        "We noticed a login attempt from an unknown device. Verify your identity now.",
        "Urgent: Your account will be locked in 24 hours unless you respond."
    ]
    legitimate_emails = [
        "Your order has been shipped and will arrive in 3 days.",
        "Thank you for your payment. Your subscription is now active.",
        "Meeting scheduled for Monday at 10 AM.",
        "Reminder: Project deadline is next Friday.",
        "Welcome to our service! Let us know if you need any help."
    ]

    texts = phishing_emails * repeats + legitimate_emails * repeats
    # Labels follow the same block layout as ``texts``: all 1s, then all 0s.
    labels = ([1] * (len(phishing_emails) * repeats)
              + [0] * (len(legitimate_emails) * repeats))

    return pd.DataFrame({'text': texts, 'label': labels})

In [None]:
def convert_to_examples(df):
    """Turn every row of ``df`` into an ``InputExample``.

    The row's ``'text'`` value becomes ``text_a`` and its ``'label'`` value
    becomes ``label``; ``guid`` and ``text_b`` are left as ``None``.

    Args:
        df: DataFrame providing ``'text'`` and ``'label'`` columns.

    Returns:
        The result of the row-wise ``DataFrame.apply``: one ``InputExample``
        per row, keeping the index of ``df``.
    """
    def _row_to_example(row):
        # Single-sentence task, so there is no second text segment.
        return InputExample(
            guid=None,
            text_a=row['text'],
            text_b=None,
            label=row['label'],
        )

    return df.apply(_row_to_example, axis=1)

In [None]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=64):
    """Tokenize examples and expose them as an unbatched ``tf.data.Dataset``.

    Each example's ``text_a`` is encoded with ``tokenizer`` (special tokens
    added, truncated and padded to ``max_length``). Token type ids are not
    requested, so only ``input_ids`` and ``attention_mask`` are produced.

    Args:
        examples: Iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: Hugging Face tokenizer; it is invoked directly via
            ``tokenizer(...)``.
        max_length: Target sequence length used for truncation and padding.

    Returns:
        tf.data.Dataset yielding ``(features, label)`` pairs where
        ``features`` is a dict with int32 ``'input_ids'`` and
        ``'attention_mask'`` vectors and ``label`` is an int64 scalar.
    """
    features = []
    for example in examples:
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` entry point.
        encoded = tokenizer(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
        )
        features.append(
            InputFeatures(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                label=example.label,
            )
        )

    def gen():
        # Re-invoked by tf.data on every pass over the dataset; the
        # tokenized features are computed once above and reused.
        for f in features:
            yield ({
                'input_ids': f.input_ids,
                'attention_mask': f.attention_mask
            }, f.label)

    # ``output_signature`` supersedes the deprecated ``output_types`` /
    # ``output_shapes`` arguments; dtypes and shapes are kept as before.
    feature_spec = {
        'input_ids': tf.TensorSpec(shape=(None,), dtype=tf.int32),
        'attention_mask': tf.TensorSpec(shape=(None,), dtype=tf.int32),
    }
    label_spec = tf.TensorSpec(shape=(), dtype=tf.int64)

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(feature_spec, label_spec),
    )

In [None]:
def train_model(df, epochs=3, batch_size=16, learning_rate=2e-5,
                test_size=0.2, max_length=64, random_state=42):
    """Fine-tune ``bert-base-uncased`` as a binary classifier and report metrics.

    The rows of ``df`` are converted to examples, split into train and
    held-out parts, tokenized, and used to fine-tune a two-label BERT
    sequence classifier. The held-out part serves both as Keras validation
    data during training and as the evaluation set for the printed
    classification report.

    Args:
        df: DataFrame with ``'text'`` and ``'label'`` columns.
        epochs: Number of training epochs.
        batch_size: Batch size for both training and evaluation.
        learning_rate: Adam learning rate.
        test_size: Fraction of the examples held out for evaluation.
        max_length: Token length each text is truncated/padded to.
        random_state: Seed for the train/test split.

    Returns:
        The fine-tuned ``TFBertForSequenceClassification`` model.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    examples = convert_to_examples(df)
    train_examples, test_examples = train_test_split(
        examples, test_size=test_size, random_state=random_state
    )

    train_data = convert_examples_to_tf_dataset(
        train_examples, tokenizer, max_length=max_length
    )
    test_data = convert_examples_to_tf_dataset(
        test_examples, tokenizer, max_length=max_length
    )

    # Size the shuffle buffer to the whole training split so every epoch is
    # fully reshuffled rather than only mixed within a small window.
    shuffle_buffer = max(len(train_examples), 1)
    train_data = (
        train_data.shuffle(shuffle_buffer)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    # The evaluation data is never shuffled, so predictions come back in the
    # same order as ``test_examples``.
    test_data = test_data.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(train_data, epochs=epochs, validation_data=test_data)

    logits = model.predict(test_data).logits
    y_pred = tf.argmax(logits, axis=1).numpy()
    # Read the ground truth from the examples as plain ints instead of
    # re-iterating the dataset and collecting scalar tensors.
    y_true = [int(example.label) for example in test_examples]

    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    return model