In [None]:
# Installing the Required Libraries
! pip install transformers faiss-cpu nltk scikit-learn


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import faiss
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Data Collection

import kagglehub

# Download latest version
# `dataset_download` returns the local directory that holds the dataset files;
# it is kept in `path` and echoed below so the location is visible in the output.
path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1


In [None]:
import pandas as pd
from pathlib import Path


def _locate_csv(file_name):
    """Return the location of one dataset CSV.

    The directory returned by kagglehub (the global `path`, when it exists) is
    preferred so the notebook works on a fresh kernel without manually copying
    files. If the file is not found there, the original hard-coded `/content`
    location is used.
    """
    download_dir = globals().get('path')
    if download_dir is not None:
        candidate = Path(download_dir) / file_name
        if candidate.exists():
            return candidate
    return Path('/content') / file_name


# Load datasets
ling = pd.read_csv(_locate_csv('Ling.csv'))
nazario = pd.read_csv(_locate_csv('Nazario.csv'))
nigerian_fraud = pd.read_csv(_locate_csv('Nigerian_Fraud.csv'))
spam_assasin = pd.read_csv(_locate_csv('SpamAssasin.csv'))
phishing_email = pd.read_csv(_locate_csv('phishing_email.csv'))
enron = pd.read_csv(_locate_csv('Enron.csv'))
ceas_08 = pd.read_csv(_locate_csv('CEAS_08.csv'))

# Combine datasets into a single DataFrame
# The files do not all share the same columns, so the combined frame holds the
# union of the columns and NaN wherever a file lacks one.
data = pd.concat([ling, nazario, nigerian_fraud, spam_assasin, phishing_email, enron, ceas_08], ignore_index=True)

# Display the first few rows of the combined dataset
print(data.head())


                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                                body  label sender receiver  \
0  content - length : 3386 apple-iss research cen...    0.0    NaN      NaN   
1  lang classification grimes , joseph e . and ba...    0.0    NaN      NaN   
2  i am posting this inquiry for sergei atamas ( ...    0.0    NaN      NaN   
3  a colleague and i are researching the differin...    0.0    NaN      NaN   
4  earlier this morning i was on the phone with a...    0.0    NaN      NaN   

  date  urls text_combined  
0  NaN   NaN           NaN  
1  NaN   NaN           NaN  
2  NaN   NaN           NaN  
3  NaN   NaN           NaN  
4  NaN   NaN           NaN  


In [None]:
# Display the column names of the combined dataset
# (the frame was built with pd.concat, so this is the union of the columns of
# all source files).
print(data.columns)

Index(['subject', 'body', 'label', 'sender', 'receiver', 'date', 'urls',
       'text_combined'],
      dtype='object')


In [None]:
# Preprocess and Cleaning the Data

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # Ensure the missing resource is downloaded

# Example preprocessing function
def preprocess_text(text):
    """Normalise one email text into a space-separated token string.

    Args:
        text: Raw email text. Missing values (NaN floats, None) and any other
            non-string value are treated as "no text".

    Returns:
        The NLTK word tokens joined by single spaces, or '' when `text` is not
        a string.
    """
    # Guard on "not a string" rather than "is a float": NaN is the usual
    # missing marker, but None or numeric cells would otherwise crash the
    # tokenizer.
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text)
    return ' '.join(tokens)

# Apply preprocessing to the email body content
# Some rows have no `body` (NaN). Where the combined frame also carries a
# `text_combined` column, use it as the fallback so those rows are not reduced
# to empty strings while still keeping their label.
email_text = data['body']
if 'text_combined' in data.columns:
    email_text = email_text.fillna(data['text_combined'])
data['processed_text'] = email_text.apply(preprocess_text)

# Display the first few rows of the preprocessed data
print(data.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                                body  label sender receiver  \
0  content - length : 3386 apple-iss research cen...    0.0    NaN      NaN   
1  lang classification grimes , joseph e . and ba...    0.0    NaN      NaN   
2  i am posting this inquiry for sergei atamas ( ...    0.0    NaN      NaN   
3  a colleague and i are researching the differin...    0.0    NaN      NaN   
4  earlier this morning i was on the phone with a...    0.0    NaN      NaN   

  date  urls text_combined                                     processed_text  
0  NaN   NaN           NaN  content - length : 3386 apple-iss research cen...  
1  NaN   NaN           NaN  

In [None]:
# Using BERT tokenizer to convert the email content into tokens

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_email(text):
    """Encode one preprocessed email into BERT token ids (at most 512, special tokens included)."""
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )


# Tokenize the email content and truncate sequences longer than 512 tokens
data['tokens'] = data['processed_text'].apply(_encode_email)

# Display the first few rows of the tokenized data
print(data.head())


                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                                body  label sender receiver  \
0  content - length : 3386 apple-iss research cen...    0.0    NaN      NaN   
1  lang classification grimes , joseph e . and ba...    0.0    NaN      NaN   
2  i am posting this inquiry for sergei atamas ( ...    0.0    NaN      NaN   
3  a colleague and i are researching the differin...    0.0    NaN      NaN   
4  earlier this morning i was on the phone with a...    0.0    NaN      NaN   

  date  urls text_combined                                     processed_text  \
0  NaN   NaN           NaN  content - length : 3386 apple-iss research cen...   
1  NaN   NaN           NaN

In [None]:
# Pad and Truncate Sequences

MAX_LEN = 128


def _fit_to_length(token_ids, max_len=MAX_LEN, pad_id=0):
    """Return `token_ids` as a list of exactly `max_len` ids.

    Shorter sequences are right-padded with `pad_id`. Longer sequences are cut,
    but their final id is kept: the ids were produced with
    `add_special_tokens=True`, so a plain `[:max_len]` slice would drop the
    closing special token of every over-long email.

    NOTE(review): `pad_id=0` is assumed to be the tokenizer's padding id -
    confirm against `tokenizer.pad_token_id`.
    """
    if len(token_ids) > max_len:
        return token_ids[:max_len - 1] + token_ids[-1:]
    return token_ids + [pad_id] * (max_len - len(token_ids))


# Pad and truncate sequences
data['tokens'] = data['tokens'].apply(_fit_to_length)

# Display the first few rows of the tokenized data
print(data.head())



                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                                body  label sender receiver  \
0  content - length : 3386 apple-iss research cen...    0.0    NaN      NaN   
1  lang classification grimes , joseph e . and ba...    0.0    NaN      NaN   
2  i am posting this inquiry for sergei atamas ( ...    0.0    NaN      NaN   
3  a colleague and i are researching the differin...    0.0    NaN      NaN   
4  earlier this morning i was on the phone with a...    0.0    NaN      NaN   

  date  urls text_combined                                     processed_text  \
0  NaN   NaN           NaN  content - length : 3386 apple-iss research cen...   
1  NaN   NaN           NaN

In [None]:
# Train the Model

import pandas as pd
import nltk
from transformers import BertTokenizer

# Step 1: Load and Combine Datasets
from pathlib import Path


def _locate_csv(file_name):
    """Return the location of one dataset CSV.

    The directory returned by kagglehub (the global `path`, when it exists) is
    preferred; if the file is not found there, the original hard-coded
    `/content` location is used.
    """
    download_dir = globals().get('path')
    if download_dir is not None:
        candidate = Path(download_dir) / file_name
        if candidate.exists():
            return candidate
    return Path('/content') / file_name


ling = pd.read_csv(_locate_csv('Ling.csv'))
nazario = pd.read_csv(_locate_csv('Nazario.csv'))
nigerian_fraud = pd.read_csv(_locate_csv('Nigerian_Fraud.csv'))
spam_assasin = pd.read_csv(_locate_csv('SpamAssasin.csv'))
phishing_email = pd.read_csv(_locate_csv('phishing_email.csv'))
enron = pd.read_csv(_locate_csv('Enron.csv'))
ceas_08 = pd.read_csv(_locate_csv('CEAS_08.csv'))

# Combine datasets into a single DataFrame
# (union of the files' columns; NaN where a file lacks a column)
data = pd.concat([ling, nazario, nigerian_fraud, spam_assasin, phishing_email, enron, ceas_08], ignore_index=True)

# Step 2: Preprocess and Clean the Data
nltk.download('punkt')

# Example preprocessing function
def preprocess_text(text):
    """Normalise one email text into a space-separated token string.

    Args:
        text: Raw email text. Missing values (NaN floats, None) and any other
            non-string value are treated as "no text".

    Returns:
        The NLTK word tokens joined by single spaces, or '' when `text` is not
        a string.
    """
    # Guard on "not a string" rather than "is a float" so that None or numeric
    # cells do not crash the tokenizer.
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text)
    return ' '.join(tokens)

# Apply preprocessing to the email body content
# Rows without a `body` fall back to `text_combined` (when that column exists)
# so they are not turned into empty strings that still carry a label.
email_text = data['body']
if 'text_combined' in data.columns:
    email_text = email_text.fillna(data['text_combined'])
data['processed_text'] = email_text.apply(preprocess_text)

# Step 3: Feature Extraction
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128

def tokenize_and_pad(text):
    """Encode one text into fixed-length BERT inputs.

    Args:
        text: Preprocessed email text.

    Returns:
        A tuple `(input_ids, attention_mask)` of two plain Python lists, each
        `MAX_LEN` long: the text is truncated to `MAX_LEN` tokens (special
        tokens included) and padded up to `MAX_LEN`.
    """
    # Calling the tokenizer without `return_tensors` yields plain lists
    # directly, avoiding a tensor -> squeeze -> list round trip per row.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )
    return encoded['input_ids'], encoded['attention_mask']

# Tokenize the email content and create attention masks
# `.apply(tokenize_and_pad)` yields one (input_ids, attention_mask) pair per row;
# `zip(*...)` transposes those pairs into two row-aligned sequences, which are
# stored as the `tokens` and `attention_mask` columns.
data['tokens'], data['attention_mask'] = zip(*data['processed_text'].apply(tokenize_and_pad))

# Display the first few rows of the tokenized data with attention masks
print(data.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                                body  label sender receiver  \
0  content - length : 3386 apple-iss research cen...    0.0    NaN      NaN   
1  lang classification grimes , joseph e . and ba...    0.0    NaN      NaN   
2  i am posting this inquiry for sergei atamas ( ...    0.0    NaN      NaN   
3  a colleague and i are researching the differin...    0.0    NaN      NaN   
4  earlier this morning i was on the phone with a...    0.0    NaN      NaN   

  date  urls text_combined                                     processed_text  \
0  NaN   NaN           NaN  content - length : 3386 apple-iss research cen...   
1  NaN   NaN           NaN

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
# Tokens, attention masks and labels are split in ONE call so that all three
# are partitioned with the same shuffled indices by construction, instead of
# relying on two separate calls sharing a random_state.
(
    X_train_tokens, X_test_tokens,
    X_train_masks, X_test_masks,
    y_train, y_test,
) = train_test_split(
    data['tokens'].tolist(),
    data['attention_mask'].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Every sample must have its tokens, mask and label on the same side of the split
assert len(X_train_tokens) == len(X_train_masks) == len(y_train)
assert len(X_test_tokens) == len(X_test_masks) == len(y_test)

# Display the shapes of the splits to verify
print(f"X_train_tokens shape: {len(X_train_tokens)}")
print(f"X_test_tokens shape: {len(X_test_tokens)}")
print(f"y_train shape: {len(y_train)}")
print(f"y_test shape: {len(y_test)}")


X_train_tokens shape: 119499
X_test_tokens shape: 29875
y_train shape: 119499
y_test shape: 29875


In [None]:
# Loading the Pre-trained Model

from transformers import BertForSequenceClassification

# Loads the `bert-base-uncased` weights into a sequence-classification model.
# As the warning printed by this cell states, the classifier weights are not in
# that checkpoint and are newly initialised, so the model must be trained before
# its predictions mean anything.
# NOTE(review): no `num_labels` is passed, so the library default is used; the
# training cell assumes 2 classes - confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training the model

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# Ensure data is converted to tensors
train_inputs = torch.tensor(X_train_tokens)
train_masks = torch.tensor(X_train_masks)
train_labels = torch.tensor(y_train)

# Create DataLoader for training
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Convert the labels to one-hot encoded format
def one_hot(labels, num_classes):
    """Return `labels` as a one-hot integer tensor with `num_classes` columns.

    Kept for compatibility; the training loop below does not use it, because
    the model expects class indices for single-label classification.
    """
    return F.one_hot(labels.to(torch.long), num_classes)

# Define the number of classes
num_classes = 2

# Feed each batch on whatever device the model currently lives on
device = next(model.parameters()).device

# Train the model (simplified example)
model.train()
for epoch in range(3):  # Let's train for 3 epochs
    epoch_loss = 0.0
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        # Pass integer class indices (not one-hot floats): float labels with
        # more than one column make the library select a multi-label
        # (BCE-with-logits) loss, whereas this is a single-label task that is
        # later decoded with argmax, i.e. a cross-entropy problem.
        b_labels = b_labels.to(device=device, dtype=torch.long)
        optimizer.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean loss over the epoch; the last batch alone is too noisy
    print(f"Epoch {epoch+1}, Loss: {epoch_loss / max(len(train_dataloader), 1)}")



In [None]:
# Convert the data to tensors
# Each split (inputs, masks, labels) is turned into a tensor in one pass.
train_inputs, train_masks, train_labels = map(
    torch.tensor, (X_train_tokens, X_train_masks, y_train)
)
test_inputs, test_masks, test_labels = map(
    torch.tensor, (X_test_tokens, X_test_masks, y_test)
)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from torch.utils.data import DataLoader, TensorDataset

# Put the model in evaluation mode
model.eval()

# Evaluate the model
# The test set is scored in mini-batches: a single forward pass over every test
# sample at once needs far more memory than is normally available.
EVAL_BATCH_SIZE = 32
device = next(model.parameters()).device
eval_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=EVAL_BATCH_SIZE)

batch_predictions = []
with torch.no_grad():
    for b_input_ids, b_input_mask in eval_dataloader:
        outputs = model(b_input_ids.to(device), attention_mask=b_input_mask.to(device))
        batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())
predictions = torch.cat(batch_predictions)

# scikit-learn works on CPU arrays; labels are cast to integers so both
# arguments hold the same kind of class ids
y_true = test_labels.to(torch.long).cpu().numpy()
y_pred = predictions.numpy()

# Calculate performance metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# Print the evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


In [None]:
!pip install Flask flask-ngrok


In [None]:
from flask import Flask, request, jsonify
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

# Load the BERT model and tokenizer
# Serve the model that was fine-tuned earlier in this notebook. Re-loading
# 'bert-base-uncased' here would replace it with a model whose classification
# head is freshly (randomly) initialised, so its predictions would be
# meaningless. The base checkpoint is only loaded as a fallback when no `model`
# exists in the session.
if 'model' not in globals():
    import warnings
    warnings.warn(
        "No fine-tuned `model` found in this session; serving an untrained "
        "'bert-base-uncased' classifier."
    )
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.eval()  # inference only: disable dropout
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LEN = 128

@app.route('/detect', methods=['POST'])
def detect():
    """Classify the email text sent in a JSON POST body.

    Expects a JSON object with a string field `email_content`.

    Returns:
        200 with `{"prediction": "phishing"}` when the predicted class is 1,
        otherwise `{"prediction": "legitimate"}`; 400 with an `error` message
        when the body is not JSON or lacks a string `email_content`.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body,
    # so malformed requests get a clear 400 rather than a server error.
    payload = request.get_json(silent=True)
    email_content = payload.get('email_content') if isinstance(payload, dict) else None
    if not isinstance(email_content, str):
        return jsonify({'error': "JSON body must contain a string field 'email_content'"}), 400

    # NOTE(review): training text went through `preprocess_text` before
    # tokenization; the raw text is encoded here - confirm whether the same
    # preprocessing should be applied.
    encoded = tokenizer(
        email_content,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    # Inputs must live on the same device as the model's weights
    device = next(model.parameters()).device
    tokens_tensor = encoded['input_ids'].to(device)
    attention_mask_tensor = encoded['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask_tensor)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return jsonify({'prediction': 'phishing' if prediction == 1 else 'legitimate'})

if __name__ == '__main__':
    app.run()


In [None]:
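# Example request. This is a shell command, not Python: run it from a terminal
# while the Flask app is serving, replacing <ngrok-public-url> with the public
# URL reported by ngrok.
# curl -X POST -H "Content-Type: application/json" -d '{"email_content": "Your email content here"}' http://<ngrok-public-url>/detect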
