<a href="https://colab.research.google.com/github/AbhishekKabadi/AbhishekKabadi/blob/main/Project_spam_ham_detection_5830.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Loading the data and preprocessing with BERT**

In [63]:
import pandas as pd
from transformers import BertTokenizer

# Loading the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') # 'bert-base-uncased' couldn't handle the max_seq_length

# Loading the Data
from google.colab import files

uploaded = files.upload()



Saving F_phish_r2bin.csv to F_phish_r2bin (4).csv


In [67]:
# Exploring data

file_1 = 'F_phish_r2bin.csv'
df_phish = pd.read_csv(file_1)

# Displaying some rows
print("First few rows of the dataset:")
print(df_phish.head())

# Info
print("\nDataset information:")
print(df_phish.info())


First few rows of the dataset:
                                                text  label
0  Subject: enron methanol ; meter # : 988291\r\n...      0
1  Subject: hpl nom for january 9 , 2001\r\n( see...      0
2  Subject: neon retreat\r\nho ho ho , we ' re ar...      0
3  Subject: photoshop , windows , office . cheap ...      1
4  Subject: re : indian springs\r\nthis deal is t...      0

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5171 non-null   object
 1   label   5171 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 80.9+ KB
None


In [66]:
# Displaying stats
print("\nStats  of the dataset:")
print(df_phish.describe())

# Unique values
print("\nUnique values in the 'label' column:")
print(df_phish['label'].unique())


Stats  of the dataset:
             label
count  5171.000000
mean      0.289886
std       0.453753
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000

Unique values in the 'label' column:
[0 1]


In [68]:
def preprocess_text_for_bert(text):
    # Tokenizing the text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

    max_seq_length = 512 # to suffice model's requirements

    # To split the possible longer sequences
    tokens = tokens[:max_seq_length]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Padding
    padding_length = max_seq_length - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * padding_length

    return input_ids


In [None]:
#  preprocessing 'text' column

df_phish['input_ids'] = df_phish['text'].apply(preprocess_text_for_bert)
print("\nPreprocessed input ids for BERT:")
print(df_phish['input_ids'].head())

## Step 2:- Bulilding the model

In [70]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Choosing a Neural Network Architecture
model_name = 'bert-large-uncased'
num_labels = len(df_phish['label'].unique())

# Loading the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

train_data, val_data = train_test_split(df_phish, test_size=0.22, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.data.iloc[idx]['input_ids'], dtype=torch.long),
            'label': torch.tensor(self.data.iloc[idx]['label'], dtype=torch.long)
        }

train_dataset = CustomDataset(train_data)
val_dataset = CustomDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Step 3 - Fine_tuning and training the model**

In [71]:
# Setting device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-5) #lr is kept significantly low

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {average_loss:.4f}")

Epoch 1/3: 100%|██████████| 1009/1009 [23:02<00:00,  1.37s/it]


Epoch 1/3, Average Training Loss: 0.2760


Epoch 2/3: 100%|██████████| 1009/1009 [23:04<00:00,  1.37s/it]


Epoch 2/3, Average Training Loss: 0.0428


Epoch 3/3: 100%|██████████| 1009/1009 [23:04<00:00,  1.37s/it]

Epoch 3/3, Average Training Loss: 0.1681





**Step 4 - Evaluating the model**

In [73]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids)
        logits = outputs.logits

        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation: 100%|██████████| 285/285 [02:13<00:00,  2.14it/s]

Validation Accuracy: 0.9429





**Deploying the model to test a different dataset.**

Will perform similar process to test a different dataset of "spam and ham" emails to test the previously developed model


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from tqdm import tqdm

uploaded = files.upload()

NameError: ignored

In [76]:
file_new = 'F_spam_ham.csv'
df_new_phish = pd.read_csv(file_new)

# Displaying some rows
print("First few rows of the dataset:")
print(df_new_phish.head())

# Info
print("\nDataset information:")
print(df_new_phish.info())

# Displaying stats
print("\nStats  of the dataset:")
print(df_phish.describe())



First few rows of the dataset:
                                                text  label
0  Subject: still on your own\r\ngoodbye\r\ngilt ...      1
1  Subject: organizational announcement\r\nwe are...      0
2  Subject: re : cp & l\r\nokay guys ,\r\nthis on...      0
3  Subject: this service is provided by licensed ...      1
4  Subject: are you a successful penny stock fan ...      1

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4212 entries, 0 to 4211
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4212 non-null   object
 1   label   4212 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 65.9+ KB
None

Stats  of the dataset:
             label
count  5171.000000
mean      0.289886
std       0.453753
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000


In [77]:

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

max_seq_length = 512

def preprocess_text_for_bert(text):
    if pd.isnull(text):  # Check for NaN values
        return [tokenizer.pad_token_id] * max_seq_length  # Return padding for NaN
    else:
        # Tokenizing the text
        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

        # To split the possible longer sequences
        tokens = tokens[:max_seq_length]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Padding
        padding_length = max_seq_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length

        return input_ids


In [78]:
# Preprocessing the text in the new dataset
df_new_phish['input_ids'] = df_new_phish['text'].apply(preprocess_text_for_bert)

# Creating a CustomDataset for the new dataset
new_dataset = CustomDataset(df_new_phish)
new_loader = DataLoader(new_dataset, batch_size=4, shuffle=False)

model.eval()

# Lists to store predictions and true labels
new_predictions = []


In [1]:
# Test loop
with torch.no_grad():
    for batch in tqdm(new_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)

        # Inference
        outputs = model(input_ids)
        logits = outputs.logits

        new_predictions.extend(logits.argmax(dim=1).cpu().numpy())

NameError: ignored

In [None]:
# Lists to store predictions
new_true_labels = []

# Extracting labels from df_new_phish
new_true_labels = df_new_phish['label'].tolist()  # Adjust column name if needed





In [None]:
new_accuracy = accuracy_score(new_true_labels, new_predictions)
print(f"Accuracy on the New Dataset: {new_accuracy:.4f}")