# Importing libraries, loading and transforming data

In [1]:
#%pip install -q mlflow nlp
#%pip install -q nlp
#%pip install imblearn
#%pip install -q bs4




In [1]:
#imports
import pandas as pd
import gc #garbage collector
import numpy as np
import torch #pytorch m;achine learning library
from transformers import AutoTokenizer, pipeline, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification
from nlp import Dataset
from imblearn.over_sampling import RandomOverSampler # A method from the imbalanced-learn library to handle imbalanced datasets. It will generate new samples in the classes which are under-represented.
#import datasets
from tqdm import tqdm #prgress bar library, after calling can use progress bars when applying functions to a DataFrame or Series.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm
2024-06-11 16:02:32.766930: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# set parameters
train_fraction = 0.8 # fraction of a dataset used for training (the rest used for validation)
num_train_epochs = 3 # epochs to train
batch_size = 16 # batch size for training and validation
warmup_steps = 50
weight_decay = 0.02
BERT_MODEL = "distilbert-base-cased"
output_dir = "./phishing-email-detection"

In [2]:
df = pd.read_csv("/Users/filip/Desktop/Cognitive/Intro To NLP/Phishing Project/Phishing_Email.csv").drop(['Unnamed: 0'], axis=1).drop_duplicates()
#drop unnamed po to aby pozbyć się kolumny z indeksami z pliku csv i drop duplicates aby pozbyć się duplikatów wiadomka
print(df.shape)
df.sample(5)

(17539, 2)


Unnamed: 0,Email Text,Email Type
501,ios posting please post .,Safe Email
17216,davis claims ene willing to take haircut fyi ....,Safe Email
1078,Shouldn't a politician know not to tell the tr...,Safe Email
10197,help wanted . work from home . free details we...,Phishing Email
17667,"URL: http://www.newsisfree.com/click/-0,835789...",Safe Email


In [3]:
# create target
df['label'] = (df['Email Type']=="Phishing Email").astype(int) # zamiana wartości w kolumnie Email Type na wartości 0 i 1 
df['title'] = df['Email Text'] # zmiana nazwy kolumny Email Text na title 
df = df[['title', 'label']] # wybranie kolumn title i label jako nowy dataframe

print(df.shape)
df.sample(5)

(17539, 2)


Unnamed: 0,title,label
4540,"Tech Update Today\nVITAL SIGNS FOR JULY 15, 20...",0
6445,"synthetic compounds summary dear colleagues , ...",0
8730,> Martin Mentioned:\n> >I've used this a few t...,0
8313,"re : 8 . 137 , disc : low vowels in pie lingui...",0
5317,Cell Booster AntennaBOOST \n Your Recep...,1


In [4]:
# drop null records
df.dropna(inplace=True)

In [5]:
df['label'].mean() #0.373 - 37.3% of the emails are phishing emails - imbalanced dataset

0.37393089291823467

In [13]:
# Assuming df is your DataFrame and 'label' is the column with the types
type_0_count = len(df[df['label'] == 0])
type_1_count = len(df[df['label'] == 1])

# Calculate how many type 0 instances to remove
remove_n = type_0_count - type_1_count

# Randomly select type 0 instances to remove
drop_indices = df[df['label'] == 0].sample(remove_n, random_state=1).index

# Drop these rows from the DataFrame
df_balanced = df.drop(drop_indices)

df_balanced['label'].mean() #0.5 - balanced dataset
df_balanced.shape


(13116, 2)

In [15]:
balanced_dataset = Dataset.from_pandas(df_balanced)

In [17]:
del df
del df_balanced
del drop_indices
del type_0_count
del type_1_count

gc.collect()

688

In [19]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased", use_fast=True, low_cpu_mem_usage=False)
#tokenizer z BERTa , use fast uzywa RUSTa co przyspiesza tokenizacje, low_cpu_mem_usage - zmniejsza użycie pamięci 

In [21]:
def preprocess_function(examples):
    if "title" in examples and examples["title"] is not None:
        # If examples["title"] is a string, tokenize it
    
        if isinstance(examples["title"], str):
            return tokenizer(examples["title"], truncation=True)
        # If examples["title"] is a list, check that it only contains strings
        elif isinstance(examples["title"], list) and all(isinstance(item, str) for item in examples["title"]):
            return tokenizer(examples["title"], truncation=True)
    # Return an empty dictionary if examples["title"] is None or not a string or a list of strings
    return {}

balanced_dataset = balanced_dataset.map(preprocess_function, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 14/14 [00:14<00:00,  1.04s/it]


In [14]:
balanced_dataset

Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 21960)

In [22]:
# Dataset has a built in train test split method
balanced_dataset = balanced_dataset.train_test_split(test_size=1-0.7) #0.3 test size

100%|██████████| 10/10 [00:06<00:00,  1.48it/s]
100%|██████████| 4/4 [00:02<00:00,  1.50it/s]


In [23]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
tokenizer.decode(balanced_dataset['train'][0]['input_ids'])

"[CLS] sintassi comparata dell'accordo participiale ramanzo michele loporcaro sintassi comparata dell'accordo participiale romanzo rosenberg & sellier, torino, 1998 isbn 88 - 7011 - 719 - 7, 272 pages, lit. 65. 000 address of the publisher : via andrea doria 14, i - 10123 torino grosenb @ tin. it ( credit cards accepted ) this book offers a comprehensive account of romance past participle ( pp ) agreement in verbal periphrastics, a much - debated topic in romance linguistics as well as in theoretical syntax. its main bulk consists of a systematic inventory of agreement systems throughout romance ( chs. 3 - 4 ), which is unprecedented as to both empirical coverage and level of detail. beside the standard languages, dialects ( especially, but by no means exclusively, italo - romance ones ) are considered thoroughly, based in part on first - hand data. one of the basic points of this work is that no sensible account of pp agreement can be arrived at without in - depth consideration of dia

# Loading and training model

In [25]:
# LOADING AND TRAINING MODEEL

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

model.config.id2label = {0: 'SAVE EMAIL', 1: 'PHISHING EMAIL'}
#The message "You should probably TRAIN this model on a down-stream task to be able to use it
#for predictions and inference" is suggesting that you should fine-tune this model on your
#specific task (phishing email detection in this case) before using it for predictions or inference

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# number of trainable parameters
print(model.num_parameters(only_trainable=True)/1e6)

65.783042


In [27]:
from datasets import load_metric 

metric = load_metric("accuracy") # do oceny 

def compute_metrics(eval_pred): 
    logits, labels = eval_pred #rozdzielenie przewidywania (logits) i etykiet (labels) 
    predictions = np.argmax(logits, axis=-1)  #argmax zwraca indeksy największych wartości w kolumnie wzdłuż osi
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy") # do oceny
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [26]:
#%pip install accelerate -U

In [28]:
#%pip install accelerate -U
training_args = TrainingArguments(
    output_dir="./phishing-email-detection", #"./phishing-email-detection"
    logging_dir='./logs',
    num_train_epochs=3, # 3
    per_device_train_batch_size=1, #16 - number of samples to process at once per batch
    per_device_eval_batch_size=1, #16
    gradient_accumulation_steps=2,  # Added this line to fix the MPS error
    logging_strategy='steps', # log every step
    logging_first_step=True,
    load_best_model_at_end=True, #trainer will load the best model found during training at the end of training
    logging_steps=1,
    evaluation_strategy='epoch',# when evaluate model - after each epoch
    warmup_steps=50, #50
    weight_decay=0.02, #0.02
    eval_steps=1,
    save_strategy='epoch',
    report_to="mlflow",  # log to mlflow
)



# Define the trainer:
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=balanced_dataset['train'],
    eval_dataset=balanced_dataset['test'],
    data_collator=data_collator # A function to batch together samples of data.
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:

#device = torch.device('cpu')
#model = model.to(device)


In [30]:
#import os

#os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0'

In [29]:
# Get initial metrics

trainer.evaluate()

# zmiejszam batch sieze - max: 3,4 gb ramu, 2.47 + 926MB = 3.4GB 
# mam za slabego kompa :<

# 8gb version of mac is not enought to have mps accerlating and pytorch 2.0 or mps is only work in 13+ version
# wywala mi bład ze względy na za mało pamięci - jest pare opcji aby to naprawić
# 1. zmniejszenie batch size - mna batch size = 6 muli 40 minut
# 2. zmniejszenie ilości danych - i tak jest mało danych
# 3. zwiększenie ilości pamięci - ale to nie jest możliwe bo nie chce kupować nowego komputera
# 4. simpler model  - moze DistillBert? ale nie wiem czy accuracy bedzie wystarczająca 
 

# https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/9133 


 18%|█▊        | 720/3935 [11:09<1:09:23,  1.30s/it]

KeyboardInterrupt: 

In [34]:
trainer.train()



RuntimeError: MPS backend out of memory (MPS allocated: 2.44 GB, other allocations: 969.38 MB, max allowed: 3.40 GB). Tried to allocate 72.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [24]:
trainer.evaluate()

{'eval_loss': 0.03707250580191612,
 'eval_accuracy': 0.9936247723132969,
 'eval_runtime': 41.3827,
 'eval_samples_per_second': 106.131,
 'eval_steps_per_second': 6.645,
 'epoch': 3.0}

In [25]:
trainer.save_model()

In [26]:
tokenizer.save_vocabulary(save_directory=output_dir)

('./phishing-email-detection/vocab.txt',)

In [27]:
# make a classification pipeline
pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
sample_title = '''Why do employees leave companies — analysis of IBM employee data'''
pipe(sample_title, top_k=None)

[{'label': 'SAVE EMAIL', 'score': 0.9475719332695007},
 {'label': 'PHISHING EMAIL', 'score': 0.05242803692817688}]