In [1]:
import re
import pandas as pd

In [2]:
def preprocess(text, 
            phone_token = ' <PHONE> ',
            email_token = ' <EMAIL> ',
            url_token = ' <URL> ',
            num_token = ' <NUM> ',):
    
    #Capitalization removal
    text = text.lower()

    #PHONE NUMBER token substitution
    text = re.sub(r'(\(\d{2}\))\s?\d{8,}|\d{10,}', 
                  phone_token, text, flags=re.MULTILINE)
    #EMAIL token substitution
    text = re.sub("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+", 
                  email_token, text, flags=re.MULTILINE)
    #URL token substitution
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 
                  url_token, text, flags=re.MULTILINE)
    #NUMERIC token substitution
    text = re.sub(r'[0-9]+', 
                  num_token, text, flags=re.MULTILINE)
    #Special characters removal
    text = re.sub(r'([^\w\s<>])|(_)', 
                  " ", text, flags=re.MULTILINE)
    #Multiple space removal
    text = re.sub(r'\s+', 
                  " ", text, flags=re.MULTILINE)

    return text.strip()

In [3]:
df = pd.read_csv('../Dataset_5971.csv')

In [4]:
df.LABEL = (df.LABEL.str.lower()=='smishing')*1
df = df[['TEXT', 'LABEL']]
df.TEXT = df.TEXT.apply(preprocess)
df.columns = ['text', 'label_ids']

In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [6]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.label_ids, random_state=0)
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [7]:
dataset = DatasetDict({'train':train_dataset, 'eval':test_dataset})

In [8]:
#here you choose betwwen cased and uncased

ckpt = 'bert-base-uncased'
# ckpt = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    rec = recall_score(labels, preds)
    prec = precision_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
    }

In [10]:
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=32, return_special_tokens_mask=True)

In [11]:
dataset = dataset.map(tokenize)
dataset = dataset.remove_columns(['text'])
dataset.set_format('torch')

  0%|          | 0/4776 [00:00<?, ?ex/s]

  0%|          | 0/1195 [00:00<?, ?ex/s]

In [12]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(dataset['train'])//batch_size
model_name = "smishing_model"

training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  save_strategy='no',
                                  log_level='error')

In [13]:
trainer = Trainer(model=model, 
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset['train'],
                  eval_dataset=dataset['eval'],
                  tokenizer=tokenizer)

In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2097,0.107311,0.957322,0.77305,0.851562,0.810409
2,0.0728,0.102647,0.962343,0.807407,0.851562,0.828897
3,0.0488,0.093429,0.967364,0.850394,0.84375,0.847059
4,0.04,0.124803,0.961506,0.780822,0.890625,0.832117
5,0.0302,0.118234,0.96318,0.813433,0.851562,0.832061
6,0.0237,0.126255,0.961506,0.810606,0.835938,0.823077
7,0.0238,0.139647,0.96318,0.83871,0.8125,0.825397
8,0.0179,0.157937,0.961506,0.797101,0.859375,0.827068
9,0.0122,0.174385,0.961506,0.80597,0.84375,0.824427
10,0.015,0.173175,0.961506,0.801471,0.851562,0.825758


TrainOutput(global_step=750, training_loss=0.048866226891676585, metrics={'train_runtime': 301.0372, 'train_samples_per_second': 158.651, 'train_steps_per_second': 2.491, 'total_flos': 785386500249600.0, 'train_loss': 0.048866226891676585, 'epoch': 10.0})