En esta primera celda aglutinamos todos las librerías y herramientas que utilicemos

In [3]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score


Cargo el dataset

In [4]:
ds = load_dataset("pirocheto/phishing-url")

Aquí, de los datos del dataset, convierto los que están dedidos al entrenamiento a pandas y los expongo en pantalla para así ver las etiquetas

In [5]:
ds['train'].to_pandas().head

<bound method NDFrame.head of                                                     url  length_url  \
0     https://www.todayshomeowner.com/how-to-make-ho...          82   
1     http://thapthan.ac.th/information/confirmation...          93   
2     http://app.dialoginsight.com/T/OFC4/L2S/3888/B...         121   
3                              https://www.bedslide.com          24   
4     https://tabs.ultimate-guitar.com/s/sex_pistols...          73   
...                                                 ...         ...   
7653  https://snip.ly/www.netflix.com-signIn-account...          63   
7654                       http://webchat.freenode.net/          28   
7655  http://mr-statucki.com/wp-content/uploads/2009...          67   
7656    https://www.computerhope.com/jargon/c/cdrom.htm          47   
7657                           https://bravonia.com.tw/          24   

      length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  \
0                  23   0        2    

Creo una variable, df, en la que meto el paso anterior, de forma que la pueda llamar más adelante 

In [6]:

df = ds['train'].to_pandas()

In [7]:
print(df.head())

                                                 url  length_url  \
0  https://www.todayshomeowner.com/how-to-make-ho...          82   
1  http://thapthan.ac.th/information/confirmation...          93   
2  http://app.dialoginsight.com/T/OFC4/L2S/3888/B...         121   
3                           https://www.bedslide.com          24   
4  https://tabs.ultimate-guitar.com/s/sex_pistols...          73   

   length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  ...  \
0               23   0        2           7      0      0       0      0  ...   
1               14   1        2           0      0      0       0      0  ...   
2               21   1        3           0      0      0       0      0  ...   
3               16   0        2           0      0      0       0      0  ...   
4               24   0        3           1      0      0       0      0  ...   

   domain_in_title  domain_with_copyright  whois_registered_domain  \
0                1                

En esta celda imprimo en pantalla cuántos datos hay de las etiquetas de la columna `status`, "legitimate"y "phishing", para comprobar la equidad de datos

In [8]:
print(df['status'].value_counts())

status
legitimate    3829
phishing      3829
Name: count, dtype: int64


Imprimo en pantalla la suma de los valores nulos de cada columna

In [9]:
print(df.isnull().sum())

url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64


In [10]:
df['label'] = df['status'].map({'legitimate': 0, 'phishing': 1})

In [11]:
df = df.drop(columns=['status'])

In [12]:
print(df['label'].value_counts())

label
0    3829
1    3829
Name: count, dtype: int64


In [13]:
df_bert = df[['url', 'label']].copy()
print(df_bert['label'].unique())
print(df_bert['label'].dtype)

[0 1]
int64


In [14]:
train_df, val_df = train_test_split(
    df_bert,
    test_size = 0.2,
    random_state = 42,
    stratify = df_bert['label']
)

    

In [15]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = ds['test']


In [16]:
modelo = "bert-base-uncased"

In [17]:
model_bert = AutoModelForSequenceClassification.from_pretrained(modelo, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
tokenizador = AutoTokenizer.from_pretrained("bert-base-uncased")

In [19]:
token_train = train_dataset.map(
    lambda x: tokenizador(
        x['url'],
        truncation = True,
        padding = 'max_length',
        max_length = 128
    ),
    batched = True
)


Map:   0%|          | 0/6126 [00:00<?, ? examples/s]

In [20]:
token_val = val_dataset.map(
    lambda x: tokenizador(
        x['url'],
        truncation = True,
        padding = 'max_length',
        max_length = 128
    ),
    batched = True
)

Map:   0%|          | 0/1532 [00:00<?, ? examples/s]

In [21]:
token_test = test_dataset.map(
    lambda x: tokenizador(
        x['url'],
        truncation = True,
        padding = 'max_length',
        max_length = 128
    ),
    batched = True
)

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizador)

In [23]:
training_args = TrainingArguments(
    output_dir = 'traininig/phishing-bert',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 3,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    logging_strategy = 'steps',
    logging_steps = 100,
    load_best_model_at_end = True,
)
    

In [24]:
entrenador = Trainer(model = model_bert,
                     tokenizer = tokenizador,
                     train_dataset = token_train,
                     eval_dataset = token_val,
                     args = training_args,
                     data_collator = data_collator
                    )
                     

In [25]:
entrenador.train



Epoch,Training Loss,Validation Loss
1,0.277,0.283573
2,0.1424,0.316227
3,0.0463,0.263521




TrainOutput(global_step=2298, training_loss=0.16470211749495994, metrics={'train_runtime': 1766.8765, 'train_samples_per_second': 10.401, 'train_steps_per_second': 1.301, 'total_flos': 1208863743851520.0, 'train_loss': 0.16470211749495994, 'epoch': 3.0})

In [42]:


etiquetas = preds.label_ids
predicciones = preds.predictions.argmax(axis=-1)
print('label_ids:', preds.label_ids)
print('predictions:', preds.predictions)

#print(predicciones.shape, etiquetas.shape)
print('Accuracy:', accuracy_score(etiquetas, predicciones))
print('F1 Score:', f1_score(etiquetas, predicciones))
      

label_ids: None
predictions: [[-3.721324   3.3459704]
 [ 4.340714  -4.181532 ]
 [ 4.3926773 -4.4100895]
 ...
 [ 4.101392  -3.6909788]
 [ 3.8135448 -3.574047 ]
 [-4.0030975  3.5844572]]


InvalidParameterError: The 'y_true' parameter of accuracy_score must be an array-like or a sparse matrix. Got None instead.

In [33]:
print(token_test.column_name

['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS', 'rat

In [40]:
print(len(token_test))

3772


In [44]:
print(token_test.column_names)


['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS', 'rat

In [45]:
print(token_test[:5])


{'url': ['https://clubedemilhagem.com/home.php', 'http://www.medicalnewstoday.com/articles/188939.php', 'https://en.wikipedia.org/wiki/NBC_Nightly_News', 'http://secure.web894.com/customer_center/customer-IDPP00C139/myaccount/identity/?cmd=_session=&amp;02df5c40bef38f0b3d11339b7beab5d8&amp;dispatch=ecb2f39f76aef328f62cfcea40da0211815e207f', 'https://en.wikipedia.org/wiki/Transaction_processing'], 'length_url': [36, 51, 46, 185, 52], 'length_hostname': [19, 24, 16, 17, 16], 'ip': [0, 0, 0, 1, 0], 'nb_dots': [2, 3, 2, 2, 2], 'nb_hyphens': [0, 0, 0, 1, 0], 'nb_at': [0, 0, 0, 0, 0], 'nb_qm': [0, 0, 0, 1, 0], 'nb_and': [0, 0, 0, 2, 0], 'nb_or': [0, 0, 0, 0, 0], 'nb_eq': [0, 0, 0, 3, 0], 'nb_underscore': [0, 0, 2, 2, 1], 'nb_tilde': [0, 0, 0, 0, 0], 'nb_percent': [0, 0, 0, 0, 0], 'nb_slash': [3, 4, 4, 7, 4], 'nb_star': [0, 0, 0, 0, 0], 'nb_colon': [1, 1, 1, 1, 1], 'nb_comma': [0, 0, 0, 0, 0], 'nb_semicolumn': [0, 0, 0, 2, 0], 'nb_dollar': [0, 0, 0, 0, 0], 'nb_space': [0, 0, 0, 0, 0], 'nb_www