In [10]:
from tokenizers import Tokenizer, models, pre_tokenizers, processors, trainers, Regex
from tokenizers.normalizers import Lowercase, Strip
from tokenizers.pre_tokenizers import Whitespace, Split
import regex as re
import random

In [16]:
# custom pre-tokenizer for handling IPs, ports, and commas
def custom_pretokenizer(text):
    # make sure everything is in lowercase for consistent formatting
    text = text.strip().lower()

    # split the IP addresses by the dots and tokenize each chunk of the IP address
    text = re.sub(r'(\d+)\.(\d+)\.(\d+)\.(\d+)', r'\1 . \2 . \3 . \4', text)

    # then tokenize the numbers and words- split on commas
    tokens = re.split(r'[, ]+', text)

    return [(match.start(), match.end()) for match in re.finditer(r'\S+', text)]

In [17]:
# tokenizer initializaation
custom_tokenizer = Tokenizer(models.BPE())

In [18]:
# set up a normalizer
custom_tokenizer.normalizer = Strip()

In [19]:
# setting the pre-tokenizer
custom_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Split(Regex(r'[,.\s]+'), behavior="isolated")
])

In [20]:
# set up a trainer to learn the vocabulary
trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

In [21]:
# random data (randomly generate data of the form <source port>, <source IP>, <source application name>, <destination port>, <destination IP>, <packets transmitted> for proof of concept)
def generate_ip():
    return '.'.join(str(random.randint(0, 255)) for _ in range(4))

def generate_entry():
    source_port = random.randint(1024, 65535)
    source_ip = generate_ip()
    app_name = random.choice([
        "chrome", "firefox", "edge", "ssh", "mysql", "dnsmasq",
        "nginx-proxy", "apache-tomcat", "python-app", "dhcp-client",
        "dns-server", "ntp-service", "ftp-client", "smtp-mail",
        "telnet-session", "snmp-monitor", "remote-desktop",
        "teamviewer", "zoom-meeting", "slack-chat",
        "webex-call", "ping-tool", "traceroute-utility",
        "arp-mapper", "icmp-diagnostic"
    ]) # asked gpt to give me 20 random application names
    dest_port = random.choice([80, 443])
    dest_ip = generate_ip()
    packets_transmitted = random.randint(10, 100)
    return f"{source_port}, {source_ip}, {app_name}, {dest_port}, {dest_ip}, {packets_transmitted}"

data = []

for _ in range(100): # generate 100 random entries
    data.append(generate_entry())

print("\n".join(data))


20758, 249.91.33.235, remote-desktop, 443, 100.252.204.170, 40
62113, 54.44.143.180, icmp-diagnostic, 80, 246.229.158.239, 96
52128, 3.218.128.150, traceroute-utility, 80, 107.88.13.218, 96
59895, 136.45.3.192, zoom-meeting, 443, 12.83.107.182, 73
23528, 219.13.204.197, python-app, 443, 185.245.243.44, 79
35021, 243.252.152.183, firefox, 80, 238.195.67.60, 93
41919, 54.211.101.200, edge, 80, 132.90.15.7, 56
6384, 246.92.249.52, teamviewer, 443, 189.30.30.189, 11
45258, 23.89.48.157, telnet-session, 80, 190.63.172.192, 46
44396, 120.135.149.33, python-app, 80, 213.50.184.133, 20
62975, 170.97.155.149, webex-call, 80, 151.1.80.98, 79
29197, 58.38.172.121, ssh, 443, 133.112.54.63, 67
49578, 62.91.21.121, snmp-monitor, 80, 73.223.217.141, 86
18346, 127.210.194.59, dhcp-client, 443, 82.53.53.167, 74
11095, 2.89.159.16, ping-tool, 443, 218.73.77.42, 56
19895, 122.120.117.220, traceroute-utility, 443, 134.186.26.156, 41
41731, 76.16.243.47, dns-server, 80, 164.19.24.194, 64
11665, 108.162.177

In [22]:
# Train the tokenizer
custom_tokenizer.train_from_iterator(data, trainer)

In [23]:
encoded = custom_tokenizer.encode("443, 192.168.1.1, chrome, 80, 93.184.216.34, 50")
print(encoded.tokens)

['443', ', ', '192', '.', '168', '.', '1', '.', '1', ', ', 'chrome', ', ', '80', ', ', '93', '.', '184', '.', '216', '.', '34', ', ', '50']


In [24]:
! pip install huggingface_hub



In [25]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `IP-FLOW` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-auth

In [26]:
from tokenizers import Tokenizer

In [27]:
custom_tokenizer.save("ip_flow_tokenizer.json") # saving this locally

In [28]:
# now we'll push it to huggingface hub

In [29]:
from transformers import PreTrainedTokenizerFast
from huggingface_hub import HfApi

In [30]:
tokenizer = Tokenizer.from_file("ip_flow_tokenizer.json")

In [31]:
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]"
)

In [32]:
hf_tokenizer.push_to_hub("namita-ach/ip-flow-tokenizer1") # here we're splitting the IP addresses into chunks based off the period in between

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CommitInfo(commit_url='https://huggingface.co/namita-ach/ip-flow-tokenizer1/commit/d8a050becc857106f4804ce36729577f94ee8d91', commit_message='Upload tokenizer', commit_description='', oid='d8a050becc857106f4804ce36729577f94ee8d91', pr_url=None, repo_url=RepoUrl('https://huggingface.co/namita-ach/ip-flow-tokenizer1', endpoint='https://huggingface.co', repo_type='model', repo_id='namita-ach/ip-flow-tokenizer1'), pr_revision=None, pr_num=None)

In [33]:
# now let's see if we got the tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("namita-ach/ip-flow-tokenizer")

OSError: namita-ach/ip-flow-tokenizer is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`