In [None]:
!pip install scapy

In [None]:
import sagemaker
from sagemaker import get_execution_role
import json,boto3
from scapy.all import *
import csv,binascii

# S3 location shared by the SageMaker session and the uploads further down.
bucket = 'packets-bucket-web'
prefix = 'text-classification'

sess = sagemaker.Session(default_bucket=bucket)
role = get_execution_role()

# Capture files and their ground-truth tables, listed in matching order.
training_pcap_files = ['1.pcap', '2.pcap', '3.pcap']
training_ground_truth_files = ['UNSW-NB15_1.csv', 'UNSW-NB15_2.csv', 'UNSW-NB15_3.csv']
testing_pcap_files = ['4.pcap']
testing_ground_truth_files = ['UNSW-NB15_4.csv']

# Number of packets to take from each pcap file.
num_packets = 50000  # Adjust depending on whether training or testing data is being prepared.


In [None]:
!unzip training_data/csv_files.zip

In [None]:
def _build_label_index(rows, time_col, label_col):
    """Index ground-truth rows by (src ip, src port, dst ip, dst port, time).

    Every field is normalised to the form used for the packet-side lookup
    (stripped strings for addresses/ports, Decimal for the timestamp) so that
    CSV text compares equal to the values read from the capture. Rows that are
    too short or whose time/label cannot be parsed (e.g. a header line) are
    counted and skipped. When several rows share a key, the first one wins.

    Returns a ``(index, unusable_row_count)`` tuple where ``index`` maps the
    key to class 0 (label equal to zero) or class 1 (any other label).
    """
    from decimal import Decimal, InvalidOperation

    label_by_flow = {}
    unusable_rows = 0
    for row in rows:
        try:
            flow_key = (
                str(row[0]).strip(),
                str(row[1]).strip(),
                str(row[2]).strip(),
                str(row[3]).strip(),
                Decimal(str(row[time_col]).strip()),
            )
            packet_class = 0 if float(row[label_col]) == 0 else 1
        except (IndexError, TypeError, ValueError, InvalidOperation):
            unusable_rows += 1
            continue
        label_by_flow.setdefault(flow_key, packet_class)
    return label_by_flow, unusable_rows


def process_packets(packet_file, csv_file, time_col=49, label_col=61, max_packets=None):
    """Extract labelled, class-balanced TCP payloads from a pcap file.

    Each packet that has both an IP and a TCP layer and a non-empty TCP payload
    is looked up in the ground-truth table by source address, source port,
    destination address, destination port and timestamp. Matching packets are
    returned with the class of the ground-truth row; packets with no matching
    row are skipped.

    Args:
        packet_file: Path of the pcap file, passed to ``PcapReader``.
        csv_file: Ground-truth rows. Either an iterable of rows (a list of
            lists, or a ``csv.reader`` whose file is still open) or the path
            of a CSV file as a string.
        time_col: Index of the timestamp column in a ground-truth row.
        label_col: Index of the label column in a ground-truth row.
            NOTE(review): the defaults 49 and 61 are the indices the previous
            implementation used - confirm they match the CSV layout.
        max_packets: Upper bound on the number of returned packets; defaults
            to the module-level ``num_packets``. At most ``max_packets // 2``
            packets are kept per class.

    Returns:
        A list of ``(packet_class, payload_hex)`` tuples where ``packet_class``
        is 0 or 1 and ``payload_hex`` is the payload as a lowercase hex string.
    """
    from decimal import Decimal

    limit = num_packets if max_packets is None else max_packets
    per_class_limit = limit // 2

    if isinstance(csv_file, str):
        with open(csv_file, newline="") as handle:
            label_by_flow, unusable_rows = _build_label_index(
                csv.reader(handle), time_col, label_col
            )
    else:
        label_by_flow, unusable_rows = _build_label_index(csv_file, time_col, label_col)

    new_packets = []
    class_counts = [0, 0]
    skipped_packets = 0
    with PcapReader(packet_file) as packets:
        for packet in packets:
            # Both class quotas are full: nothing more can be added.
            if len(new_packets) >= 2 * per_class_limit:
                break
            if IP not in packet or TCP not in packet:
                skipped_packets += 1
                continue
            payload = bytes(packet[TCP].payload)
            if not payload:
                skipped_packets += 1
                continue
            # NOTE(review): the timestamp must equal the ground-truth value
            # exactly (numerically). If the CSV stores coarser times than the
            # capture, this comparison needs rounding - confirm with the data.
            # str() is used so a float or a Decimal subclass both convert cleanly.
            flow_key = (
                str(packet[IP].src),
                str(packet[TCP].sport),
                str(packet[IP].dst),
                str(packet[TCP].dport),
                Decimal(str(packet.time)),
            )
            packet_class = label_by_flow.get(flow_key)
            if packet_class is None or class_counts[packet_class] >= per_class_limit:
                skipped_packets += 1
                continue
            class_counts[packet_class] += 1
            # Decode so the CSV written later holds plain hex, not a bytes repr.
            new_packets.append((packet_class, binascii.hexlify(payload).decode("ascii")))

    # The first count is label 0, the second is every non-zero label.
    print("Number of class 1 packets: ", class_counts[0], "Number of class 2 packets: ", class_counts[1])
    print("Skipped packets: ", skipped_packets, "Unusable ground-truth rows: ", unusable_rows)
    return new_packets



In [None]:
# All training files are processed and merged into one combined list.
combined_list_new_packets = []
for packet_file, csv_file in zip(training_pcap_files, training_ground_truth_files):
    # Read the ground-truth CSV into a list while the file is still open:
    # csv.reader is lazy, so passing it on after the `with` block has closed
    # the file fails as soon as the first row is requested.
    with open(csv_file, newline='') as f:
        ground_truth_rows = list(csv.reader(f))
    data = process_packets(packet_file, ground_truth_rows)
    combined_list_new_packets.extend(data)

In [None]:
# Persist the (class, payload) tuples, one CSV row per packet.
with open('training.csv', 'w', newline='') as training_csv:
    csv.writer(training_csv).writerows(combined_list_new_packets)

In [None]:
# All testing files are processed and merged into one combined list.
combined_list_new_packets = []
for packet_file, csv_file in zip(testing_pcap_files, testing_ground_truth_files):
    # Read the ground-truth CSV into a list while the file is still open.
    with open(csv_file, newline='') as f:
        ground_truth_rows = list(csv.reader(f))
    # Use the rows just read and keep this call's own result; extending with a
    # leftover `data` from the training cell would copy training packets into
    # the testing set.
    data = process_packets(packet_file, ground_truth_rows)
    combined_list_new_packets.extend(data)

In [None]:
# Persist the (class, payload) tuples, one CSV row per packet.
with open('testing.csv', 'w', newline='') as testing_csv:
    csv.writer(testing_csv).writerows(combined_list_new_packets)

In [None]:
# Map each line number of classes.txt (as a string, starting at "0") to the
# whitespace-stripped label found on that line.
with open("training_data/classes.txt") as classes_file:
    index_to_label = {
        str(position): line.strip() for position, line in enumerate(classes_file)
    }
print(index_to_label)


In [None]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk

nltk.download("punkt")
# NOTE(review): recent NLTK releases load the tokenizer used by word_tokenize
# from the "punkt_tab" resource instead of "punkt"; fetching both keeps the
# notebook working across versions - confirm against the installed NLTK.
nltk.download("punkt_tab")

In [None]:
def transform_instance(row):
    """Turn one CSV row into a labelled token list.

    ``row[0]`` is looked up in ``index_to_label`` and prefixed with
    ``__label__``; ``row[1]`` is lower-cased and split with
    ``nltk.word_tokenize``. The label comes first, followed by the tokens.
    """
    prefixed_label = "__label__" + index_to_label[row[0]]
    tokens = nltk.word_tokenize(row[1].lower())
    return [prefixed_label, *tokens]

In [None]:
def preprocess(input_file, output_file, keep=1, seed=None):
    """Shuffle, subsample and tokenize a CSV file into a space-separated file.

    Args:
        input_file: Path of the comma-separated input file.
        output_file: Path of the file to write; each output line holds the
            elements returned by ``transform_instance`` joined by spaces.
        keep: Fraction of the shuffled rows to keep (1 keeps everything).
        seed: Optional seed for a reproducible shuffle. When None the
            module-level ``shuffle`` (global random state) is used.
    """
    # newline="" lets the csv module handle line endings itself.
    with open(input_file, "r", newline="") as csvinfile:
        # Blank lines come back as empty rows, which cannot be transformed.
        all_rows = [row for row in csv.reader(csvinfile, delimiter=",") if row]

    if seed is None:
        shuffle(all_rows)
    else:
        import random

        random.Random(seed).shuffle(all_rows)
    all_rows = all_rows[: int(keep * len(all_rows))]

    # The context manager releases the worker processes even if map() raises.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(transform_instance, all_rows)

    with open(output_file, "w", newline="") as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=" ", lineterminator="\n")
        csv_writer.writerows(transformed_rows)


In [None]:
%%time

# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of minutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset.
preprocess("training.csv", "packets.train", keep=0.2)

# Preparing the validation dataset (keep is left at its default, so all rows are used)
preprocess("testing.csv", "packets.validation")

In [None]:
%%time

train_channel = prefix + "/train"
validation_channel = prefix + "/validation"

sess.upload_data(path="packets.train", bucket=bucket, key_prefix=train_channel)
sess.upload_data(path="packets.validation", bucket=bucket, key_prefix=validation_channel)

s3_train_data = "s3://{}/{}".format(bucket, train_channel)
s3_validation_data = "s3://{}/{}".format(bucket, validation_channel)

In [None]:
# S3 prefix handed to the estimator as its output path.
s3_output_location = f"s3://{bucket}/{prefix}/output"

In [None]:
# Region configured for the default boto3 session.
region_name = boto3.Session().region_name

In [None]:
# sagemaker.image_uris.retrieve is the SageMaker SDK v2 replacement for the
# deprecated sagemaker.amazon.amazon_estimator.get_image_uri helper; the rest
# of this notebook already uses v2-only arguments (instance_count, TrainingInput).
# NOTE(review): the "latest" version string is kept from the previous call -
# confirm it is accepted by the installed SDK.
container = sagemaker.image_uris.retrieve("blazingtext", region_name, version="latest")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

In [None]:
# Hyperparameters passed to the training container.
# NOTE(review): epochs is 1 while min_epochs is 5 - confirm this combination is intended.
bt_hyperparameters = dict(
    mode="supervised",
    epochs=1,
    min_count=2,
    learning_rate=0.05,
    vector_dim=10,
    early_stopping=True,
    patience=4,
    min_epochs=5,
    word_ngrams=2,
)

# Estimator for the container resolved earlier: one ml.c4.4xlarge instance,
# File input mode, artefacts written to s3_output_location.
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters=bt_hyperparameters,
)

In [None]:
# Both channels are plain-text S3 prefixes replicated in full to the instance.
channel_options = dict(
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
train_data = sagemaker.inputs.TrainingInput(s3_train_data, **channel_options)
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, **channel_options)
data_channels = dict(train=train_data, validation=validation_data)


# NOTE(review): empty credential placeholders. Do not paste real keys here -
# notebooks are routinely shared and committed; prefer the execution role or
# environment variables. How these names are consumed is not visible in this
# section - TODO confirm they are still needed.
access_key = ""
secret_access_key = ""