# DATASET CONVERSION - PCAP -> XML -> NPY

## PCAP TO XML

1. Convert `.pcap` to `.xml` using [ISCX FlowMeter](https://github.com/ISCX/ISCXFlowMeter) or [CIC FlowMeter](https://github.com/ISCX/CICFlowMeter).
2. Preprocess the resulting `.xml` flow files into `.npy` arrays with `Data_Extraction_Revised.py`.

## XML TO NPY

In [2]:
import xml.etree.ElementTree as ET
import numpy as np
import os
import time

### ISCX-IDS-2012 Dataset

In [None]:
# Paths for the ISCX-IDS-2012 dataset.
# NOTE: the Malware_Capture_Facility_Project cell further down assigns the
# same three names, so whichever of the two cells was executed last decides
# which dataset the conversion loop processes.
# Folder that is listed with os.listdir() and read file by file.
import_directory = '/home/aryn/spectre-dev/dataset/ISCX-IDS-2012/PCAP-XML/'
# Filename prefixes: the conversion loop appends each source file name to
# both prefixes and saves one copy of the array under each.
export_codebase = '/home/aryn/spectre-dev/dataset/ISCX-IDS-2012/PCAP-NPY/destinationPayload_'
export_dataset = '/mnt/Data/SPECTRE/Dataset/ISCX-2012/PCAP-NPY/destinationPayload_'

### Malware_Capture_Facility_Project Dataset - CTU-Malware-Capture-Botnet-135-1 Stlrat DDoS

In [18]:
# Paths for the Malware_Capture_Facility_Project dataset.
# NOTE: these reuse the names assigned in the ISCX-IDS-2012 cell above, so
# running this cell redirects the conversion loop to this dataset.
# Folder that is listed with os.listdir() and read file by file.
import_directory = '/home/aryn/spectre-dev/dataset/Malware_Capture_Facility_Project/PCAP-XML/'
# Filename prefixes: the conversion loop appends each source file name to
# both prefixes and saves one copy of the array under each.
export_codebase = '/home/aryn/spectre-dev/dataset/Malware_Capture_Facility_Project/PCAP-NPY/destinationPayload_'
export_dataset = '/mnt/Data/SPECTRE/Dataset/Malware_Capture_Facility_Project/PCAP-NPY/destinationPayload_'

In [19]:
# os.listdir() returns entries in arbitrary order; sort them so the processing
# order (and therefore the printed log) is the same on every run.
files = sorted(os.listdir(import_directory))

In [None]:
# Convert every file listed in `files` into an array of (payload bytes, label)
# rows and save one copy under each export prefix. np.save appends ".npy" to
# the target name when it is missing.
errors = []  # names of the files that could not be opened or parsed

start_time = time.time()
data_array = np.empty((0, 2))
counter = 0  # number of payload rows produced across all files
actual = (50**2) * 3  # length every payload string is cut or tiled to (7500)
for file in files:
    print(file)
    try:
        tree = ET.parse(import_directory + file)
        print('Reading File ', file)
        root = tree.getroot()
    except Exception:
        # Best effort: remember the file and move on. `append` stores the
        # whole name; `errors += file` would add it character by character.
        # `Exception` (not a bare except) keeps Ctrl-C working.
        errors.append(file)
        continue

    # Rows are collected in a list and turned into an array once per file;
    # stacking with np.vstack for every row re-copies the array each time.
    rows = []
    for child in root:
        for next_child in child:
            if next_child.tag != 'destinationPayloadAsUTF':
                continue
            x = next_child.text
            if not x:
                # None means there is no payload; an empty string could never
                # be tiled up to `actual` characters.
                continue
            if len(x) < actual:
                # Repeat the payload until it covers `actual` characters
                # (ceil division gives the number of repetitions needed).
                x = x * -(-actual // len(x))
            x = x[:actual]
            # Label 0 for flows tagged 'Normal', 1 for everything else.
            label = 0 if child.find('Tag').text == 'Normal' else 1
            # np.fromstring's binary mode is deprecated; frombuffer on the
            # encoded text is the documented replacement. The copy makes the
            # array writable, as fromstring's result was.
            # NOTE(review): UTF-8 is assumed to match what np.fromstring did
            # for str input; the result is identical for ASCII payloads.
            # Non-ASCII characters encode to several bytes, so such rows can
            # be longer than `actual` bytes - confirm downstream handling.
            payload = np.frombuffer(x.encode('utf-8'), dtype=np.uint8).copy()
            rows.append((payload, label))
            counter += 1
    print('Time taken: {}'.format(time.time() - start_time))
    start_time = time.time()

    if rows:
        # Explicit object array: column 0 holds the uint8 payload array,
        # column 1 the integer label. Recent NumPy versions refuse to build
        # such a ragged array implicitly via np.array([payload, label]).
        data_array = np.empty((len(rows), 2), dtype=object)
        for row_idx, (payload, label) in enumerate(rows):
            data_array[row_idx, 0] = payload
            data_array[row_idx, 1] = label
    else:
        # No payloads in this file: keep the original empty (0, 2) result.
        data_array = np.empty((0, 2))

    np.save(export_codebase + file, data_array)
    # Dataset Directory
    np.save(export_dataset + file, data_array)

    data_array = np.empty((0, 2))

In [None]:
# Summary of the conversion run: files that could not be read, then the total
# number of payload rows that were produced.
for caption, value in (('Error in Opening Files = ', errors), ('Counter = ', counter)):
    print(caption, value)
print('DONE!')

---

# PCAP -> NPY

## Test 1 - Using Kitsune libraries

Reference: https://github.com/dongtsi/TrafficManipulator/blob/ebc82f78df8544dfc908dd60f77c5a26d3ae8624/extractor.py

In [None]:
import kitsune_py.FEKitsune as Fe
from kitsune_py.KitsuneTools import RunFE
import numpy as np
from scapy.all import * 

# Define input and output file paths
pcap_file = "/home/aryn/spectre-dev/dataset/Malware_Capture_Facility_Project/2015-09-10_winlinux.pcap"
# NOTE(review): the capture comes from Malware_Capture_Facility_Project but the
# features are written into the ISCX-IDS-2012 folder - confirm this is intended.
feat_file = "/home/aryn/spectre-dev/dataset/ISCX-IDS-2012/PCAP-NPY/destinationPayload_2015-09-10_winlinux.npy"

# Read the whole capture with scapy
scapyin = rdpcap(pcap_file)

# np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0,
# while np.inf is available in every NumPy version.
# NOTE(review): the second argument is presumably a packet limit - verify
# against the kitsune_py sources.
FE = Fe.Kitsune(scapyin, np.inf)
feature, _ = RunFE(FE)

# Show shape of feature vectors
display(np.asarray(feature).shape)

np.save(feat_file, feature)


## Test 2 - Using numpy and scapy

In [2]:
import numpy as np

In [None]:
from scapy.all import rdpcap

def pcap_to_npy(pcap_file, npy_file):
    """Save the length of every packet in a capture as a NumPy array.

    Args:
        pcap_file: Path of the capture; passed to scapy's ``rdpcap``.
        npy_file: Destination passed to ``np.save``.
    """
    # One entry per packet, in the order rdpcap returns them. Only
    # len(packet) is kept; nothing else is extracted from the packets.
    packet_lengths = [len(pkt) for pkt in rdpcap(pcap_file)]
    np.save(npy_file, np.array(packet_lengths))

# Capture to read and the .npy file that receives the packet lengths
pcap_file = '/mnt/Data/SPECTRE/Dataset/Malware_Capture_Facility_Project/2015-09-10_winlinux.pcap'
npy_file = '/home/aryn/spectre-dev/dataset/Malware_Capture_Facility_Project/PCAP-NPY/destinationPayload_2015-09-10_winlinux.npy'

# Run the conversion
pcap_to_npy(pcap_file=pcap_file, npy_file=npy_file)

In [3]:
import pandas as pd

pcap_npy = '/home/aryn/spectre-dev/dataset/Malware_Capture_Facility_Project/PCAP-NPY/destinationPayload_2015-09-10_winlinux.npy'

# Load the saved array and wrap it in a DataFrame for display
data = np.load(pcap_npy)
df = pd.DataFrame(data)

# Print a truncated preview: at most 5 rows and 5 columns are shown
preview_limits = ('display.max_rows', 5, 'display.max_columns', 5)
with pd.option_context(*preview_limits):
    print(df)


          0
0        72
1        72
...     ...
539687  950
539688  950

[539689 rows x 1 columns]


In [4]:
pcap_npy_preview = '/home/aryn/spectre-dev/dataset/ISCX-IDS-2012/PCAP-NPY/destinationPayload_TestbedMonJun14Flows.xml.npy'

# allow_pickle=True is passed because this file is loaded as pickled objects.
# NOTE(review): only load .npy files you produced yourself this way - pickled
# data can execute code when it is loaded.
data = np.load(pcap_npy_preview, allow_pickle=True)
df = pd.DataFrame(data)

# Print a truncated preview: at most 5 rows and 5 columns are shown
preview_limits = ('display.max_rows', 5, 'display.max_columns', 5)
with pd.option_context(*preview_limits):
    print(df)

                                                       0  1
0      [43, 79, 75, 32, 68, 111, 118, 101, 99, 111, 1...  0
1      [46, 46, 46, 46, 46, 46, 46, 46, 119, 119, 119...  0
...                                                  ... ..
74614  [71, 111, 32, 97, 119, 97, 121, 44, 32, 119, 1...  0
74615  [71, 111, 32, 97, 119, 97, 121, 44, 32, 119, 1...  0

[74616 rows x 2 columns]
