In [25]:
from pathlib import Path
import pyshark
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import argparse
import nest_asyncio
nest_asyncio.apply()

In [26]:
# sets up constants / arguments

# argparser = argparse.ArgumentParser()
# #''' Switch between the toy and REAL EXAMPLES
# argparser.add_argument("--lang1", help="Language 1 class",
#                        type=str, default="./data/englishpcaps")
# argparser.add_argument("--lang2", help="Language 2 class",
#                        type=str, default="./data/spanishpcaps")
# argparser.add_argument("--passes", help="Number of passes through train",
#                        type=int, default=5)
# argparser.add_argument("--batch", help="Number of items in each batch",
#                        type=int, default=1)
# argparser.add_argument("--learnrate", help="Learning rate for SGD",
#                        type=float, default=0.1)

# args = argparser.parse_args()
args = {"lang1":'./data/englishpcaps', "lang2":'./data/spanishpcaps', "passes":5, "batch":1, "learnrate":0.1}


vocab = [(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2),
 (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 1, 0, 0), (0, 1, 0, 1), (0, 1, 0, 2),
 (0, 1, 1, 0), (0, 1, 1, 1), (0, 1, 1, 2), (0, 1, 2, 0), (0, 1, 2, 1), (0, 1, 2, 2),
 (0, 2, 0, 0), (0, 2, 0, 1), (0, 2, 0, 2), (0, 2, 1, 0), (0, 2, 1, 1), (0, 2, 1, 2),
 (0, 2, 2, 0), (0, 2, 2, 1), (0, 2, 2, 2), (1, 0, 0, 0), (1, 0, 0, 1), (1, 0, 0, 2),
 (1, 0, 1, 0), (1, 0, 1, 1), (1, 0, 1, 2), (1, 0, 2, 0), (1, 0, 2, 1), (1, 0, 2, 2),
 (1, 1, 0, 0), (1, 1, 0, 1), (1, 1, 0, 2), (1, 1, 1, 0), (1, 1, 1, 1), (1, 1, 1, 2),
 (1, 1, 2, 0), (1, 1, 2, 1), (1, 1, 2, 2), (1, 2, 0, 0), (1, 2, 0, 1), (1, 2, 0, 2),
 (1, 2, 1, 0), (1, 2, 1, 1), (1, 2, 1, 2), (1, 2, 2, 0), (1, 2, 2, 1), (1, 2, 2, 2),
 (2, 0, 0, 0), (2, 0, 0, 1), (2, 0, 0, 2), (2, 0, 1, 0), (2, 0, 1, 1), (2, 0, 1, 2),
 (2, 0, 2, 0), (2, 0, 2, 1), (2, 0, 2, 2), (2, 1, 0, 0), (2, 1, 0, 1), (2, 1, 0, 2),
 (2, 1, 1, 0), (2, 1, 1, 1), (2, 1, 1, 2), (2, 1, 2, 0), (2, 1, 2, 1), (2, 1, 2, 2),
 (2, 2, 0, 0), (2, 2, 0, 1), (2, 2, 0, 2), (2, 2, 1, 0), (2, 2, 1, 1), (2, 2, 1, 2),
 (2, 2, 2, 0), (2, 2, 2, 1), (2, 2, 2, 2)]

In [27]:
# define some helper functions
def pcap_to_lengths(pcap_file) -> [int]:
    length = []
    capture = pyshark.FileCapture(pcap_file)
    for packet in capture:
        length.append(packet.length)

    return length

def lengths_to_tokens(lengths: [int]) -> [int]:
    # 0 = 2 smallest
    # 2 = largest length
    # 1 = everything else

    lengths_mod = lengths
    lengths_mod.remove(min(lengths_mod))

    min_length2 = min(lengths_mod) # second smallest
    min_length = min(lengths)      # smallest
    max_length = max(lengths)      # largest

    for i in range(len(lengths)):
        if lengths[i] == min_length or lengths[i] == min_length2:
            lengths[i] = 0
        elif lengths[i] == max_length:
            lengths[i] = 2
        else:
            lengths[i] = 1

    return lengths

def tokens_to_tuples(tokens: [int]) -> [(int, int, int, int)]:
    tuples = []
    i = 0
    while i + 3 < len(tokens):
        tuples.append((tokens[i], tokens[i+1], tokens[i+2], tokens[i+3]))
        i = i+1

    return tuples

def count_tuples(words: [(int, int, int, int)]) -> [int]:
    # return type is {int: int} where key is index of tuple in vocab and value is the count
    count = []
    for i in range(len(vocab)):
        count.append(words.count(vocab[i]))

    return count

In [30]:
# read in the pcap files, takes 10 seconds per file


import json
# if the data is englishpcaps, the lengths are already cached as json, and are read from a json file instead of actually parsing the pcaps again
if args["lang1"] == "./data/englishpcaps":
    with open('data/lang1_lengths.json', 'r') as f:
        lang1_lengths = json.load(f)        
else:
    directory1 = Path(args["lang1"])
    pcap_files1 = list(directory1.glob('*.pcapng'))
    lang1_lengths = [pcap_to_lengths(f) for f in pcap_files1]
    
# if the data is spanishpcaps, the lengths are already cached as json, and are read from a json file instead of actually parsing the pcaps again
if args["lang2"] == "./data/spanishpcaps":
    with open('data/lang2_lengths.json', 'r') as f:
        lang2_lengths = json.load(f)
else:
    directory2 = Path(args["lang2"])
    pcap_files2 = list(directory2.glob('*.pcapng'))
    lang2_lengths = [pcap_to_lengths(f) for f in pcap_files2]

In [31]:
#write lengths to json, helpful for caching a large dataset
# import json
# 
# # Write the matrix to a JSON file
# with open('data/lang1_lengths.json', 'w') as f:
#     json.dump(lang1_lengths, f)
# with open('data/lang2_lengths.json', 'w') as f:
#     json.dump(lang2_lengths, f)

In [32]:
# process the pcaps, create the token count matrix ready to pass into the model

# convert lengths to tokens
lang1_tokens = [lengths_to_tokens(l) for l in lang1_lengths]
lang2_tokens = [lengths_to_tokens(l) for l in lang2_lengths]

# convert tokens to tuples
lang1_tuples = [tokens_to_tuples(l) for l in lang1_tokens]
lang2_tuples = [tokens_to_tuples(l) for l in lang2_tokens]

# convert tuples to counts of each sample to insert into matrix -> [{int: int}]
lang1_counts = [count_tuples(t) for t in lang1_tuples]
lang2_counts = [count_tuples(t) for t in lang2_tuples]

m1 = np.asmatrix(lang1_counts)
n, _ = m1.shape
zeros = np.zeros((n, 1))
m1 = np.hstack((zeros, m1))

m2 = np.asmatrix(lang2_counts)
n, _ = m2.shape
ones = np.ones((n, 1))
m2 = np.hstack((ones, m2))

langdata = np.vstack((m1, m2))

assert langdata.shape == ((len(lang1_counts) + len(lang2_counts)), 1 + len(vocab))

In [33]:
# split the data into training and testing datasets
# Define the LangDataset class to represent the data

class LangDataset(Dataset):
    def __init__(self, data):
        self.n_samples, self.n_features = data.shape
        # The first column is label, the rest are the features
        self.n_features -= 1

        self.feature = torch.from_numpy(data[:, 1:].astype(np.float32)) # size [n_samples, n_features]
        self.label = torch.from_numpy(data[:, [0]].astype(np.float32)) # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        return self.feature[index], self.label[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples
    
train_np, test_np = train_test_split(langdata, test_size=0.15)
train, test = LangDataset(train_np), LangDataset(test_np)
print("Read in %i train and %i test" % (len(train), len(test)))

Read in 205 train and 37 test


In [34]:
# define the Logreg model and the step function

class SimpleLogreg(nn.Module):
    def __init__(self, num_features):
        """
        Initialize the parameters you'll need for the model.

        :param num_features: The number of features in the linear model
        """
        super(SimpleLogreg, self).__init__()
        self.linear = nn.Linear(num_features, 1)

    def forward(self, x):
        """
        Compute the model prediction for an example.

        :param x: Example to evaluate
        """
        return torch.sigmoid(self.linear(x))

    def evaluate(self, data):
        with torch.no_grad():
            y_predicted = self(data.feature)
            y_predicted_cls = y_predicted.round()
            acc = y_predicted_cls.eq(data.label).sum() / float(data.label.shape[0])
            return acc

    def inspect(self, vocab, limit=10):
        """
        A fundtion to find the top features and print them.
        """

        None
        weights = logreg.linear.weight[0].detach().numpy()

def step(epoch, ex, model, optimizer, criterion, inputs, labels):
    """Take a single step of the optimizer, we factored it into a single
    function so we could write tests.


    :param epoch: The current epoch
    :param ex: Which example / minibatch you're one
    :param model: The model you're optimizing
    :param inputs: The current set of inputs
    :param labels: The labels for those inputs

    A) get predictions
    B) compute the loss from that prediction
    C) backprop
    D) update the parameters
    """
    optimizer.zero_grad()
    prediction = model(inputs)
    loss = criterion(prediction, labels)
    loss.backward()
    optimizer.step()

    if (ex+1) % 20 == 0:
      acc_train = model.evaluate(train)
      acc_test = model.evaluate(test)
      print(f'Epoch: {epoch+1}/{num_epochs}, Example {ex}, loss = {loss.item():.4f}, train_acc = {acc_train.item():.4f} test_acc = {acc_test.item():.4f}')

In [35]:
# Initialize model
logreg = SimpleLogreg(train.n_features)
num_epochs = args["passes"]
batch = args["batch"]
total_samples = len(train)

In [36]:
# initialize the loss function and the optimizer, then train the model

# Replace these with the correct loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(logreg.parameters(), lr=args["learnrate"])

train_loader = DataLoader(dataset=train,
                          batch_size=batch,
                          shuffle=True,
                          num_workers=0)
dataiter = iter(train_loader)

In [37]:
# Iterations (i think this is running the model?
for epoch in range(num_epochs):
  for ex, (inputs, labels) in enumerate(train_loader):
    # Run your training process
    step(epoch, ex, logreg, optimizer, criterion, inputs, labels)

Epoch: 1/5, Example 19, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 39, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 59, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 79, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 99, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 119, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 139, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 159, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 179, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 1/5, Example 199, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 2/5, Example 19, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 2/5, Example 39, loss = -0.0000, train_acc = 0.4878 test_acc = 0.5676
Epoch: 2/5, Example 59, loss = -0.0000, train_acc = 0.4878 test_acc = 0

In [38]:
# i dont actually know what this model is doing or like if theres an easier way (there probably is)

logreg.inspect(vocab)