### Drive

### Import

In [1]:
import numpy as np
import torch
import gzip
import random
import time
import os
from collections import deque
from tqdm import tqdm
from PIL import Image
from torch.utils import data
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, models, transforms
from torchsummary import summary #network summary
import matplotlib.pyplot as plt

from collections import deque
from collections import OrderedDict
import multiprocessing
import torch.optim as optim
from sklearn.metrics import roc_auc_score

## DataSet Creation

In [2]:
class BinaryDataset(data.Dataset):
    """
    Dataset of binary files, labelled by the directory tree they were found in.

    Every file found (recursively) under `good_dir` gets label 0 and every file under
    `bad_dir` gets label 1. Files are opened lazily in `__getitem__`; a file that can be
    read as gzip is decompressed, anything else is read as raw bytes.

    If you use the sort_by_size option, the dataset will store files from smallest to
    largest. This is meant to be used with RandomChunkSampler to sample batches of
    similarly sized files to maximize performance.

    Arguments:
    good_dir: root directory of the label-0 files
    bad_dir: root directory of the label-1 files
    sort_by_size: if True, order the file list by on-disk size (ascending)
    max_len: maximum number of (decompressed) bytes read from each file
    """
    def __init__(self, good_dir, bad_dir, sort_by_size=False, max_len=4000000):

        # Each entry is a tuple (file_path, label, file_size_in_bytes)
        self.all_files = []
        self.max_len = max_len

        # Same walk for both classes; good files are listed first, then bad files.
        for label, top_dir in ((0, good_dir), (1, bad_dir)):
            for root_dir, _dirs, files in os.walk(top_dir):
                for file in files:
                    to_add = os.path.join(root_dir, file)
                    self.all_files.append((to_add, label, os.path.getsize(to_add)))

        if sort_by_size:
            self.all_files.sort(key=lambda entry: entry[2])

    def __len__(self):
        return len(self.all_files)

    def _read_bytes(self, path):
        """Return up to `self.max_len` bytes of `path`, gunzipped when that is possible."""
        import zlib  # local import: only needed to name the exception type below

        try:
            with gzip.open(path, 'rb') as f:
                return f.read(self.max_len)
        except (OSError, EOFError, zlib.error):
            # OSError / BadGzipFile: not a gzip file at all.
            # EOFError / zlib.error: starts with the gzip magic but is truncated or
            # corrupt (samples may themselves be broken archives).
            # In every case just read in the raw bytes from disk.
            with open(path, 'rb') as f:
                return f.read(self.max_len)

    def __getitem__(self, index):
        """
        Returns (x, y): x is a 1-D int16 tensor of byte values shifted by +1,
        y is a tensor of shape (1,) holding the label.
        """
        to_load, y, _ = self.all_files[index]

        raw = self._read_bytes(to_load)
        # Need to use frombuffer b/c it is a byte string, otherwise np.asarray would not
        # convert it to ints. Decode as uint8 (1 byte per value), widen to int16, and
        # add 1 so that index 0 is free to be the special padding index.
        x = np.frombuffer(raw, dtype=np.uint8).astype(np.int16) + 1
        x = torch.tensor(x)

        return x, torch.tensor([y])
    
class RandomChunkSampler(torch.utils.data.sampler.Sampler):
    """
    Sampler that yields dataset indices in shuffled contiguous chunks.

    The index range [0, len(data_source)) is cut into consecutive chunks of `batch_size`
    indices (the last chunk may be shorter). The order of the chunks is shuffled, but the
    order of the indices inside each chunk is kept, so items of one chunk are always
    loaded together. Useful to keep similarly sized items in the same batch.
    """
    def __init__(self, data_source, batch_size):
        """
        data_source: the source pytorch dataset object (only its length is used)
        batch_size: number of consecutive indices kept together; should generally equal
                    the training batch size.
        """
        self.data_source = data_source
        self.batch_size = batch_size

    def __iter__(self):
        total = len(self.data_source)

        # Consecutive index ranges, one per chunk
        chunks = [
            range(lo, min(lo + self.batch_size, total))
            for lo in range(0, total, self.batch_size)
        ]
        # Randomize the order of the chunks, not of their contents
        random.shuffle(chunks)

        # Flatten the shuffled chunks back into one index sequence
        ordering = []
        for chunk in chunks:
            ordering.extend(chunk)

        return iter(ordering)

    def __len__(self):
        return len(self.data_source)
    
# We want to handle truly variable-length inputs, but a DataLoader batch must be a
# single tensor. So every batch is padded (with 0) to the length of its longest item.
def pad_collate_func(batch):
    """
    Collate function for a pytorch DataLoader (pass as collate_fn=pad_collate_func).

    batch: list of (sequence, label) pairs, where label is a tensor of shape (1,).

    Returns (x, y): x is the sequences zero-padded to the longest one and stacked
    batch-first; y is the labels as a 1-D tensor of length B.
    """
    sequences = []
    targets = []
    for item in batch:
        sequences.append(item[0])
        targets.append(item[1])

    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    # stacking gives (B, 1); take column 0 to get just (B)
    stacked = torch.stack(targets)

    return padded, stacked[:, 0]

In [3]:
batch_size = 8

train_data = BinaryDataset("/Users/nando/OfflineStuff/BinaryMalware/train/benign_updated", '/Users/nando/OfflineStuff/BinaryMalware/train/malware', sort_by_size=False, max_len=2000000)
m = len(train_data)
print(m)

# Hold out 20% of the training files for validation. The training size is derived from
# the validation size so the two always sum to m; random_split raises otherwise
# (the previous int(m-m*0.2+1) / int(m*0.2) pair only added up for some values of m).
n_val = int(m * 0.2)
n_train = m - n_val
train_data, val_data = random_split(train_data, [n_train, n_val])

# Reshuffle the training subset every epoch; validation order does not matter.
train_loader = DataLoader(train_data, batch_size, shuffle=True, collate_fn=pad_collate_func)
valid_loader = DataLoader(val_data, batch_size, collate_fn=pad_collate_func)

test_data = BinaryDataset("/Users/nando/OfflineStuff/BinaryMalware/test/benign_updated", '/Users/nando/OfflineStuff/BinaryMalware/test/malware', sort_by_size=False, max_len=2000000)
test_loader = DataLoader(test_data, batch_size, collate_fn=pad_collate_func)

11001


## Network

### LowMemConvBase

In [None]:
"""
Classifying Sequences of Extreme Length with Constant Memory Applied to Malware Detection
Edward Raff, William Fleshman, Richard Zak, Hyrum Anderson and Bobby Filar and Mark Mclean
https://arxiv.org/abs/2012.09390
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

def drop_zeros_hook(module, grad_input, grad_out):
    """
    Backward hook that replaces all-zero input gradients with sparse tensors.

    A gradient with no non-zero entry is converted with `to_sparse()`; every other
    entry (including `None`, which autograd uses for inputs that have no gradient) is
    passed through untouched. This is used as an approximation to sparse
    back-propagation, to avoid redundant and useless work on gradients that are zero.

    module: the module the hook is registered on (unused)
    grad_input: tuple of gradients w.r.t. the module inputs; entries may be None
    grad_out: gradients w.r.t. the module outputs (unused)

    Returns a tuple with one entry per element of grad_input.
    """
    grads = []
    with torch.no_grad():
        for g in grad_input:
            if g is None:
                # Nothing to inspect; torch.nonzero(None) would raise.
                grads.append(g)
            elif torch.nonzero(g).shape[0] == 0:
                grads.append(g.to_sparse())
            else:
                grads.append(g)

    return tuple(grads)


class CatMod(torch.nn.Module):
    """
    Parameter-free module that concatenates a sequence of tensors along dim 2.

    It is `torch.cat(x, dim=2)` wrapped in a Module, so module hooks can be
    registered on the concatenation.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # x: sequence of tensors that agree on every dimension except dim 2
        joined = torch.cat(x, dim=2)
        return joined
    
    
class LowMemConvBase(nn.Module):
    """
    Base class for convolutional models over very long sequences with global max-pooling.

    Sub-classes implement `processRange` (and must define an `self.embd` module, whose
    parameters are used to find the compute device). `seq2fix` then turns a (B, L)
    input into a fixed (B, C) representation in two passes:

    1. without gradients, the input is scanned chunk by chunk to find, per channel,
       the position that wins the global max-pool;
    2. with gradients, `processRange` is re-run on only the small windows around the
       winning positions.
    """

    def __init__(self, chunk_size=65536, overlap=512, min_chunk_size=1024):
        """
        chunk_size: how many bytes at a time to process. Increasing may improve compute
                    efficiency, but uses more memory. Memory use of the scanning pass is
                    a function of chunk_size and not of the input length L.

        overlap: how many bytes of overlap to use between chunks. Stored, but the
                 chunking in seq2fix currently steps by the full chunk_size.

        min_chunk_size: a trailing chunk shorter than this (or shorter than the
                        receptive field) is not scanned.
        """
        super(LowMemConvBase, self).__init__()
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.min_chunk_size = min_chunk_size

        #Used for pooling over time in a more efficient way
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.cat = CatMod()
        self.cat.register_backward_hook(drop_zeros_hook)
        # Filled in lazily by determinRF (together with self.stride / self.out_channels)
        self.receptive_field = None

        #Used to force checkpoint code to behave correctly due to poor design https://discuss.pytorch.org/t/checkpoint-with-no-grad-requiring-inputs-problem/19117/11
        self.dummy_tensor = torch.ones(1, dtype=torch.float32, requires_grad=True)

    def processRange(self, x, **kwargs):
        """
        To be overridden by sub-classes.

        Converts a LongTensor input x of shape (B, L), where B is the batch size and L
        is the length of the input, into a tensor of shape (B, C, L'), where C is the
        number of channels and L' is the (possibly strided / shortened) output length.
        It is expected to raise when L is too short for the network; determinRF relies
        on that to measure the receptive field.
        """
        pass

    def determinRF(self):
        """
        Evaluates the receptive field & stride of the sub-network.

        The receptive field is found by binary search as the smallest input length (at
        most chunk_size) for which processRange does not raise. The stride is
        chunk_size divided by the output length for a chunk_size-long input.
        Results are cached on the instance after the first call.

        Returns (receptive_field, stride, out_channels).
        """

        if self.receptive_field is not None:
            return self.receptive_field, self.stride, self.out_channels

        if not hasattr(self, "device_ids"):
            #We are training with just one device. Lets find out where we should move the data
            cur_device = next(self.embd.parameters()).device
        else:
            cur_device = "cpu"

        #Lets do a simple binary search to figure out how large our RF is.
        #It can't be larger than our chunk size! So use that as upper bound
        min_rf = 1
        max_rf = self.chunk_size

        with torch.no_grad():

            tmp = torch.zeros((1,max_rf)).long().to(cur_device)

            while True:
                test_size = (min_rf+max_rf)//2
                is_valid = True
                try:
                    self.processRange(tmp[:,0:test_size])
                except Exception:
                    # Too short for the network. Exception (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still stop the search.
                    is_valid = False

                if is_valid:
                    max_rf = test_size
                else:
                    min_rf = test_size+1

                if max_rf == min_rf:
                    self.receptive_field = min_rf
                    out_shape = self.processRange(tmp).shape
                    self.stride = self.chunk_size//out_shape[2]
                    self.out_channels = out_shape[1]
                    break


        return self.receptive_field, self.stride, self.out_channels


    def pool_group(self, *args):
        """Concatenates the given tensors along dim 2 and max-pools them to length 1."""
        x = self.cat(args)
        x = self.pooling(x)
        return x

    def seq2fix(self, x, pr_args=None):
        """
        Takes in an input LongTensor of (B, L) that will be converted to a fixed length
        representation (B, C), where C is the number of channels produced by processRange.

        pr_args: optional dict of extra keyword arguments forwarded to processRange.
        """
        if pr_args is None:
            # Fresh dict per call instead of a shared mutable default argument
            pr_args = {}

        receptive_window, stride, out_channels = self.determinRF()

        if x.shape[1] < receptive_window: #This is a tiny input! Pad it out
            x = F.pad(x, (0, receptive_window-x.shape[1]), value=0) # 0 is the pad value
        batch_size = x.shape[0]
        length = x.shape[1]

        #Let's go through the input data without gradients first, and find the positions that "win"
        #the max-pooling. Most of the gradients will be zero, and we don't want to waste valuable
        #memory and time computing them.
        #Once we know the winners, we will go back and compute the forward activations on JUST
        #the subset of positions that won!
        #Start from -inf so the first chunk always registers a winner, even when every
        #activation is negative (a start value of -1 silently ignored activations <= -1).
        winner_values = np.full((batch_size, out_channels), -np.inf)
        winner_indices = np.zeros((batch_size, out_channels), dtype=np.int64)

        if not hasattr(self, "device_ids"):
            #We are training with just one device. Lets find out where we should move the data
            cur_device = next(self.embd.parameters()).device
        else:
            cur_device = None

        step = self.chunk_size #- self.overlap
        start = 0
        end = start+step

        with torch.no_grad():
            while start < end and (end-start) >= max(self.min_chunk_size, receptive_window):
                x_sub = x[:,start:end]
                if cur_device is not None:
                    x_sub = x_sub.to(cur_device)
                activs = self.processRange(x_sub.long(), **pr_args)
                activ_win, activ_indx = F.max_pool1d(activs, kernel_size=activs.shape[2], return_indices=True)
                #We want to remove only the last dimension. np.squeeze would also remove
                #the batch dimension when the batch size is 1, so index explicitly.
                activ_win = activ_win.cpu().numpy()[:,:,0]
                activ_indx = activ_indx.cpu().numpy()[:,:,0]
                selected = winner_values < activ_win
                #Map the position in the (strided) output back to a position in the input
                winner_indices[selected] = activ_indx[selected]*stride + start
                winner_values[selected]  = activ_win[selected]
                start = end
                end = min(start+step, length)

        # Now we know every index that won, we need to compute values and with gradients!

        # Find unique winners for every batch
        final_indices = [np.unique(winner_indices[b,:]) for b in range(batch_size)]

        # Collect inputs that won for each batch: a window of one receptive field on
        # each side of every winning position, clipped to the input
        chunk_list = [[x[b:b+1,max(i-receptive_window,0):min(i+receptive_window,length)] for i in final_indices[b]] for b in range(batch_size)]
        # Convert to a torch tensor of the bytes
        chunk_list = [torch.cat(c, dim=1)[0,:] for c in chunk_list]

        # Pad out shorter sequences to the longest one
        x_selected = torch.nn.utils.rnn.pad_sequence(chunk_list, batch_first=True)

        # Shape is now (B, L'). Compute it.
        if cur_device is not None:
            x_selected = x_selected.to(cur_device)
        x_selected = self.processRange(x_selected.long(), **pr_args)
        x_selected = self.pooling(x_selected)
        x_selected = x_selected.view(x_selected.size(0), -1)

        return x_selected
        

### AvastConv

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

def vec_bin_array(arr, m=8):
    """
    Arguments:
    arr: Numpy array of non-negative integers
    m: Number of (least-significant) bits of each integer to retain

    Returns a float32 array of shape arr.shape + (m,), in which every element of arr is
    replaced by its m-bit binary representation, most-significant bit first. A set bit
    is encoded as +1/16 and a cleared bit as -1/16.

    Values >= 2**m keep only their low m bits. (Slicing the binary string instead kept
    the high bits, so e.g. 256 was encoded exactly like 128 for m=8.)
    """
    values = np.asarray(arr).astype(np.int64)
    # Shift amounts m-1 ... 0, so that the last axis runs from MSB to LSB
    shifts = np.arange(m - 1, -1, -1, dtype=np.int64)
    bits = (values[..., np.newaxis] >> shifts) & 1

    # Map {0, 1} -> {-1, +1} and scale
    return (bits * 2 - 1).astype(np.float32) / 16

class AvastConv(LowMemConvBase):
    """
    Byte-level CNN: a fixed binary byte embedding, four strided 1-D convolutions,
    global max-pooling (via LowMemConvBase.seq2fix) and a four-layer classifier head.

    Inputs are LongTensors of shape (B, L) holding byte values shifted by +1, with 0
    reserved for padding.

    Arguments:
    out_size: number of output logits
    channels: base channel count; the conv stack uses 1x, 2x, 3x and 4x this value
    window_size: kernel size of the first two convolutions (the last two use half)
    stride: stride of the first two convolutions (the last two use double)
    embd_size: number of bits (= embedding dimensions) used to encode each byte
    """

    def __init__(self, out_size=2, channels=48, window_size=32, stride=4, embd_size=8):
        super(AvastConv, self).__init__()
        self.embd = nn.Embedding(257, embd_size, padding_idx=0)
        # Embedding row i represents byte value i-1 (row 0 is the padding row), so rows
        # 1..256 receive the bit patterns of 0..255. Encoding the row index itself made
        # row 256 (byte 255) identical to row 128, because 256 does not fit in 8 bits.
        byte_codes = torch.tensor(vec_bin_array(np.arange(256), m=embd_size))
        with torch.no_grad():
            self.embd.weight[1:] = byte_codes
        # The embedding is a fixed encoding, not a learned one
        for param in self.embd.parameters():
             param.requires_grad = False

        self.conv_1 = nn.Conv1d(embd_size, channels, window_size, stride=stride, bias=True)
        self.conv_2 = nn.Conv1d(channels, channels*2, window_size, stride=stride, bias=True)
        self.pool = nn.MaxPool1d(4)
        self.conv_3 = nn.Conv1d(channels*2, channels*3, window_size//2, stride=stride*2, bias=True)
        self.conv_4 = nn.Conv1d(channels*3, channels*4, window_size//2, stride=stride*2, bias=True)

        self.fc_1 = nn.Linear(channels*4, channels*4)
        self.fc_2 = nn.Linear(channels*4, channels*3)
        self.fc_3 = nn.Linear(channels*3, channels*2)
        self.fc_4 = nn.Linear(channels*2, out_size)


    def processRange(self, x):
        """Maps a (B, L) LongTensor to conv activations of shape (B, channels*4, L')."""
        # Fixed embedding
        with torch.no_grad():
            x = self.embd(x)
            # (B, L, E) -> (B, E, L), the layout Conv1d expects
            x = torch.transpose(x,-1,-2)

        x = F.relu(self.conv_1(x))
        x = F.relu(self.conv_2(x))
        x = self.pool(x)
        x = F.relu(self.conv_3(x))
        x = F.relu(self.conv_4(x))

        return x

    def forward(self, x):
        """
        Returns (logits, penult, post_conv): the fc_4 output, the activations after
        fc_3, and the pooled convolutional features returned by seq2fix.
        """
        post_conv = x = self.seq2fix(x)

        x = F.selu(self.fc_1(x))
        x = F.selu(self.fc_2(x))
        penult = x = F.selu(self.fc_3(x))
        x = self.fc_4(x)

        return x, penult, post_conv

## Train

In [None]:
# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
# (Two consecutive assignments let the MPS check override the CUDA result, so a
# CUDA-only machine ended up on the CPU.)
mps_backend = getattr(torch.backends, "mps", None)  # absent in older torch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Build the model with its default hyper-parameters and place it on the chosen device
# (Module.to moves the parameters in place and returns the module itself).
net = AvastConv().to(device)

# If BCELoss is used instead of logit
# channel_size = 48
# net.fc_4 = nn.Sequential(
#                nn.Linear(channel_size*2, 2),
#                nn.Sigmoid())

# Network summary for an input of size (100,)
summary(net, (100,))

In [None]:
# checkpoint = torch.load('logs/avastcnn.checkpoint', map_location=device)
# try:
#   net.load_state_dict(checkpoint['model_state_dict'])
# except:
#   print("No valid checkpoint data")

# del checkpoint['model_state_dict']
# del checkpoint['optimizer_state_dict']
# args_to_use = checkpoint

# Per-run log directory, named from the local day, month, hour and minute.
# NOTE(review): the '_v cc' fragment looks like a stray paste, and ':' is not a valid
# path character on every platform. The name is left as-is so existing log locations
# do not change - confirm before cleaning it up.
lt = time.localtime()
base_name = 'logs/log-' + str(lt[2])+'_v cc'+str(lt[1])+'-'+str(lt[3])+':'+str(lt[4])

# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(base_name, exist_ok=True)
# File prefix inside the run directory. Joining base_name with itself repeated the
# whole 'logs/...' path a second time.
file_name = os.path.join(base_name, os.path.basename(base_name))

In [None]:
n_epochs = 20

optimizer = optim.AdamW(net.parameters()) # AdamW has shown better generalization than Adam and on par with SGD and Momentum used in state of the art
criterion = nn.CrossEntropyLoss()
# Halve the learning rate every n_epochs//10 epochs. step_size must be at least 1:
# n_epochs < 10 would otherwise give step_size=0 and a division by zero in StepLR.
scheduler = StepLR(optimizer, step_size=max(1, n_epochs//10), gamma=0.5)

headers = ['epoch', 'train_acc','train_loss', 'val_acc'] # For csv printing

with open(base_name + ".csv", 'w') as csv_log_out:
    csv_log_out.write(",".join(headers) + "\n")

    for epoch in range(n_epochs):
        running_loss = 0.0
        train_loss = []

        train_correct = 0
        train_total = 0

        # Dictionary used to store the values for the csv
        epoch_stats = {'epoch': epoch}

        # Set model to train mode
        net.train()

        for inputs, labels in tqdm(train_loader):

            ## Step 1: Load the data on the device
            inputs, labels = inputs.to(device), labels.to(device)

            ## Step 2: Run the model on the input data
            outputs, penult, post_conv = net(inputs)

            ## Step 3: Calculate the loss
            loss = criterion(outputs, labels)

            ## Step 4: Perform backpropagation
            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()
            loss.backward()

            ## Step 5: Update the parameters
            optimizer.step()

            # One device->host transfer per batch for the loss value
            batch_loss = loss.item()
            train_loss.append(batch_loss)
            running_loss += batch_loss

            # Predicted class = index of the largest logit
            predicted = outputs.detach().argmax(dim=1)

            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        ## End Train Loop

        print("Training Accuracy: {}".format(train_correct*100.0/train_total))
        print("Training Loss: {}".format(np.mean(train_loss)))

        epoch_stats['train_acc'] = train_correct*1.0/train_total
        epoch_stats['train_loss'] = np.mean(train_loss)

        ## Save the net and current state!
        # The path used to be built without anything being written to it. The keys
        # match the ones read by the (commented-out) checkpoint loading code.
        net_path = os.path.join(base_name, "epoch_{}.checkpoint".format(epoch))
        torch.save({
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, net_path)

        ## Validation Set Eval
        net.eval()
        eval_train_correct = 0
        eval_train_total = 0

        # Positive-class probabilities and true labels of the validation set
        # (kept for inspection after the loop, e.g. for a ROC curve)
        preds = []
        truths = []
        with torch.no_grad():
            for inputs, labels in tqdm(valid_loader):

                inputs, labels = inputs.to(device), labels.to(device)

                outputs, _, _ = net(inputs)

                predicted = outputs.argmax(dim=1)

                preds.extend(F.softmax(outputs, dim=-1)[:,1].cpu().numpy().ravel())
                truths.extend(labels.cpu().numpy().ravel())

                eval_train_total += labels.size(0)
                eval_train_correct += (predicted == labels).sum().item()

        print("Val Accuracy: {}".format(eval_train_correct*100.0/eval_train_total))

        epoch_stats['val_acc'] = eval_train_correct*1.0/eval_train_total

        csv_log_out.write(",".join([str(epoch_stats[h]) for h in headers]) + "\n")
        csv_log_out.flush()

        scheduler.step()