In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import DatasetFolder
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from math import ceil
from sklearn.model_selection import train_test_split
from torchsummary import summary

In [2]:
class BinaryTransform:
    def __init__(self, input_length):
        self.input_length = input_length

    def __call__(self, binary_data):
        binary_data = np.frombuffer(binary_data, dtype=np.uint8)
        
        l = len(binary_data)

        # Pad or truncate the binary data
        if l < self.input_length:
            padding = np.zeros(self.input_length - l, dtype=np.uint8)
            binary_data = np.concatenate((binary_data, padding))
        elif l > self.input_length:
            excess = ceil(l / self.input_length)
            padding = np.zeros(self.input_length * excess - l, dtype=np.uint8)
            binary_data = np.concatenate((binary_data, padding))
            binary_data = binary_data.reshape(len(binary_data)//excess, -1)
            binary_data = np.mean(binary_data, axis=1)
            
        # Scale the data to [0, 1]
        scaled_data = binary_data / 255.0

        return torch.tensor(scaled_data, dtype=torch.float32)

In [17]:
# Define the dataset class
class BinaryDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = os.listdir(root_dir)

    def __len__(self):
        return sum([len(files) for _, _, files in os.walk(self.root_dir)])

    def __getitem__(self, idx):
        class_name = self.classes[idx % 2]
        class_dir = os.path.join(self.root_dir, class_name)
        files = os.listdir(class_dir)
        file_name = files[idx % len(files)]
        file_path = os.path.join(class_dir, file_name)
        with open(file_path, 'rb') as f:
            binary_data = f.read()
        label = torch.tensor(1) if class_name == 'malware' else torch.tensor(0)
        if self.transform:
            binary_data = self.transform(binary_data)
        return binary_data, label

In [3]:
# Define the convolutional network
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=(10,), stride=(1,))
        self.fc1 = nn.Linear(65488, 2)  # Adjust the input size based on your data size

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
        x = x.view(-1, self.fc1.in_features)
        return F.sigmoid(self.fc1(x))

In [4]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Define data paths
train_data_path = "./data/train"
test_data_path = "./data/test"

In [6]:
# Define the input length and instantiate the transform
input_length = 16384
transform = BinaryTransform(input_length)

In [7]:
# Create dataset
dataset = DatasetFolder(root=train_data_path, loader=lambda x: open(x, 'rb').read(), extensions=('',), transform=transform)

In [8]:
# Split dataset into train and validation sets
train_indices, val_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

In [9]:
# Create train and validation datasets and dataloaders
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

In [10]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [11]:
# Instantiate model
model = ConvNet().to(device)

In [12]:
summary(model,(1,16384))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1            [-1, 16, 16375]             176
            Linear-2                    [-1, 2]         130,978
Total params: 131,154
Trainable params: 131,154
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.06
Forward/backward pass size (MB): 2.00
Params size (MB): 0.50
Estimated Total Size (MB): 2.56
----------------------------------------------------------------


In [13]:
# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

In [14]:
# Set up TensorBoard
writer = SummaryWriter()

In [15]:
# Training loop
best_val_loss = float('inf')
patience = 3
counter = 0

In [19]:
for epoch in range(10):  # You can adjust the number of epochs
    model.train()
    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validate
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
        
    # Write to TensorBoard
    writer.add_scalar('Loss/train', train_loss/len(train_loader.dataset), epoch)
    writer.add_scalar('Loss/val', val_loss/len(val_loader.dataset), epoch)
    writer.add_scalar('Accuracy/val', 100.*correct/total, epoch)

    print(f'Epoch {epoch}, Train Loss: {train_loss/len(train_loader.dataset)}, Val Loss: {val_loss/len(val_loader.dataset)}, Val Acc: {100.*correct/total}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping")
            break

RuntimeError: Given groups=1, weight of size [16, 1, 10], expected input[1, 64, 16384] to have 1 channels, but got 64 channels instead

In [ ]:
# Close TensorBoard writer
writer.close()