In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; a later cell reads
# the dataset archive from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone 'https://github.com/AliM100/Ransomware_Detection.git'
!pip install patool

In [None]:
!unzip /content/drive/MyDrive/malimg_dataset.zip -d data

In [None]:
import os
import sys
import os
from math import log
import numpy as np
import scipy as sp
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import shutil
import pandas as pd
import patoolib
import seaborn as sns
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from sklearn import metrics
from sklearn.metrics import average_precision_score,accuracy_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
import tensorflow
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision
from Ransomware_Detection.mal_dataset import maldataset
from Ransomware_Detection.data_conversion import convert_data
from Ransomware_Detection.dataset import load_data,prepare_data


In [None]:
# Mapping from class (malware family) name to its integer label. The label of a
# family is simply its position in the tuple below, so labels run 0..24.
class_index = {
    family: label
    for label, family in enumerate((
        'Adialer.C',
        'Agent.FYI',
        'Allaple.A',
        'Allaple.L',
        'Alueron.gen!J',
        'Autorun.K',
        'C2LOP.P',
        'C2LOP.gen!g',
        'Dialplatform.B',
        'Dontovo.A',
        'Fakerean',
        'Instantaccess',
        'Lolyda.AA1',
        'Lolyda.AA2',
        'Lolyda.AA3',
        'Lolyda.AT',
        'Malex.gen!J',
        'Obfuscator.AD',
        'Rbot!gen',
        'Skintrim.N',
        'Swizzor.gen!E',
        'Swizzor.gen!I',
        'VB.AT',
        'Wintrim.BX',
        'Yuner.A',
    ))
}



In [None]:
# Locations of the extracted dataset, the generated split CSVs and checkpoints.
data_path="data"
img_path="data/malimg_paper_dataset_imgs"
data_csvs="data/csvs"
save_checkpoints_path="data/checkpoint"
batch_size= 10

os.makedirs(save_checkpoints_path,exist_ok=True)
os.makedirs(data_csvs,exist_ok=True)

data_prepare=prepare_data(data_path,img_path,class_index)

# Build the split CSVs only when they are not there yet.
# NOTE(review): only train.csv is checked; presumably create_csv_data() writes
# train/val/test together into data_csvs -- confirm against the helper.
if not os.path.exists(f"{data_csvs}/train.csv"):
    data_prepare.create_csv_data()

# Spatial size every image is resized to before it reaches the network.
target_size_custom = (224, 224)

# Resize -> cast to float -> normalise with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize(target_size_custom),
    # transforms.Grayscale(num_output_channels=1),
    transforms.Lambda(lambda x: x.float()),
    transforms.Normalize((0.5,), (0.5,))])

training_set=maldataset(csv_file=f"{data_csvs}/train.csv",root_dir=img_path, class_index=class_index, transform=transform)
validation_set=maldataset(csv_file=f"{data_csvs}/val.csv",root_dir=img_path, class_index=class_index, transform=transform)

# Only the training data is shuffled; validation keeps a fixed order.
training_loader = torch.utils.data.DataLoader(training_set, batch_size=batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# Class names ordered by their integer label. Derived from class_index instead
# of a second hand-written list so the two can never drift apart.
classes = tuple(sorted(class_index, key=class_index.get))


In [None]:
import torch
import torch.nn as nn
####best model of the best 97.9 accuracy on malimg 25 classes
###dropout 0.25 -> 0.01 accuracy 97.85
### single linear layer maybe better choice
class CustomModel(nn.Module):
    """Plain five-stage CNN classifier for single-channel images.

    Each stage is Conv3x3 -> ReLU -> MaxPool(2) -> BatchNorm, followed by two
    hidden fully connected layers (256 and 128 units, each with dropout and
    batch norm) and a final linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
            NOTE(review): the default is 26 although ``class_index`` in this
            notebook has 25 entries; the notebook always passes 25 explicitly.
        input_size: ``(height, width)`` of the images fed to the network, or a
            single int for square images. It determines the input width of
            ``fc1``. When ``None`` (the default) the notebook-level
            ``target_size_custom`` is used, which is the original behaviour.

    NOTE: a later cell in this notebook defines another class that is also
    called ``CustomModel``; once that cell has run, it shadows this one.
    """

    def __init__(self, num_classes=26, input_size=None):
        super(CustomModel, self).__init__()

        if input_size is None:
            # Backward-compatible fallback to the notebook-level setting.
            input_size = target_size_custom
        if isinstance(input_size, int):
            input_size = (input_size, input_size)
        height, width = input_size

        self.conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.batch_norm1 = nn.BatchNorm2d(64)

        self.conv2 = nn.Conv2d(64, 32, kernel_size=(3, 3), stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.batch_norm2 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.batch_norm3 = nn.BatchNorm2d(32)

        self.conv4 = nn.Conv2d(32, 16, kernel_size=(3, 3), stride=1, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.batch_norm4 = nn.BatchNorm2d(16)

        self.conv5 = nn.Conv2d(16, 16, kernel_size=(3, 3), stride=1, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.batch_norm5 = nn.BatchNorm2d(16)

        self.flatten = nn.Flatten()

        # The padded 3x3 convolutions keep the spatial size; each of the five
        # 2x2 poolings halves it (rounding down), i.e. a total factor of 32.
        self.fc1_input_size = 16 * (height // 32) * (width // 32)
        self.fc1 = nn.Linear(self.fc1_input_size, 256)
        self.dropout1 = nn.Dropout(0.25)
        self.batch_norm_fc1 = nn.BatchNorm1d(256)

        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.25)
        self.batch_norm_fc2 = nn.BatchNorm1d(128)

        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch ``x`` of single-channel images."""
        x = self.batch_norm1(self.pool1(torch.relu(self.conv1(x))))
        x = self.batch_norm2(self.pool2(torch.relu(self.conv2(x))))
        x = self.batch_norm3(self.pool3(torch.relu(self.conv3(x))))
        x = self.batch_norm4(self.pool4(torch.relu(self.conv4(x))))
        x = self.batch_norm5(self.pool5(torch.relu(self.conv5(x))))

        x = self.flatten(x)

        x = self.batch_norm_fc1(self.dropout1(torch.relu(self.fc1(x))))
        x = self.batch_norm_fc2(self.dropout2(torch.relu(self.fc2(x))))
        x = self.fc3(x)

        return x

In [None]:
import torch
import torch.nn as nn
######## with dropout ####ORIGINAL
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a skip connection.

    The first convolution applies ``stride`` and maps ``in_channels`` to
    ``out_channels``; the second keeps both the resolution and the channel
    count. The skip path is the identity unless the main path changes the
    tensor shape, in which case it is a strided 1x1 convolution followed by
    batch norm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Main path.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: a projection is only required when the shapes differ.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both paths."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        return self.relu(out)

class CustomModel(nn.Module):
    """ResNet-style classifier for single-channel images.

    Layout: a 3x3 conv stem (batch norm, ReLU, 2x2 max-pool), four stages of
    two ``ResidualBlock`` each (64, 128, 256 and 512 channels; stages 2-4 start
    with a stride of 2), global average pooling, dropout (p=0.25) and a linear
    layer that outputs ``num_classes`` logits.
    """

    def __init__(self, num_classes=25):
        super().__init__()

        # Stem.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Residual stages.
        self.layer1 = self.make_layer(64, 64, 2)
        self.layer2 = self.make_layer(64, 128, 2, stride=2)
        self.layer3 = self.make_layer(128, 256, 2, stride=2)
        self.layer4 = self.make_layer(256, 512, 2, stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

        self.dropout = nn.Dropout(0.25)

    def make_layer(self, in_channels, out_channels, blocks, stride=1):
        """Build one stage.

        The first block changes the channel count and applies ``stride``; the
        remaining ``blocks - 1`` blocks keep the shape.
        """
        head = ResidualBlock(in_channels, out_channels, stride)
        tail = [ResidualBlock(out_channels, out_channels) for _ in range(1, blocks)]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        """Return class logits for a batch ``x`` of single-channel images."""
        x = self.pool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(self.dropout(pooled))

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)


# Network, loss and optimiser shared by the training and evaluation cells.
model = CustomModel(num_classes=25)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4, weight_decay=5e-4)


def update_lr(optimizer, lr):
    """Set the learning rate of every parameter group of ``optimizer`` to ``lr``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

In [None]:
def train_one_epoch(epoch_index, tb_writer, log_interval=1000):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``, ``device``
    and ``training_loader``.

    Args:
        epoch_index: Zero-based epoch number; only used to compute the global
            step of the intra-epoch TensorBoard points.
        tb_writer: Object with an ``add_scalar(tag, value, step)`` method
            (e.g. a ``SummaryWriter``).
        log_interval: Every ``log_interval`` batches the mean loss of that
            window is printed and written to ``tb_writer`` as 'Loss/train'.

    Returns:
        ``(avg_loss, train_accuracy)``: the mean per-batch loss over the whole
        epoch and the accuracy of the predictions made during the epoch.
        Previously the loss of the last *complete* reporting window was
        returned, which stayed 0.0 whenever an epoch had fewer batches than
        the window and ignored the batches after the last report.
    """
    window_loss_sum = 0.   # loss accumulated since the last intra-epoch report
    epoch_loss_sum = 0.    # loss accumulated over the whole epoch
    num_batches = 0
    all_true_labels = []
    all_predicted_labels = []

    # enumerate() gives us the batch index for the intra-epoch reporting.
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        outputs = model(inputs.float())
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        window_loss_sum += batch_loss
        epoch_loss_sum += batch_loss
        num_batches += 1

        # Predicted class = index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        all_true_labels.extend(labels.cpu().numpy())
        all_predicted_labels.extend(predicted.cpu().numpy())

        if (i + 1) % log_interval == 0:
            window_loss = window_loss_sum / log_interval  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, window_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', window_loss, tb_x)
            window_loss_sum = 0.

    train_accuracy = accuracy_score(all_true_labels, all_predicted_labels)
    avg_loss = epoch_loss_sum / num_batches if num_batches else 0.

    return avg_loss, train_accuracy

In [None]:
# Initializing in a separate cell so we can easily add more epochs to the same run
checkpoint="checkpoints"
os.makedirs(checkpoint,exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 100

best_vloss = 1_000_000.  # kept for the (currently disabled) loss-based criterion
best_accuracy=0
for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss, train_accuracy = train_one_epoch(epoch_number, writer)

    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()
    running_vloss = 0.0
    num_val_batches = 0
    val_true_labels = []
    val_predicted_labels = []
    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vdata in validation_loader:
            vinputs, vlabels = vdata
            vinputs, vlabels = vinputs.to(device), vlabels.to(device)
            voutputs = model(vinputs.float())
            vloss = loss_fn(voutputs, vlabels)
            # .item() keeps the accumulator a plain Python float instead of a
            # tensor living on `device`, so it prints and logs as a number.
            running_vloss += vloss.item()
            num_val_batches += 1

            _, predicted = torch.max(voutputs, 1)
            val_true_labels.extend(vlabels.cpu().numpy())
            val_predicted_labels.extend(predicted.cpu().numpy())

    # Count the batches explicitly rather than relying on the loop index, which
    # is undefined when the validation loader is empty.
    avg_vloss = running_vloss / max(num_val_batches, 1)
    val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)

    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    print('Accuracy train {} valid {}'.format(train_accuracy, val_accuracy))

    # Log the per-batch average loss and the accuracy
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.add_scalars('Training vs. Validation Accuracy',
                { 'Training' : train_accuracy, 'Validation' : val_accuracy },
                epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state whenever the
    # validation accuracy improves (alternative criterion: avg_vloss < best_vloss).
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        # Save into the directory created above instead of a second hard-coded copy of its name.
        model_path = os.path.join(checkpoint, 'model_{}_{}'.format(timestamp, epoch_number))
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

In [None]:
%tensorboard --logdir runs

In [None]:
# Same preprocessing as for training: resize -> float -> normalise.
transform = transforms.Compose([
    transforms.Resize(target_size_custom),
    # transforms.Grayscale(num_output_channels=1),
    transforms.Lambda(lambda x: x.float()),
    transforms.Normalize((0.5,), (0.5,))
    ])

test_set=maldataset(csv_file=f"{data_csvs}/test.csv",root_dir=img_path, class_index=class_index, transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

# torch.load() does not expand wildcards, so "checkpoints/*" can never be
# opened. Pick the most recently written file in the checkpoint directory; the
# training cell only writes a checkpoint when validation accuracy improves, so
# the newest file of a run is that run's best model.
checkpoint_dir = "checkpoints"
checkpoint_files = [
    os.path.join(checkpoint_dir, entry)
    for entry in os.listdir(checkpoint_dir)
    if os.path.isfile(os.path.join(checkpoint_dir, entry))
]
if not checkpoint_files:
    raise FileNotFoundError("No checkpoint found in '{}'; run the training cell first.".format(checkpoint_dir))
checkpoint_path = max(checkpoint_files, key=os.path.getmtime)
print("Evaluating checkpoint:", checkpoint_path)

model = CustomModel(num_classes=25).to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
# NOTE(review): strict=False silently ignores missing/unexpected keys.
model.load_state_dict(torch.load(checkpoint_path, map_location=device), strict=False)

running_testloss=0.0
num_test_batches = 0
model.eval()
test_true_labels = []
test_predicted_labels = []
with torch.no_grad():
    for data in test_loader:
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs, y_gt = inputs.to(device), labels.to(device)

        y_test_predicted = model(inputs.float())
        test_loss = loss_fn(y_test_predicted, y_gt)
        running_testloss += test_loss.item()
        num_test_batches += 1

        # Predicted class = index of the largest logit.
        _, predicted = torch.max(y_test_predicted, 1)
        test_true_labels.extend(labels.cpu().numpy())
        test_predicted_labels.extend(predicted.cpu().numpy())

# Mean per-batch loss; the explicit counter also covers an empty loader.
avg_testloss = running_testloss / max(num_test_batches, 1)
test_accuracy = accuracy_score(test_true_labels, test_predicted_labels)

IoU=metrics.jaccard_score(test_true_labels, test_predicted_labels,average="micro")
f1=metrics.f1_score(test_true_labels, test_predicted_labels,average="micro")
print("micro IoU",IoU)
print("micro f1",f1)

IoU=metrics.jaccard_score(test_true_labels, test_predicted_labels,average="macro")
f1=metrics.f1_score(test_true_labels, test_predicted_labels,average="macro")
print("macro IoU",IoU)
print("macro f1",f1)

print('LOSS test{}, Accuracy test{}'.format(avg_testloss,test_accuracy))



In [None]:
def confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        confusion_matrix: Square table of integer counts; its rows and columns
            are labelled with ``class_names``.
        class_names: Labels used for both axes.
        figsize: Size passed to ``plt.figure``.
        fontsize: Font size of the tick labels.

    Raises:
        ValueError: If seaborn rejects the values (they must be integers).
    """
    table = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    plt.figure(figsize=figsize)
    try:
        axes = sns.heatmap(table, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    # Row labels stay horizontal, column labels are tilted to avoid overlap.
    for axis, angle in ((axes.yaxis, 0), (axes.xaxis, 45)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Pass the full label set explicitly: without `labels`, scikit-learn only keeps
# the classes that occur in the data, so a class missing from the test set
# would shrink the matrix and no longer line up with the names in `classes`.
c_matrix = metrics.confusion_matrix(test_true_labels, test_predicted_labels, labels=list(range(len(classes))))
df_confusion = pd.crosstab(test_true_labels, test_predicted_labels)
df_confusion.to_csv(os.path.join(data_path,"confusion_matrix.csv"))

confusion_matrix(c_matrix, classes, figsize = (20,7), fontsize=14)

In [None]:
# `labels` pins the report to all class ids; without it, target_names must match
# the number of classes actually present in the data, which breaks as soon as
# one class is absent from the test set.
report = metrics.classification_report(test_true_labels, test_predicted_labels, labels=list(range(len(classes))), target_names=classes,  output_dict=True)
df_report = pd.DataFrame(report).transpose()
print(df_report)