# Data preprocessing and insight

In [1]:
import numpy as np
import pandas as pd
import torch

## Data loading

In [2]:
# Raw table: one row per sample with a `hash`, the call ids t_0..t_99 and a `malware` label.
DATASET_CSV = "dynamic_api_call_sequence_per_malware_100_0_306.csv"
data = pd.read_csv(DATASET_CSV)
data.head()

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,1
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,1
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,1
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,1
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,1


## Missing values

Let's count the number of missing values.

In [3]:
# Missing cells per column first, then their grand total over the whole frame.
missing_per_column = data.isnull().sum()
missing_per_column.sum()

0

Luckily, we have no missing values.

## Data filtering

In [4]:
# The `hash` column is not used by any later cell, so it is removed here.
# `data` is overwritten, so without errors="ignore" a second run of this cell would
# raise KeyError because the column is already gone; with it the cell is idempotent.
data = data.drop(columns="hash", errors="ignore")
data.head()

Unnamed: 0,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,112,274,158,215,274,158,215,298,76,208,...,71,297,135,171,215,35,208,56,71,1
1,82,208,187,208,172,117,172,117,172,117,...,81,240,117,71,297,135,171,215,35,1
2,16,110,240,117,240,117,240,117,240,117,...,65,112,123,65,112,123,65,113,112,1
3,82,208,187,208,172,117,172,117,172,117,...,208,302,208,302,187,208,302,228,302,1
4,82,240,117,240,117,240,117,240,117,172,...,209,260,40,209,260,141,260,141,260,1


In [5]:
# Labels: the `malware` column as a fresh float32 vector.
y = np.asarray(data["malware"]).astype(np.float32)
y

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [6]:
# Features: every remaining column except the label.
# The label is removed by name (as it is when building `y`) rather than by slicing off
# the last column, so a change in column order cannot silently leak `malware` into the
# features or drop a real call column.
sequences = data.drop(columns="malware").to_numpy()
sequences

array([[112, 274, 158, ..., 208,  56,  71],
       [ 82, 208, 187, ..., 171, 215,  35],
       [ 16, 110, 240, ...,  65, 113, 112],
       ...,
       [ 82, 240, 117, ..., 260, 141, 260],
       [ 82, 240, 117, ..., 141, 260, 141],
       [112, 274, 158, ..., 208,  56,  71]])

In [7]:
# (number of samples, calls per sequence)
np.shape(sequences)

(43876, 100)

## Train-Val-Test split

In [8]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 10% of all samples as the test set (stratified on the label).
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    y,
    test_size=0.1,
    random_state=4835,
    stratify=y,
)
# Step 2: take 1/9 of the remaining 90% as validation, i.e. another 10% of the total.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=1.0 / 9.0,
    random_state=2548,
    stratify=y_train,
)

# Parallel lists (same order) so later cells can process and save all splits in loops.
Xnames, Xs = ["X_train", "X_val", "X_test"], [X_train, X_val, X_test]
ynames, ys = ["y_train", "y_val", "y_test"], [y_train, y_val, y_test]

## One-hot encoding

In [9]:
# Sorted array of the distinct call ids that occur anywhere in `sequences`;
# the position of an id in this array is used as its one-hot index below.
uniqueCalls = np.unique(sequences.ravel())
uniqueCalls

array([  0,   2,   3,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  22,  23,  24,  25,  26,  28,  29,
        30,  31,  32,  33,  34,  35,  37,  38,  39,  40,  42,  43,  44,
        45,  46,  47,  48,  49,  50,  52,  54,  55,  56,  57,  58,  59,
        60,  62,  63,  64,  65,  67,  68,  70,  71,  72,  73,  74,  75,
        76,  77,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 103, 104,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       148, 149, 150, 151, 153, 154, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 176, 177,
       178, 179, 180, 181, 182, 183, 184, 185, 187, 188, 190, 192, 194,
       195, 196, 197, 198, 199, 200, 201, 202, 204, 205, 206, 20

Apparently, not all 307 possible calls (ids 0–306) appear in the data set: only 264 distinct calls actually occur.

In [10]:
def getUniqueCall(nonUniqueCall):
    """Map a raw call id to its position in the module-level `uniqueCalls` array.

    Parameters
    ----------
    nonUniqueCall : int
        Raw call id to look up.

    Returns
    -------
    numpy integer
        Index of the first entry of `uniqueCalls` equal to `nonUniqueCall`.

    Raises
    ------
    IndexError
        If `nonUniqueCall` does not occur in `uniqueCalls`.
    """
    matchingPositions = np.flatnonzero(uniqueCalls == nonUniqueCall)
    # Indexing an empty result raises IndexError, which signals an unknown call id.
    return matchingPositions[0]

def getOneHotSequences(sequences, vocabulary=None):
    """One-hot encode a 2-D array of call ids as a sparse COO tensor.

    Parameters
    ----------
    sequences : array-like of shape (numSequences, sequenceLength)
        Raw call ids. Every id must be present in `vocabulary`.
    vocabulary : 1-D array-like, optional
        Known call ids; the position of an id in this array is its one-hot
        index (first occurrence wins if an id is repeated). Defaults to the
        module-level `uniqueCalls`.

    Returns
    -------
    torch.Tensor
        Sparse float32 COO tensor of shape
        (numSequences, sequenceLength, len(vocabulary)) with a single 1.0 per
        (sequence, step) at the index of the call made at that step.

    Raises
    ------
    ValueError
        If `sequences` is not 2-D.
    IndexError
        If `sequences` contains an id that is missing from `vocabulary`.
    """
    if vocabulary is None:
        vocabulary = uniqueCalls
    vocabulary = np.asarray(vocabulary).ravel()
    sequences = np.asarray(sequences)
    if sequences.ndim != 2:
        raise ValueError(
            f"sequences must be 2-D (numSequences, sequenceLength), got shape {sequences.shape}"
        )
    if sequences.size and not vocabulary.size:
        raise IndexError("cannot encode call ids with an empty vocabulary")

    # Vectorised lookup instead of one linear scan per element. Going through a
    # stable argsort keeps the result correct for an unsorted vocabulary too.
    order = np.argsort(vocabulary, kind="stable")
    slots = np.searchsorted(vocabulary, sequences, sorter=order)
    # searchsorted returns len(vocabulary) for ids above the maximum; clip so the
    # membership check below can index safely and report those ids as unknown.
    slots = np.minimum(slots, max(vocabulary.size - 1, 0))
    callIndices = order[slots]

    unknown = vocabulary[callIndices] != sequences
    if unknown.any():
        examples = np.unique(sequences[unknown]).tolist()[:5]
        raise IndexError(f"call ids not present in the vocabulary: {examples}")

    rows, cols = np.indices(sequences.shape)
    hotIndices = np.stack([rows.ravel(), cols.ravel(), callIndices.ravel()]).astype(np.int64)
    # The size is passed explicitly: if it were inferred from the largest index
    # present, a split that never contains the last vocabulary entry would get a
    # narrower last dimension than the other splits.
    return torch.sparse_coo_tensor(
        torch.from_numpy(hotIndices),
        torch.ones(hotIndices.shape[1], dtype=torch.float32),
        size=(sequences.shape[0], sequences.shape[1], vocabulary.size),
    )

In [11]:
# Replace each feature split in `Xs` by its one-hot encoding and print the resulting shape,
# then turn each label vector in `ys` into a torch tensor.
# The lists are updated in place, so a second run of this cell would otherwise pass
# already-converted tensors back into the encoder / `torch.from_numpy` and fail; the
# `torch.is_tensor` guards skip entries that were converted by an earlier run.
for i, split in enumerate(Xs):
    if not torch.is_tensor(split):
        Xs[i] = getOneHotSequences(split)
    print(Xs[i].shape)
for i, labels in enumerate(ys):
    if not torch.is_tensor(labels):
        ys[i] = torch.from_numpy(labels)

torch.Size([35100, 100, 264])
torch.Size([4388, 100, 264])
torch.Size([4388, 100, 264])


In [12]:
# Write every split to "<name>.pt" in the working directory; names and tensors are
# paired by position (features first, then labels).
for name, tensor in zip(Xnames + ynames, Xs + ys):
    torch.save(tensor, f"{name}.pt")