In [1]:
import os

# Move to the project root so that every relative path used below resolves.
# The default location is machine-specific; set the M2V_PROJECT_ROOT
# environment variable to run this notebook from a different checkout.
PROJECT_ROOT = os.environ.get(
    'M2V_PROJECT_ROOT',
    '/home/rcgonzal/DSC180Malware/m2v-adversarial-hindroid/',
)
os.chdir(PROJECT_ROOT)

import pandas as pd
import numpy as np

from src.model.model import M2VDroid
from src.model.hindroid import Hindroid
from src.data.hindroid_etl import make_models
from src.analysis.analysis import create_performance_table
from src.utils import find_apps
from src.attack.attack import *

%load_ext autoreload
%autoreload 2

# Purpose
This notebook walks a user through how to use this package in some detail. Note: all paths are relative to the project directory unless they begin with the root indicator, i.e. `/`.

# Data Selection
We assume you already have access to Android apps decompiled into their Smali representations. If you have not done this, please look into how to use Apktool and Smali to decompile Android APKs (we may provide a script in the future). What we do provide is the `find_apps` function which, given a directory, will recursively look for decompiled apps and return a DataFrame with their locations. This is how the `app_list.csv` file begins.

In [2]:
# Look under the bundled test data for decompiled apps and show the resulting table of locations
find_apps('test/testdata/')

Unnamed: 0_level_0,app_dir
app,Unnamed: 1_level_1
testapp1,test/testdata/testapp1
testapp2,test/testdata/testapp2


In some cases, such as the file `data/out/all-apps/app_list.csv`, we add more columns to this table, such as which category an app comes from and whether or not it is malware, so that we can label our examples.

In [32]:
# Load the master app list: every column is read as text and rows are keyed by app name
all_apps = pd.read_csv(
    'data/out/all-apps/app_list.csv',
    index_col='app',
    dtype=str,
)
all_apps

Unnamed: 0_level_0,app_dir,category,malware
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
com.kaktus.hyungkaktus,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.wedup.duduamzaleg,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.dublin_mobile123.cheat_gta_5,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
com.appall.optimizationbox,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
live.wallpaper.t910001560,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps,0
...,...,...,...
iseed.parumo1,/teams/DSC180A_FA20_A00/a04malware/personal-gr...,random-apps,0
com.dailyblah.rajanikaparthy.talkingtimer,/teams/DSC180A_FA20_A00/a04malware/personal-gr...,random-apps,0
com.cakrabuana.lagunaffpopuler,/teams/DSC180A_FA20_A00/a04malware/personal-gr...,random-apps,0
com.odev.talkingtom,/teams/DSC180A_FA20_A00/a04malware/personal-gr...,random-apps,0


In [33]:
# How many apps fall into each category
all_apps['category'].value_counts()

malware         5516
random-apps      581
popular-apps     324
Name: category, dtype: int64

**Aside:** `all-apps` is a special folder in our output directory (`data/out`) because it houses all apps -- and their API data in `app-data`! When parsed in our ETL, each app is extracted into its own `.csv` containing every API call made within it, making it easy to select the apps we want just by knowing their names (or md5s for malware).

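With that said, let's return to selecting our data. We want to split our data into a training set and a test set using a stratified sample, so that both keep the same ratio of benign apps to malware: one third of each class is drawn for the training set and the remainder forms the test set. We also have a category `random-apps`, which we will keep.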

In [4]:
# Apps from the 'random-apps' category are intentionally kept in the pool.
# Stratified sample: draw one third of each `malware` class with a fixed seed.
# group_keys=False keeps the original `app` index, so only the label column
# has to be dropped afterwards.
training_sample = (
    all_apps
    .groupby('malware', group_keys=False)
    .apply(lambda group: group.sample(frac=1/3, random_state=42))
    .drop(columns='malware')
)
training_sample

Unnamed: 0_level_0,app_dir,category
app,Unnamed: 1_level_1,Unnamed: 2_level_1
droom.sleepIfUCan,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.prowebce003423CECEAPROSO.android,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps
com.nurinmaru.farm0032,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps
com.hulu.plus,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
com.triovent.reciepeapp,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps
...,...,...
868e4c839030f28c22998f8a0147f44d,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
cfa8c24967ee5c269b83cfc7e50c6352,/teams/DSC180A_FA20_A00/a04malware/malware/You...,malware
5de94bc0c4cc183c0ee5a48a7ae5ae43,/teams/DSC180A_FA20_A00/a04malware/malware/RuM...,malware
51daa9651d546de844280eaf722c3e67,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware


In [5]:
# Every app that was not drawn for training goes into the test set
held_out_apps = all_apps.index.difference(training_sample.index)
testing_sample = all_apps.loc[held_out_apps, ['app_dir', 'category']]
testing_sample

Unnamed: 0_level_0,app_dir,category
app,Unnamed: 1_level_1,Unnamed: 2_level_1
00268453be254779f0c7590de47db944,/teams/DSC180A_FA20_A00/a04malware/malware/Dro...,malware
002a7270ec52ec68ea3d979c85261308,/teams/DSC180A_FA20_A00/a04malware/malware/Ban...,malware
0030e0003b7226e9142683e49b41a423,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
00335946abb79777f9fe2d0d96651e03,/teams/DSC180A_FA20_A00/a04malware/malware/Vid...,malware
0038be31cfed95e13a33d87142eada70,/teams/DSC180A_FA20_A00/a04malware/malware/Fak...,malware
...,...,...
sts.al,/teams/DSC180A_FA20_A00/a04malware/popular-app...,popular-apps
taha.islam.renewal,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps
toolbox.m.incoming.stop,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps
ua.com.citysites.belorechensk,/teams/DSC180A_FA20_A00/a04malware/random-apps...,random-apps


In [9]:
# Give each sample its own output directory and write its app list there
split_outputs = {
    'data/out/train-set/app_list.csv': training_sample,
    'data/out/test-set/app_list.csv': testing_sample,
}
for csv_path, sample in split_outputs.items():
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    sample.to_csv(csv_path)

Now we must train a model on the training set. To do that we must run the ETL pipeline on that directory. Therefore we set `config/etl-params/etl-params.json` as shown below and then execute `python run.py data`. *This may take a few hours to run, especially the random walks!*

```json
{
    "outfolder": "data/out/train-set",
    "parse_params": {
        "nprocs": 16
    },
    "feature_params": {
        "redo": false,
        "walk_args": {
            "nprocs": 16,
            "length": 60,
            "n": 3,
            "metapaths": [
                ["app", "api", "app"],
                ["app", "api", "method", "api", "app"],
                ["app", "api", "package", "api", "app"],
                ["app", "api", "package", "api", "method", "api", "app"],
                ["app", "api", "method", "api", "package", "api", "app"]
            ]
        },
        "w2v_args": {
            "size": 128,
            "window": 7,
            "min_count": 0,
            "negative": 5,
            "sg": 1,
            "workers": 16,
            "iter": 5
        }
    },
    "hindroid_params": {
        "redo": false
    }
}
```

In [11]:
# Run the project's `data` target in a shell and time it.
# Per the text above, it reads its settings from config/etl-params/etl-params.json.
%time !python run.py data

2021-03-02 21:16:01.889695: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-03-02 21:16:01.889761: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-03-02 21:16:03.620259: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-03-02 21:16:03.620431: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-03-02 21:16:03.657845: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:8a:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidt

In [8]:
%time make_models('data/out/train-half/')

Fitting models:


  0%|          | 0/31 [00:00<?, ?it/s]

	Fitting AAT model...


100%|██████████| 31/31 [00:29<00:00,  1.06it/s]
  0%|          | 0/31 [00:00<?, ?it/s]

	Fitting ABAT model...


100%|██████████| 31/31 [14:33<00:00, 28.17s/it]
  0%|          | 0/31 [00:00<?, ?it/s]

	Fitting APAT model...


100%|██████████| 31/31 [01:10<00:00,  2.28s/it]
  0%|          | 0/31 [00:00<?, ?it/s]

	Fitting ABPBTAT model...


100%|██████████| 31/31 [2:28:04<00:00, 286.59s/it]  
  0%|          | 0/31 [00:00<?, ?it/s]

	Fitting APBPTAT model...


100%|██████████| 31/31 [1:39:07<00:00, 191.84s/it]


              acc    recall        f1
kernel                               
AAT      1.000000  1.000000  1.000000
ABAT     0.991107  0.996737  0.995113
APAT     1.000000  1.000000  1.000000
ABPBTAT  1.000000  1.000000  1.000000
APBPTAT  0.974638  0.989848  0.986094
CPU times: user 4h 9min 28s, sys: 14min 44s, total: 4h 24min 13s
Wall time: 4h 24min 11s


From here, we can create the models we will use. Note that we included `"hindroid_params"` in the config file, so a Hindroid model was also fitted on the data. We will also describe how to use that class, although both models are used in largely the same way.

In [2]:
# Instantiate M2VDroid on the training directory with the chosen classifier arguments,
# then show the name the model derived for itself
m2vDroid = M2VDroid(
    'data/out/train-set/',
    classifier_args=dict(max_depth=4, n_jobs=-1),
)
m2vDroid.name

'train-set'

In [8]:
# Fit the model and predict on the test-set directory.
# This also saves the output table to a folder.
m2vDroid.fit_predict('data/out/test-set/')

Computing new edges
<stellargraph.core.graph.StellarGraph object at 0x7f3f87a43c40>
Running random walk
Running Word2Vec
Fitting model
              precision    recall  f1-score   support

           0       0.99      0.46      0.62       371
           1       0.95      1.00      0.97      3677

    accuracy                           0.95      4048
   macro avg       0.97      0.73      0.80      4048
weighted avg       0.95      0.95      0.94      4048



Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,m2vDroid,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00268453be254779f0c7590de47db944,-0.449032,-1.202108,0.185910,-0.418299,0.197234,-0.032786,0.613179,0.312402,1.422977,-0.223549,...,-0.557727,-1.199390,0.237443,-1.451841,1.073622,-0.574838,0.175047,-0.162512,1,1
002a7270ec52ec68ea3d979c85261308,-0.084931,0.368828,0.375811,0.661619,-0.839094,-0.181433,-0.071430,-0.349385,-0.236794,0.133636,...,-0.484232,-0.394212,0.019846,-0.543612,-0.315674,-0.232581,0.661621,0.060567,1,1
0030e0003b7226e9142683e49b41a423,-0.306066,-0.173652,0.162710,0.335997,-0.595538,0.178637,0.320537,-0.079408,0.039939,-0.155147,...,-0.180247,-0.208453,0.366364,-0.232035,-0.093705,-0.380263,0.103787,-0.079222,1,1
00335946abb79777f9fe2d0d96651e03,-0.009991,-0.058666,0.460960,0.193677,-0.463901,0.467588,-0.112754,-0.119379,-0.077636,0.003909,...,-0.157391,-0.424092,0.012341,-0.152814,-0.294870,-0.233297,0.047864,0.281478,1,1
0038be31cfed95e13a33d87142eada70,-0.630949,-0.440778,0.604206,0.669105,-0.655404,0.723882,0.139422,0.250296,-0.206485,-0.573729,...,0.289600,-0.139725,0.364534,-0.245115,-0.176167,-0.298281,-0.064914,0.179287,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sts.al,0.506667,1.068946,0.662136,-0.435601,-0.772628,-0.640351,-1.167849,-0.179379,0.524084,0.052810,...,-0.067021,-0.169583,-0.728771,-0.780243,-0.442614,0.367091,0.368531,0.507996,0,0
taha.islam.renewal,-0.285983,0.002031,0.210271,0.330722,-0.141879,1.251570,-0.249743,0.435072,0.036629,0.001042,...,0.424430,-0.120964,0.638874,0.292902,-0.041647,-0.279142,-0.014098,-0.191005,1,0
toolbox.m.incoming.stop,-0.390785,-0.060382,0.877227,0.972244,-0.525150,0.317062,-0.876197,-0.210319,0.520908,0.025187,...,-1.273346,-0.143025,-0.133812,0.456153,0.085507,0.146431,0.255563,-1.298162,0,0
ua.com.citysites.belorechensk,-0.773604,0.247088,-0.386567,0.765891,-0.562402,1.008386,-0.576490,-0.002315,0.222517,0.223049,...,0.236988,-0.269480,-1.312227,0.154840,0.035490,-1.143552,0.955441,0.136958,1,0


In [3]:
# Build the Hindroid model from the same training directory and time its
# predictions on the test set (this takes several hours; see the recorded output).
hindroid = Hindroid('data/out/train-set/')
%time hindroid.fit_predict('data/out/test-set/')
# Disabled alternative, kept for reference: hindroid.fit_predict('data/out/test-sample/')

Computing unique APIs per app


Building A-test matrix: 100%|██████████| 4048/4048 [01:11<00:00, 56.36it/s] 


Making predictions


Predicting AAT, batch:   0%|          | 0/41 [00:00<?, ?it/s]




Predicting AAT, batch: 100%|██████████| 41/41 [00:28<00:00,  1.44it/s]
Predicting ABAT, batch:   0%|          | 0/41 [00:00<?, ?it/s]




Predicting ABAT, batch: 100%|██████████| 41/41 [14:42<00:00, 21.53s/it]
Predicting APAT, batch:   0%|          | 0/41 [00:00<?, ?it/s]




Predicting APAT, batch: 100%|██████████| 41/41 [01:37<00:00,  2.38s/it]
Predicting ABPBTAT, batch:   0%|          | 0/41 [00:00<?, ?it/s]




Predicting ABPBTAT, batch: 100%|██████████| 41/41 [2:36:06<00:00, 228.45s/it]  
Predicting APBPTAT, batch:   0%|          | 0/41 [00:00<?, ?it/s]




Predicting APBPTAT, batch: 100%|██████████| 41/41 [1:25:06<00:00, 124.55s/it]


AAT:
              precision    recall  f1-score   support

           0       0.99      0.85      0.92       371
           1       0.99      1.00      0.99      3677

    accuracy                           0.99      4048
   macro avg       0.99      0.93      0.95      4048
weighted avg       0.99      0.99      0.99      4048

ABAT:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       371
           1       0.98      0.99      0.99      3677

    accuracy                           0.98      4048
   macro avg       0.94      0.91      0.93      4048
weighted avg       0.98      0.98      0.98      4048

APAT:
              precision    recall  f1-score   support

           0       0.98      0.79      0.88       371
           1       0.98      1.00      0.99      3677

    accuracy                           0.98      4048
   macro avg       0.98      0.90      0.93      4048
weighted avg       0.98      0.98      0.98      4048

AB

Unnamed: 0_level_0,AAT,ABAT,APAT,ABPBTAT,APBPTAT,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00268453be254779f0c7590de47db944,1,1,1,1,1,1
002a7270ec52ec68ea3d979c85261308,1,1,1,1,1,1
0030e0003b7226e9142683e49b41a423,1,1,1,1,1,1
00335946abb79777f9fe2d0d96651e03,1,1,1,1,1,1
0038be31cfed95e13a33d87142eada70,1,1,1,1,1,1
...,...,...,...,...,...,...
sts.al,0,0,0,0,0,0
taha.islam.renewal,0,1,1,1,1,0
toolbox.m.incoming.stop,0,0,0,0,0,0
ua.com.citysites.belorechensk,1,1,1,1,1,0


Now that the code has finished running, let us view the results.

In [3]:
# Re-create the Hindroid model from the training directory and time its
# predictions over every app in `all-apps`.
# NOTE(review): `all-apps` includes the apps sampled into the training set, so the
# metrics reported by this cell are not held-out metrics.
hindroid = Hindroid('data/out/train-set/')
%time hindroid.fit_predict('data/out/all-apps/')
# Disabled alternative, kept for reference: hindroid.fit_predict('data/out/test-sample/')

Computing unique APIs per app


Building A-test matrix: 100%|██████████| 6421/6421 [03:13<00:00, 33.24it/s]  


Making predictions


Predicting AAT, batch:   0%|          | 0/65 [00:00<?, ?it/s]




Predicting AAT, batch: 100%|██████████| 65/65 [00:48<00:00,  1.33it/s]
Predicting ABAT, batch:   0%|          | 0/65 [00:00<?, ?it/s]




Predicting ABAT, batch: 100%|██████████| 65/65 [20:23<00:00, 18.82s/it]
Predicting APAT, batch:   0%|          | 0/65 [00:00<?, ?it/s]




Predicting APAT, batch: 100%|██████████| 65/65 [02:07<00:00,  1.96s/it]
Predicting ABPBTAT, batch:   0%|          | 0/65 [00:00<?, ?it/s]




Predicting ABPBTAT, batch: 100%|██████████| 65/65 [3:28:29<00:00, 192.45s/it]  
Predicting APBPTAT, batch:   0%|          | 0/65 [00:00<?, ?it/s]




Predicting APBPTAT, batch: 100%|██████████| 65/65 [2:34:52<00:00, 142.96s/it]  


AAT:
              precision    recall  f1-score   support

           0       1.00      0.84      0.91       905
           1       0.97      1.00      0.99      5516

    accuracy                           0.98      6421
   macro avg       0.99      0.92      0.95      6421
weighted avg       0.98      0.98      0.98      6421

ABAT:
              precision    recall  f1-score   support

           0       0.94      0.80      0.86       905
           1       0.97      0.99      0.98      5516

    accuracy                           0.96      6421
   macro avg       0.95      0.90      0.92      6421
weighted avg       0.96      0.96      0.96      6421

APAT:
              precision    recall  f1-score   support

           0       0.99      0.82      0.90       905
           1       0.97      1.00      0.98      5516

    accuracy                           0.97      6421
   macro avg       0.98      0.91      0.94      6421
weighted avg       0.97      0.97      0.97      6421

AB

Unnamed: 0_level_0,AAT,ABAT,APAT,ABPBTAT,APBPTAT,true
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
com.kaktus.hyungkaktus,0,0,0,0,0,0
com.wedup.duduamzaleg,0,0,0,0,0,0
com.dublin_mobile123.cheat_gta_5,0,0,0,0,0,0
com.appall.optimizationbox,1,0,1,1,1,0
live.wallpaper.t910001560,0,1,1,0,1,0
...,...,...,...,...,...,...
iseed.parumo1,1,1,1,1,1,0
com.dailyblah.rajanikaparthy.talkingtimer,0,0,0,0,0,0
com.cakrabuana.lagunaffpopuler,0,0,0,0,0,0
com.odev.talkingtom,0,0,0,0,0,0


In [16]:
# Combine the saved test-set predictions of both models into one performance table.
# The third path is presumably where the table is written -- confirm against
# create_performance_table.
m2v_predictions_csv = 'data/out/test-set/m2v-train-set/predictions.csv'
hindroid_predictions_csv = 'data/out/test-set/hindroid-train-set/predictions.csv'
performance_chart_csv = 'reports/assets/baseline_performance_chart.csv'
create_performance_table(m2v_predictions_csv, hindroid_predictions_csv, performance_chart_csv)

m2vDroid    int64
AAT         int64
ABAT        int64
APAT        int64
ABPBTAT     int64
APBPTAT     int64
true        int64
dtype: object


Unnamed: 0,ACC,TPR,F1,TP,TN,FP,FN
m2vDroid,0.949852,0.999728,0.97313,3676,169,202,1
AAT,0.985672,0.999184,0.992169,3674,316,55,3
ABAT,0.976285,0.990481,0.986992,3642,310,61,35
APAT,0.979249,0.998096,0.988685,3670,294,77,7
ABPBTAT,0.986166,0.99864,0.992432,3672,320,51,5
APBPTAT,0.975791,0.991841,0.986742,3647,303,68,30
