In [1]:
import os
# Move the working directory up one level from where the notebook was started.
# NOTE(review): presumably this makes the repository root the cwd so that
# `src.utils` and the relative `data/out/...` paths used below resolve — confirm.
os.chdir("..")
!pwd

/datasets/home/home-00/10/410/rcgonzal/DSC180Malware/m2v-adversarial-hindroid


TODO: the code in the following cell should be moved into a module named `app_parser.py`.

In [2]:
import sys
import os


import pandas as pd
import numpy as np
from pathlib import Path
import re
from src.utils import find_apps
from p_tqdm import p_umap

# Column names, in order, for each row that Application.parse_smali appends to
# API_data and for the DataFrame that Application.parse returns.
API_DATA_COLUMNS =  ["app", "api", "invoke_type", "class", "method", "package", "context"]
# Regex removed from an API call string to obtain its package: it matches from the
# "/" in front of the class name through the end of the string, e.g.
# "Ljava/lang/Object;-><init>()V" -> "Ljava/lang".
PACKAGE_CLEANING_PATTERN = r"\/[^/]+;->.*"

class Application():
    """
    Represents one decompiled application/APK directory and extracts the API
    calls (``invoke-*`` instructions) found in its smali files.
    """
    # Regex for a smali class descriptor such as ``Ljava/lang/Object;``.
    # Not referenced by the methods of this class.
    smali_class_pattern = r"L[\w/]*;"
    # Regex for an ``invoke-*`` instruction through the end of the line.
    # Not referenced by the methods of this class.
    API_call_pattern = r"invoke-.*"

    def extract_app_name(self):
        """Return the last path component of ``self.app_dir``.

        Trailing path separators are stripped first, so ``"apps/foo/"`` yields
        ``"foo"`` rather than the empty string ``os.path.basename`` would
        return for it.
        """
        separators = os.sep + (os.altsep or "")
        return os.path.basename(os.fspath(self.app_dir).rstrip(separators))

    def __init__(self, app_dir):
        """
        app_dir: str or path-like, directory containing the app's smali files
        """
        self.app_dir = app_dir
        self.app_name = self.extract_app_name()
        # List of rows (one per API call) once parsing has started, else None.
        self.API_data = None
        # Distinct API call strings seen while parsing.
        self.apis = set()
        # Paths of the smali files found by find_smali_filepaths().
        self.smali_list = []
        # Number of ``.method`` directives seen while parsing.
        self.num_methods = 0

    def find_smali_filepaths(self):
        """
        Retrieves a list of paths to all smali files in the app directory.
        Records paths in self.smali_list and returns them.
        """
        # Rebuild from scratch so repeated calls do not accumulate duplicates.
        self.smali_list = [
            os.path.join(current_dir, filename)
            for current_dir, _, files in os.walk(self.app_dir)
            for filename in files
            if filename.endswith('.smali')
        ]
        return self.smali_list

    def parse_smali(self, filepath):
        """Parses a singular smali file.

        Appends one row per ``invoke-*`` instruction to ``self.API_data`` (in
        API_DATA_COLUMNS order), adds each called API to ``self.apis`` and
        counts ``.method`` directives in ``self.num_methods``.

        filepath: str, path to smali file
        """
        # Allow calling this directly, without going through parse() first.
        if self.API_data is None:
            self.API_data = []

        # Undecodable bytes are replaced instead of aborting the whole app.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
            lines = file.readlines()

        if not lines:
            return

        # The class name is the last token of the first line; a blank first
        # line yields an empty class name instead of an IndexError.
        header_tokens = lines[0].split()
        current_class = header_tokens[-1] if header_tokens else ""

        current_method = ""
        for line in lines[1:]:
            context = line.strip()
            # Only lines that *start* with the directive/opcode count, so a
            # string literal that merely contains ".method" or "invoke-" is
            # not mistaken for a method header or an API call.
            if context.startswith(".method"):
                current_method = current_class + "->" + context.split()[-1]
                self.num_methods += 1
            elif context.startswith("invoke-"):
                tokens = context.split()
                invoke_type = (
                    tokens[0]
                    .split("-")[-1]  # remove "invoke"
                    .split("/")[0]   # remove "/range"
                )
                api_call = tokens[-1]
                self.apis.add(api_call)
                package = re.sub(PACKAGE_CLEANING_PATTERN, "", api_call)

                self.API_data.append([
                    self.app_name, api_call, invoke_type, current_class,
                    current_method, package, context,
                ])

    def parse(self):
        """
        Parses all smali files within the app.

        Returns a DataFrame with API_DATA_COLUMNS columns, one row per API call.
        """
        # Reset every accumulator so calling parse() twice does not
        # double-count methods or keep results from an earlier run.
        self.API_data = []
        self.apis = set()
        self.num_methods = 0

        for file_path in self.find_smali_filepaths():
            self.parse_smali(file_path)

        return pd.DataFrame(self.API_data, columns=API_DATA_COLUMNS)

            
def get_data(outfolder, data_source=None, nprocs=2, trained_source=None, recompute=False):
    '''
    Parse every listed app into a per-app CSV of API calls.

    The list of apps is read from `<outfolder>/app_data/app_list.csv`. When
    that file does not exist it is built with `find_apps(data_source)` and
    saved there. Each app is then parsed with `Application.parse()` and the
    result written to `<outfolder>/<app_name>.csv`. Nothing is returned.

    outfolder: str, directory that receives the app list and per-app CSVs
        (created if missing)
    data_source: passed to `find_apps`, used only when no app list exists yet
    nprocs: int, forwarded to `p_umap` as `num_cpus`
    trained_source: not used by this function
    recompute: bool, when False an app whose CSV already exists is skipped
    '''
    # Make sure the output locations exist before reading or writing anything.
    app_data_dir = os.path.join(outfolder, 'app_data')
    os.makedirs(outfolder, exist_ok=True)
    os.makedirs(app_data_dir, exist_ok=True)
    app_list_csv = os.path.join(app_data_dir, 'app_list.csv')

    try:
        # Prefer a previously saved list of apps.
        apps_df = pd.read_csv(app_list_csv)
    except FileNotFoundError:
        # None saved yet: discover the apps and remember them for next time.
        apps_df = find_apps(data_source)
        apps_df.to_csv(app_list_csv)

    def parse_app(app_dir, outfolder):
        # Worker for one app: write its API-call table unless a cached CSV
        # exists and recomputation was not requested.
        app = Application(app_dir)
        outpath = os.path.join(outfolder, app.app_name+".csv")
        already_parsed = os.path.exists(outpath)
        if recompute or not already_parsed:
            app.parse().to_csv(outpath, index=False)

    print("STEP 1 - PARSING APPS")
    # Parse the apps concurrently; every worker receives the same outfolder.
    app_dirs = apps_df.app_dir
    p_umap(parse_app,
           app_dirs,
           [outfolder]*len(app_dirs),
           num_cpus=nprocs)

In [20]:
# Parse the random-apps set. data_source is only consulted when
# data/out/random-apps/app_data/app_list.csv does not exist yet; recompute=True
# re-parses apps even if their CSV is already present.
get_data(outfolder='data/out/random-apps/', data_source="/teams/DSC180A_FA20_A00/a04malware/random-apps/", nprocs=16, recompute=True)

STEP 1 - PARSING APPS


HBox(children=(FloatProgress(value=0.0, max=232.0), HTML(value='')))




In [3]:
# No data_source is passed, so this relies on
# data/out/test-sample/app_data/app_list.csv already existing.
get_data(outfolder='data/out/test-sample/', nprocs=16, recompute=True)

STEP 1 - PARSING APPS


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [4]:
# No data_source is passed, so this relies on
# data/out/all_apps/app_data/app_list.csv already existing.
get_data(outfolder='data/out/all_apps/', nprocs=16, recompute=True)

STEP 1 - PARSING APPS


HBox(children=(FloatProgress(value=0.0, max=5487.0), HTML(value='')))




In [26]:
# Count the rows of all_apps.csv per value of its `category` column.
pd.read_csv("data/out/all_apps/all_apps.csv").category.value_counts()

malware         4931
popular-apps     324
random-apps      232
Name: category, dtype: int64

In [4]:
# Spot-check the parsed output for a single app.
pd.read_csv("data/out/random-apps/air.G2JWeirdKittyRescue.csv")

Unnamed: 0,app,api,invoke_type,class,method,package,context
0,air.G2JWeirdKittyRescue,Ljava/lang/Object;-><init>()V,direct,Lair/G2JWeirdKittyRescue/AppEntry$1;,Lair/G2JWeirdKittyRescue/AppEntry$1;-><init>(L...,Ljava/lang,"invoke-direct {p0}, Ljava/lang/Object;-><i..."
1,air.G2JWeirdKittyRescue,Lair/G2JWeirdKittyRescue/AppEntry;->access$000...,static,Lair/G2JWeirdKittyRescue/AppEntry$1;,Lair/G2JWeirdKittyRescue/AppEntry$1;->onClick(...,Lair/G2JWeirdKittyRescue,"invoke-static {v0}, Lair/G2JWeirdKittyResc..."
2,air.G2JWeirdKittyRescue,Lair/G2JWeirdKittyRescue/AppEntry;->access$100...,static,Lair/G2JWeirdKittyRescue/AppEntry$1;,Lair/G2JWeirdKittyRescue/AppEntry$1;->onClick(...,Lair/G2JWeirdKittyRescue,"invoke-static {}, Lair/G2JWeirdKittyRescue..."
3,air.G2JWeirdKittyRescue,Lair/G2JWeirdKittyRescue/AppEntry;->access$200...,static,Lair/G2JWeirdKittyRescue/AppEntry$1;,Lair/G2JWeirdKittyRescue/AppEntry$1;->onClick(...,Lair/G2JWeirdKittyRescue,"invoke-static {}, Lair/G2JWeirdKittyRescue..."
4,air.G2JWeirdKittyRescue,Lcom/adobe/air/InstallOfferPingUtils;->PingAnd...,static,Lair/G2JWeirdKittyRescue/AppEntry$1;,Lair/G2JWeirdKittyRescue/AppEntry$1;->onClick(...,Lcom/adobe/air,"invoke-static {v0, v1, v3, v2, v3}, Lcom/a..."
...,...,...,...,...,...,...,...
66395,air.G2JWeirdKittyRescue,Ljava/lang/StringBuilder;->append(I)Ljava/lang...,virtual,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Ljava/lang,"invoke-virtual {v3, v4}, Ljava/lang/String..."
66396,air.G2JWeirdKittyRescue,Ljava/lang/StringBuilder;->toString()Ljava/lan...,virtual,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Ljava/lang,"invoke-virtual {v3}, Ljava/lang/StringBuil..."
66397,air.G2JWeirdKittyRescue,Lcom/adobe/fre/FREContext;->dispatchStatusEven...,virtual,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lcom/adobe/fre,"invoke-virtual {v1, v2, v3}, Lcom/adobe/fr..."
66398,air.G2JWeirdKittyRescue,Lcom/google/android/gms/ads/AdListener;->onAdO...,super,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lso/cuo/platform/admob/ClassicBannerHandler$Ba...,Lcom/google/android/gms/ads,"invoke-super {p0}, Lcom/google/android/gms..."


In [8]:
from dask.distributed import Client
import dask.dataframe as dd

In [10]:
# Preview the glob pattern that selects every CSV under data/out/random-apps/.
os.path.join('data/out/random-apps/', '*.csv')

'data/out/random-apps/*.csv'

In [22]:
# Build one dask DataFrame over all CSV files matching the glob pattern.
path = os.path.join('data/out/random-apps/', '*.csv')
data = dd.read_csv(path)