# DSC180A Checkpoint #2

## Setup

In [1]:
import re
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## ETL & feature extraction

In [2]:
# Transform a smali file into a call-analysis table
def process_smali(sml):
    """Extract every matched `invoke-*` instruction from the text of one smali file.

    Parameters
    ----------
    sml : str
        Full text of a smali file.

    Returns
    -------
    pandas.DataFrame or int
        One row per matched invoke instruction with the columns
        `invoke_type`, `package_long`, `call`, `method`, `package` and
        `type`.  The row index restarts at 0 for every method.  Returns the
        sentinel -1 when no method, or no invoke instruction, is matched.
    """
    known_packages = ('android', 'androidx', 'google', 'java', 'javax', 'kotlin')

    # Each match is a (method name, method body) pair.
    methods = re.findall(r'method.* (\w+)[(].+[)].+;([\d\D]*?)\.end method', sml)
    if len(methods) == 0:
        return -1

    def first_segment(desc):
        # Drop the first character and keep the text before the first '/'.
        # When there is no '/', find() returns -1; slicing with it would
        # silently cut off the last character, so keep the whole remainder.
        cut = desc.find('/')
        return desc[1:cut] if cut != -1 else desc[1:]

    def process_method(x):
        # x is one (method name, method body) pair; None means "no calls found".
        res = pd.DataFrame(re.findall(r'invoke-(\w{5,9})\s.+}, (.*);->(.+)[(]', x[1]))
        if len(res) == 0:
            return None
        res.columns = ['invoke_type', 'package_long', 'call']
        res['method'] = x[0]
        res['package'] = res.package_long.apply(first_segment)
        res['type'] = res.package.apply(lambda pkg: pkg if pkg in known_packages else 'self')
        return res

    # Collect the per-method tables explicitly so that "nothing to concatenate"
    # is a plain check rather than an exception swallowed by a bare except.
    frames = [frame for frame in map(process_method, methods) if frame is not None]
    if not frames:
        return -1
    return pd.concat(frames)

In [3]:
# Generate the chain: relative frequency of each call type
def generate_chain(df):
    """Return, for each distinct value of `df.type`, its count divided by the number of rows in `df`."""
    type_counts = df.type.value_counts()
    n_rows = len(df)
    return type_counts / n_rows

In [4]:
# Apply smali file analysis to all smali files in an apk
def process_apk(path, apk_name, max_files=85000):
    """Run `process_smali` on the `.smali` files under `path` and stack the results.

    Parameters
    ----------
    path : str
        Directory that is walked recursively (bottom-up) for `.smali` files.
    apk_name : str
        Value written to the `name` column of every returned row.
    max_files : int, optional
        Upper bound on the number of `.smali` files that are read.

    Returns
    -------
    pandas.DataFrame
        Concatenation (with a fresh integer index) of every table returned by
        `process_smali`, plus a `name` column.  Files for which
        `process_smali` returns its integer sentinel contribute no rows.
    """
    print(path)
    frames = []
    n_read = 0
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if not name.endswith('.smali'):
                continue
            if n_read >= max_files:
                break
            # The context manager closes the file even if parsing raises.
            with open(os.path.join(root, name)) as f:
                sml_df = process_smali(f.read())
            if not isinstance(sml_df, int):
                frames.append(sml_df)
            n_read += 1
        if n_read >= max_files:
            # Cap reached: stop walking instead of visiting every remaining directory.
            break
    # Concatenate once at the end rather than growing a frame inside the loop.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df['name'] = apk_name
    return df

In [5]:
# Parse all training data in directory which contains all malware or benign-ware
def _apk_targets(path, is_mal):
    """Yield (apk directory, label) pairs found under `path`."""
    for d in os.listdir(path):
        d_path = path + '/' + d
        if not is_mal:
            yield d_path, d
            continue
        # Malware layout is <path>/<family>/<variety>; skip stray non-directories.
        if not os.path.isdir(d_path):
            continue
        for v in os.listdir(d_path):
            yield d_path + '/' + v, d + ' ' + v


def parse_all(path, is_mal):
    """Run `process_apk` on every app under `path` and stack the results.

    Parameters
    ----------
    path : str
        Root directory.  When `is_mal` is truthy each entry is treated as a
        directory whose children are the apps, labelled "<entry> <child>";
        otherwise each entry is itself one app, labelled by its name.
    is_mal : bool or int
        Selects the directory layout and the checkpoint file
        ('fin.csv' when truthy, 'benigns.csv' otherwise).

    Returns
    -------
    pandas.DataFrame
        All per-app tables concatenated with a fresh integer index.

    The accumulated table is rewritten to the checkpoint file after every
    app.  An app whose processing raises is reported and skipped.
    """
    out_csv = 'fin.csv' if is_mal else 'benigns.csv'
    fin = pd.DataFrame()
    for apk_path, label in _apk_targets(path, is_mal):
        try:
            df = process_apk(apk_path, label)
            fin = pd.concat([fin, df], ignore_index=True)
            fin.to_csv(out_csv)
            print('finish {}'.format(label))
        except Exception as err:
            # Best effort: keep going, but say which app was dropped and why.
            print('failed {}: {!r}'.format(label, err))
    # The original returned the undefined name `dfs`, which raised NameError.
    return fin

In [6]:
def parse_pp(wares, is_mal, path='/teams/DSC180A_FA20_A00/a04malware/malware'):
    """Run `process_apk` on every child directory of the selected entries.

    Parameters
    ----------
    wares : iterable of str
        Names of directories directly under `path`; each child of such a
        directory is processed as one app labelled "<entry> <child>".
    is_mal : bool or int
        Nothing is processed when this is falsy.
    path : str, optional
        Root directory that contains the entries of `wares`.

    Returns
    -------
    pandas.DataFrame
        All per-app tables concatenated with a fresh integer index (empty
        when `is_mal` is falsy).  The accumulated table is also rewritten to
        'ff4.csv' after every app.
    """
    fin = pd.DataFrame()
    if not is_mal:
        return fin
    for d in wares:
        d_path = path + '/' + d
        for v in os.listdir(d_path):
            v_path = d_path + '/' + v
            label = d + ' ' + v
            try:
                df = process_apk(v_path, label)
                fin = pd.concat([fin, df], ignore_index=True)
                fin.to_csv('ff4.csv')
                print('finish {}'.format(label))
            except Exception as err:
                # Best effort: keep going, but say which app was dropped and why.
                print('failed {}: {!r}'.format(label, err))
    # Previously the accumulated table was built and then discarded.
    return fin

## Data Parsing

In [30]:
# Peek at (at most) the first 200 entries of the popular-apps directory.
os.listdir('/teams/DSC180A_FA20_A00/a04malware/popular-apps')[:200]

['net.updategames.granny',
 'com.gameloft.android.ANMP.GloftA8HM',
 'com.devsisters.gb',
 'com.huobi.cn',
 'com.gretech.gomplayerko',
 'com.buffstudio.sevendays_free',
 'com.Fren2y.FlyBattle',
 'tasty.buzzfeed.japan.recipes.cooking',
 'com.celiniumapps.gotseven',
 'com.cyberbots.roulette',
 'com.sega.comixzone',
 'com.MIPL.ShreeAnnapurnaTiffin',
 'kr.ac.mokwon',
 'com.facebook.katana',
 'com.gasterus.polsampaifinish',
 'com.OsamahAsad.TastiestChineseCuisine',
 'com.aircrunch.shopalerts',
 'com.han.dominoes',
 'ru.omdevelopment.ref.qurangerman.free',
 'com.microsoft.todos',
 'com.tabkeey.taapseewallpaper',
 'com.framingitup.app',
 'com.roboxy.muruganclocklivewallpaper',
 'com.nimblebit.bitcity',
 'com.iedutainments.free.no',
 'com.appbuilder.u1180923p1557621',
 'jp.co.cheep.markun.settings',
 'com.baseballguideandtips',
 'com.solosalon.servicer',
 'net.minhawebradio.joelcecilio',
 'com.luc.tetvn',
 'com.AtPlayMusic.NoteRacerTrombone',
 'com.sega.kidcham',
 'com.HarauStudio.EXOSong',
 'c

In [12]:
# Load the benign call table and discard the 'Unnamed: 0' column in a single chain.
features_benign = pd.read_csv('benigns.csv').drop('Unnamed: 0', axis=1)

In [15]:
# Number of distinct values in the `name` column of the benign table.
len(features_benign.name.unique())

143

In [11]:
# Load the malware call table, then discard the 'Unnamed: 0' column if present.
# The drop used to sit in a cell placed before the read_csv, so a fresh
# "Restart & Run All" failed with NameError on features_mal.
features_mal = pd.read_csv('malware_ma.csv')
features_mal = features_mal.drop(columns='Unnamed: 0', errors='ignore')

In [27]:
# Display the per-app call-type counts for the malware set.
# NOTE(review): chains_mal is assigned in a later cell of this notebook, so
# this display only works once that cell has been run.
chains_mal

Unnamed: 0_level_0,android,androidx,java,javax,kotlin,self
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Andup variety1,49420.0,0.0,119366.0,1028.0,0.0,69666.0
BankBot variety1,2636.0,0.0,1588.0,0.0,0.0,36.0
BankBot variety2,69411.0,0.0,48170.0,0.0,0.0,2267.0
BankBot variety3,125295.0,0.0,57518.0,729.0,0.0,21401.0
BankBot variety4,4083.0,0.0,6694.0,0.0,0.0,4882.0
...,...,...,...,...,...,...
Univert variety1,8850.0,0.0,4880.0,0.0,0.0,250.0
Vidro variety1,3486.0,0.0,13587.0,65.0,0.0,7408.0
VikingHorde variety1,21344.0,0.0,27995.0,62.0,0.0,42099.0
Vmvol variety1,493.0,0.0,8759.0,0.0,0.0,10704.0


In [None]:
# Malware: parse every app under the malware root directory.
path_mal = '/teams/DSC180A_FA20_A00/a04malware/malware' # Path
# Stray characters after the call made this line a syntax error; removed.
dfs = parse_all(path_mal, 1)

In [None]:
# Safeware: parse every app under the popular-apps root directory.
path_saf = '/teams/DSC180A_FA20_A00/a04malware/popular-apps' # Root directory of the benign apps
features_saf = parse_all(path_saf, 0)

/teams/DSC180A_FA20_A00/a04malware/popular-apps/net.updategames.granny
finish net.updategames.granny
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.gameloft.android.ANMP.GloftA8HM
finish com.gameloft.android.ANMP.GloftA8HM
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.devsisters.gb
finish com.devsisters.gb
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.huobi.cn
finish com.huobi.cn
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.gretech.gomplayerko
finish com.gretech.gomplayerko
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.buffstudio.sevendays_free
finish com.buffstudio.sevendays_free
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.Fren2y.FlyBattle
finish com.Fren2y.FlyBattle
/teams/DSC180A_FA20_A00/a04malware/popular-apps/tasty.buzzfeed.japan.recipes.cooking
finish tasty.buzzfeed.japan.recipes.cooking
/teams/DSC180A_FA20_A00/a04malware/popular-apps/com.celiniumapps.gotseven
finish com.celiniumapps.gotseven
/teams/DSC180A_FA20_A00/a04malware/popular-apps/c

In [23]:
# Template holding a zero for each of the six call types used as feature columns.
# NOTE(review): process_smali can also label a call 'google', which is not
# listed here; confirm whether that omission is intended.
format_ser = pd.Series(data=[0] * 6, index=['self', 'java', 'android', 'kotlin', 'androidx', 'javax'])


def form(ser):
    """Merge `ser` with the zero template so every template label is present, using 0 where a value is missing."""
    merged = ser.combine(format_ser, max)
    return merged.fillna(0)


def proc(x):
    """Count the values of `x.type` and pad the counts with `form`."""
    type_counts = x.type.value_counts()
    return form(type_counts)

In [24]:
# One row of call-type counts per app name, labelled 1 for malware ...
chains_mal = features_mal.groupby('name').apply(proc)
chains_mal['is_mal'] = 1

# ... and 0 for benign apps.
chains_benign = features_benign.groupby('name').apply(proc)
chains_benign['is_mal'] = 0

# Stack both sets; the app names in the index are replaced by a fresh integer index.
chains = pd.concat([chains_mal, chains_benign], ignore_index=True)

## Model

In [64]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def ttsplit(X, y):
    """Split `X` and `y` with `train_test_split`, holding out 33% with a fixed seed of 42."""
    split_options = {'test_size': 0.33, 'random_state': 42}
    return train_test_split(X, y, **split_options)

def baseline_predict(X):
    """Label each row 1 when its `num_smali` value is at least 50000, otherwise 0."""
    def label_row(row):
        if row.num_smali >= 50000:
            return 1
        return 0

    return X.apply(label_row, axis=1)

def build_Log(X_train, y_train, C, max_it):
    """Fit and return a `LogisticRegression` (with intercept) using the given `C` and iteration cap."""
    model = LogisticRegression(fit_intercept=True, C=C, max_iter=max_it)
    return model.fit(X_train, y_train)

def build_Linear(X_train, y_train):
    """Fit and return a `LinearRegression` created with its default settings."""
    model = LinearRegression()
    return model.fit(X_train, y_train)

def build_KN(X_train, y_train, n):
    """Fit and return a `KNeighborsClassifier` that uses `n` neighbours.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to `fit`.
    n : int
        Value for the classifier's `n_neighbors` argument.
    """
    # KNeighborsClassifier is not imported anywhere else in this notebook, so
    # calling this function used to raise NameError; import it where it is used.
    from sklearn.neighbors import KNeighborsClassifier

    return KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

def accuracy(y_train, y_test, y_predict_train, y_predict_test):
    """Print the `accuracy_score` of the train and test predictions; returns nothing."""
    train_acc = accuracy_score(y_train, y_predict_train)
    test_acc = accuracy_score(y_test, y_predict_test)
    print('train Accuracy = {}, test Accuracy = {}'.format(train_acc, test_acc))

def mse(y_train, y_test, y_predict_train, y_predict_test):
    """Print the train and test `mean_squared_error` and return the test value."""
    train_err = mean_squared_error(y_train, y_predict_train)
    test_err = mean_squared_error(y_test, y_predict_test)
    print('train MSE = {}, test MSE = {}'.format(train_err, test_err))
    return test_err


In [None]:
# Features are every column except the label; the label is the malware flag.
X = chains.drop('is_mal', axis=1)
y = chains.is_mal
X_train, X_test, y_train, y_test = ttsplit(X,y)

# Logistic regression.
reg_log = build_Log(X_train, y_train, 0.1, 1000)
pred_train_log = reg_log.predict(X_train)
pred_test_log = reg_log.predict(X_test)

# Linear regression: build_Linear takes only the training data.  Its output
# is continuous, so threshold it at 0.5 to obtain 0/1 labels that can be
# compared with y by a classification metric.
reg_lin = build_Linear(X_train, y_train)
pred_train_lin = (reg_lin.predict(X_train) >= 0.5).astype(int)
pred_test_lin = (reg_lin.predict(X_test) >= 0.5).astype(int)

# k-nearest neighbours: build_KN takes a single integer neighbour count
# (5 chosen here as a starting point; tune as needed).
reg_knn = build_KN(X_train, y_train, 5)
pred_train_kn = reg_knn.predict(X_train)
pred_test_kn = reg_knn.predict(X_test)

In [None]:
# Print train/test accuracy for the logistic, linear and k-NN predictions, in that order.
accuracy(y_train, y_test, pred_train_log, pred_test_log)
accuracy(y_train, y_test, pred_train_lin, pred_test_lin)
accuracy(y_train, y_test, pred_train_kn, pred_test_kn)

In [None]:
# Confusion-matrix counts on the held-out split.  `pred_test` was not defined
# at this point, so use the logistic-regression test predictions explicitly.
# NOTE(review): swap in pred_test_lin / pred_test_kn to inspect another model.
tn, fp, fn, tp = confusion_matrix(y_test, pred_test_log).ravel()
(tn, fp, fn, tp)

In [None]:
# The right-hand sides were left blank (a syntax error).  Point the generic
# names at the logistic-regression predictions.
# NOTE(review): confirm this is the model meant to be analysed further.
pred_train = pred_train_log
pred_test = pred_test_log

## EDA

In [None]:
# `reg_##` turned the rest of the line into a comment and left the undefined
# name `reg_`.  Use the fitted logistic-regression model.
# NOTE(review): confirm which model the error analysis should be based on.
chains['pred'] = reg_log.predict(X)
def cate(x):
    """Classify one row as 'TP', 'TN', 'FP' or 'FN'.

    Parameters
    ----------
    x : row with `is_mal` (true label) and `pred` (predicted label)

    Returns
    -------
    str
        First letter 'T' when the prediction equals the label, else 'F';
        second letter 'P' when the prediction is 1, else 'N'.
    """
    correctness = 'T' if x.is_mal == x.pred else 'F'
    # The original read `x.res`, a column that is never created; the predicted
    # label is stored under `pred`.
    polarity = 'P' if x.pred == 1 else 'N'
    return correctness + polarity
# Tag every row with the two-letter code returned by cate.
chains['analysis'] = chains.apply(cate, axis=1)

In [None]:
# One sub-table per outcome code in the `analysis` column.
fp_df, fn_df, tn_df, tp_df = (
    chains.loc[chains.analysis == code] for code in ('FP', 'FN', 'TN', 'TP')
)

In [None]:
# Summary statistics of the rows tagged 'FP'.
fp_df.describe()

In [None]:
# Summary statistics of the rows tagged 'TN'.
tn_df.describe()

In [None]:
# Summary statistics of the rows tagged 'FN'.
fn_df.describe()

In [None]:
# Summary statistics of the rows tagged 'TP'.
tp_df.describe()

In [None]:
def advanced_predict(reg, x):
    """Predict one row with `reg`, then veto a positive when `android` is not above 0.4.

    Parameters
    ----------
    reg : fitted model exposing `predict`
    x : pandas.Series
        A single feature row; must contain an `android` entry.

    Returns
    -------
    int
        1 only when `reg` predicts a truthy label and `x.android > 0.4`,
        otherwise 0.
    """
    # predict() is given a one-row frame and returns one value per row, so
    # wrap the row and unwrap the single result before testing it.
    res = reg.predict(pd.DataFrame([x]))[0]
    # NOTE(review): the features built by proc are raw counts, so 0.4 acts as
    # "at least one android call"; confirm whether proportions were intended.
    if res and x.android > 0.4:
        return 1
    return 0

In [None]:
reg = build_Log(X_train, y_train, 0.1, 1000)
# Iterating a DataFrame directly yields its column labels, not its rows, so
# walk the rows explicitly and hand each one to advanced_predict.
pred_train_adv = [advanced_predict(reg, row) for _, row in X_train.iterrows()]
pred_test_adv = [advanced_predict(reg, row) for _, row in X_test.iterrows()]

In [None]:
# Print train/test accuracy of the advanced predictions, then show the
# confusion-matrix counts for the test split.
accuracy(y_train, y_test, pred_train_adv, pred_test_adv)
tn, fp, fn, tp = confusion_matrix(y_test, pred_test_adv).ravel()
(tn, fp, fn, tp)