# **AVS AI Model training**

In [1]:
!ls

Antimalware_Android_ML.ipynb  constants.py	 main.py
apks			      csvs		 malware_2017_static
apktoimage.py		      gen_train_imgs.py  README.md
benign_2017_static	      images		 venv


Extract permissions from datasets

In [2]:
import os
import json

def remove_duplicates(src="permissions.txt", dst="all_permissions.txt"):
  """Copy ``src`` to ``dst``, keeping only the first occurrence of each line.

  Lines are compared verbatim (including their line terminator) and the
  order of first appearance is preserved.

  Args:
    src: path of the file to read. Defaults to ``permissions.txt``.
    dst: path of the file to (over)write. Defaults to ``all_permissions.txt``.

  Raises:
    OSError: if ``src`` cannot be read or ``dst`` cannot be written.
  """
  lines_seen = set() # holds lines already seen
  # Open the input first so a missing source does not truncate an existing
  # output, and use a context manager so both handles are always closed.
  with open(src, "r") as infile, open(dst, "w") as outfile:
    for line in infile:
      if line not in lines_seen: # not a duplicate
        outfile.write(line)
        lines_seen.add(line)

def extract_permissions(filepath):
  """Append the permissions of one APK report to ``permissions.txt``.

  Loads the JSON document at ``filepath``, reads its ``"permissions"`` list
  and appends, one per line, every entry that is not yet in
  ``permissions.txt``. A message is printed for each entry written.

  Args:
    filepath: path of the JSON report to read.

  Raises:
    KeyError: if the report has no ``"permissions"`` key.
  """
  with open(filepath) as f:
    data = json.load(f)
  permissions = data["permissions"]
  with open("permissions.txt","a+") as pfile:
    # "a+" positions the stream at the end; rewind to read what is there.
    pfile.seek(0)
    known = {line.rstrip("\n") for line in pfile.readlines()}
    for permission in permissions:
      if permission not in known:
        pfile.write(permission+"\n")
        # Remember the entry so a repeat inside the same report is skipped.
        known.add(permission)
        print("Written "+permission+" to file.")


# Collect the permissions of every report in both dataset folders.
paths = ["./benign_2017_static/ApkMetaReport/","./malware_2017_static/ApkMetaReport/"]
for path in paths:
  files = os.listdir(path)
  for file in files:
    print(file)
    filepath = path + file
    extract_permissions(filepath)
# De-duplicate once, after every folder has been processed, instead of
# rebuilding all_permissions.txt at the end of each folder.
remove_duplicates()
print("Permissions stored in all_permissions.txt")

0ee12e5f5bf2c416d0419ff4bff7313e68a8df8423ba4d30ecc7cf362f088504.json
Written android.permission.ACCESS_NETWORK_STATE to file.
Written android.permission.INTERNET to file.
Written android.permission.ACCESS_WIFI_STATE to file.
Written android.permission.WRITE_EXTERNAL_STORAGE to file.
Written android.permission.READ_EXTERNAL_STORAGE to file.
Written android.permission.CAMERA to file.
Written android.permission.VIBRATE to file.
Written android.permission.WAKE_LOCK to file.
Written com.android.vending.BILLING to file.
Written com.google.android.c2dm.permission.RECEIVE to file.
Written com.mirmay.privatedownloader.permission.C2D_MESSAGE to file.
Written android.permission.RECEIVE_BOOT_COMPLETED to file.
d735886e49adc88c173e22de4dc11ce475aab81bfb4e3e295219a53a95c757e9.json
Written com.fsp.android.phonetracker.permission.C2D_MESSAGE to file.
Written com.google.android.gms.permission.ACTIVITY_RECOGNITION to file.
Written android.permission.READ_CALENDAR to file.
Written android.permission.BLU

Extract intents from datasets

In [3]:
def remove_duplicates(src="intents.txt", dst="all_intents.txt"):
  """Copy ``src`` to ``dst``, keeping only the first occurrence of each line.

  Lines are compared verbatim (including their line terminator) and the
  order of first appearance is preserved.

  Args:
    src: path of the file to read. Defaults to ``intents.txt``.
    dst: path of the file to (over)write. Defaults to ``all_intents.txt``.

  Raises:
    OSError: if ``src`` cannot be read or ``dst`` cannot be written.
  """
  lines_seen = set() # holds lines already seen
  # Open the input first so a missing source does not truncate an existing
  # output, and use a context manager so both handles are always closed.
  with open(src, "r") as infile, open(dst, "w") as outfile:
    for line in infile:
      if line not in lines_seen: # not a duplicate
        outfile.write(line)
        lines_seen.add(line)

def extract_intents(filepath):
  """Append the intents of one APK report to ``intents.txt``.

  Loads the JSON document at ``filepath``, reads its ``"intents"`` list and
  appends, one per line, every entry that is not yet in ``intents.txt``.
  A message is printed for each entry written.

  Args:
    filepath: path of the JSON report to read.

  Raises:
    KeyError: if the report has no ``"intents"`` key.
  """
  with open(filepath) as f:
    data = json.load(f)
  intents = data["intents"]
  with open("intents.txt","a+") as ifile:
    # "a+" positions the stream at the end; rewind to read what is there.
    ifile.seek(0)
    known = {line.rstrip("\n") for line in ifile.readlines()}
    for intent in intents:
      if intent not in known:
        ifile.write(intent+"\n")
        # Remember the entry so a repeat inside the same report is skipped.
        known.add(intent)
        print("Written "+intent+" to file.")


# Walk both dataset folders, harvest the intents of every report, then
# de-duplicate the collected list once at the end.
paths = [
  "./benign_2017_static/ApkMetaReport/",
  "./malware_2017_static/ApkMetaReport/",
]
for path in paths:
  for file in os.listdir(path):
    print(file)
    extract_intents(path + file)
remove_duplicates()
print("Intents stored in all_intents.txt")

0ee12e5f5bf2c416d0419ff4bff7313e68a8df8423ba4d30ecc7cf362f088504.json
Written com.evernote.android.job.v14.RUN_JOB to file.
Written com.google.android.gms.analytics.ANALYTICS_DISPATCH to file.
Written com.google.android.c2dm.intent.REGISTRATION to file.
Written com.google.firebase.INSTANCE_ID_EVENT to file.
Written com.google.android.gms.measurement.UPLOAD to file.
Written android.intent.action.MAIN to file.
Written com.android.vending.INSTALL_REFERRER to file.
Written com.liquidum.hexlock_lockDpb to file.
Written android.intent.action.BOOT_COMPLETED to file.
Written com.google.android.c2dm.intent.RECEIVE to file.
Written com.google.firebase.MESSAGING_EVENT to file.
Written net.vrallev.android.job.v14.RUN_JOB to file.
Written android.net.conn.CONNECTIVITY_CHANGE to file.
Written com.google.android.gms.appinvite.ACTION_PREVIEW to file.
d735886e49adc88c173e22de4dc11ce475aab81bfb4e3e295219a53a95c757e9.json
Written com.fsp.android.phonetracker.MetricsApi.ACTION_METRIC_EVENT to file.
Writte

Write permissions and intents to constants.py

In [4]:
# Read the de-duplicated permission names, dropping line terminators and
# blank lines (the last line included).
with open("all_permissions.txt", "r") as pfile:
  data = [line.replace('\n', '') for line in pfile]
data = [p for p in data if p != ""]

# (Re)create constants.py with a PERMISSIONS tuple. Every entry is followed by
# a comma so that zero or one names still produce a tuple, and repr() is used
# so that names containing quotes or backslashes stay valid Python.
with open("constants.py","w") as cons:
  cons.write("PERMISSIONS=(\n")
  for p in data:
    cons.write(repr(p)+",\n")
  cons.write(")\n")

In [5]:
# Read the de-duplicated intent names, dropping line terminators and blank
# lines (the last line included).
with open("all_intents.txt", "r") as ifile:
  data = [line.replace('\n', '') for line in ifile]
data = [name for name in data if name != ""]

# Append an INTENTS tuple to constants.py. Every entry is followed by a comma
# so that zero or one names still produce a tuple, and repr() is used so that
# names containing quotes or backslashes stay valid Python.
with open("constants.py","a") as cons:
  cons.write("\n")
  cons.write("INTENTS=(\n")
  for name in data:
    cons.write(repr(name)+",\n")
  cons.write(")\n")

Select features

In [1]:
import os
import json
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from constants import PERMISSIONS
from constants import INTENTS


def load_json(fp):
  """Return the ``permissions`` and ``intents`` entries of a JSON report.

  Args:
    fp: path of the JSON file to read.

  Returns:
    A ``(permissions, intents)`` tuple taken from the document's top-level keys.
  """
  with open(fp) as report_file:
    report = json.load(report_file)
  return report["permissions"], report["intents"]

def get_feature_vector(apk):
  """Encode an APK record as a list of 0/1 flags.

  The result has one flag per entry of ``PERMISSIONS`` followed by one flag
  per entry of ``INTENTS``; a flag is 1 when that name is contained in
  ``apk['permissions']`` / ``apk['intents']`` respectively.
  """
  permission_flags = [1 if name in apk['permissions'] else 0 for name in PERMISSIONS]
  intent_flags = [1 if name in apk['intents'] else 0 for name in INTENTS]
  return permission_flags + intent_flags

def prepare_dataset():
  """Load every report of the benign and malware folders.

  Returns:
    A list with one dict per file, holding the report's ``'permissions'`` and
    ``'intents'`` plus a ``'Malicious'`` label: 0 for files from the benign
    folder (listed first) and 1 for files from the malware folder.
  """
  paths = ["./benign_2017_static/ApkMetaReport/","./malware_2017_static/ApkMetaReport/"]
  apks = []
  # The position of a folder in `paths` doubles as the class label.
  for label, folder in enumerate(paths):
    for name in os.listdir(folder):
      permissions, intents = load_json(folder + name)
      apks.append({'permissions': permissions, 'intents': intents, 'Malicious': label})
  return apks

def get_X_and_Y_matrices():
  """Build the feature matrix and label vector for the whole dataset.

  Calls ``prepare_dataset()`` and encodes every record with
  ``get_feature_vector``; progress messages are printed along the way.

  Returns:
    A ``(x, y)`` pair of numpy arrays: ``x`` holds one feature vector per
    record and ``y`` the matching ``'Malicious'`` labels, in dataset order.
  """
  print("Preparing dataset...")
  dataset = prepare_dataset()
  print("Dataset preparation completed.")
  print("Creating x and y matrices...")
  # Encode each record directly. Looking it up again via dataset.index(apk)
  # cost a linear scan per record and only ever returned an equal record.
  x = [get_feature_vector(apk) for apk in dataset]
  y = [apk['Malicious'] for apk in dataset]
  print("x and y matrices are created.")
  return np.array(x),np.array(y)

# Build the full feature matrix X and label vector Y from the dataset folders.
print("Fetching X and Y matrices...")
X, Y = get_X_and_Y_matrices()
print("X and Y matrices are fetched.")
print(len(Y))
# Width of one feature vector; not read again within this cell.
input_dim=len(X[0])
print("Feature selection")

# Score every column with f_classif and keep the k=2000 best-scoring ones.
# NOTE(review): k is hard-coded; presumably X must have at least 2000 columns
# for this to work — confirm against the generated constants.py.
test = SelectKBest(score_func=f_classif, k=2000)
# `fit` (the fitted selector) is reused by the following cells.
fit = test.fit(X, Y)
print(fit.scores_)

Fetching X and Y matrices...
Preparing dataset...
Dataset preparation completed.
Creating x and y matrices...


In [None]:
# Apply the fitted selector to X, keeping only the selected feature columns.
features = fit.transform(X)

In [None]:
# Two views of the selector's choice, both taken from the fitted selector:
# integer column positions (`indices`) and the default form returned by
# get_support() (`mask`).
indices = fit.get_support(True) # returns array of indices of selected features
mask = fit.get_support()

Write selected features into selected_features.py

In [None]:
from constants import PERMISSIONS
from constants import INTENTS
print(len(indices))
print(len(mask))
print(mask)

# A feature vector lists every permission first and every intent after, so a
# selected position below len(PERMISSIONS) is a permission and any other
# position is an intent (offset by len(PERMISSIONS)).
permission_count = len(PERMISSIONS)
selected_permissions = [PERMISSIONS[i] for i in indices if i < permission_count]
selected_intents = [INTENTS[i - permission_count] for i in indices if i >= permission_count]

# Positions inside `indices` of the last selected permission and the first
# selected intent; only used by the diagnostics below.
permissionslastindex = len(selected_permissions) - 1
intentstartindex = len(selected_permissions)

print("PERMISSIONS:"+str(len(PERMISSIONS)))
print("INTENTS:"+str(len(INTENTS)))
print("permissionslastindex:"+str(permissionslastindex))
if selected_permissions:
  print("indices[permissionslastindex]:"+str(indices[permissionslastindex]))
print("intentstartindex:"+str(intentstartindex))
if selected_intents:
  print("indices[intentstartindex]:"+str(indices[intentstartindex]))

# Every entry is followed by a comma so that zero or one names still produce a
# tuple, and repr() keeps names with quotes or backslashes valid Python.
with open("selected_features.py","w") as sf:
  sf.write("PERMISSIONS=(\n")
  for name in selected_permissions:
    sf.write(repr(name)+",\n")
  sf.write(")\n")
  sf.write("INTENTS=(\n")
  for name in selected_intents:
    sf.write(repr(name)+",\n")
  sf.write(")\n")

In [None]:
import importlib

import selected_features

# selected_features.py is regenerated by this notebook; reload the module so a
# copy imported earlier in the same kernel session is not silently reused.
importlib.reload(selected_features)
# NOTE: this rebinds PERMISSIONS / INTENTS to the selected subsets.
from selected_features import PERMISSIONS
from selected_features import INTENTS

print("Number of permissions selected:"+str(len(PERMISSIONS)))
print("Number of intents selected:"+str(len(INTENTS)))

### **Train the model**

Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the selected-feature rows for testing; the fixed
# random_state keeps the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
  features,
  Y,
  test_size=0.2,
  random_state=42,
)
# Report the size of each partition (training first, then test).
for labels in (y_train, y_test):
  print(len(labels))

Apply GridSearchCV

In [None]:
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import Dropout
# from keras.constraints import maxnorm
# from keras.wrappers.scikit_learn import KerasClassifier

# from sklearn.model_selection import GridSearchCV
# from sklearn import metrics

# def create_model(optimizer='rmsprop', init_mode='uniform', activation='relu', neurons=30, dropout_rate=0.0, weight_constraint=0):
#   keras_model = Sequential()
#   keras_model.add(Dense(neurons, activation=activation, input_dim=2000, kernel_initializer=init_mode, kernel_constraint=maxnorm(weight_constraint)))
#   keras_model.add(Dropout(dropout_rate))
#   keras_model.add(Dense(1, kernel_initializer=init_mode, activation='sigmoid'))
#   keras_model.compile(optimizer=optimizer,
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
#   return keras_model

# model = KerasClassifier(build_fn=create_model, verbose=0)

# batch_size = [10, 15, 20, 25, 30]
# epochs = [50, 100, 250, 500]
# # optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
# # init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
# # activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
# # neurons = [20, 30, 40, 50, 60]
# # dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# # weight_constraint = [1, 2, 3, 4, 5]
# # learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
# # momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]

# param_grid = dict(batch_size=batch_size, epochs=epochs)
# # param_grid = dict(optimizer=optimizer)
# # param_grid = dict(init_mode=init_mode)
# # param_grid = dict(activation=activation)
# # param_grid = dict(neurons=neurons)
# # param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
# # param_grid = dict(learn_rate=learn_rate, momentum=momentum)

# # param_grid = dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer, init_mode=init_mode,activation=activation,neurons=neurons, dropout_rate=dropout_rate, weight_constraint=weight_constraint)

# grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
# grid_result = grid.fit(features, Y)

# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#   print("%f (%f) with: %r" % (mean, stdev, param))

Train the network using the above parameters

In [None]:
import os
import json
import numpy as np
from selected_features import PERMISSIONS
from selected_features import INTENTS

from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from keras.constraints import maxnorm

from sklearn.model_selection import train_test_split
from sklearn import metrics

def load_json(fp):
  """Read one JSON report and hand back its permission and intent entries.

  Args:
    fp: path of the JSON file to read.

  Returns:
    A ``(permissions, intents)`` tuple taken from the document's top-level keys.
  """
  with open(fp) as handle:
    parsed = json.load(handle)
  permissions = parsed["permissions"]
  intents = parsed["intents"]
  return permissions, intents

def get_feature_vector(apk):
  """Turn an APK record into a flat list of 0/1 indicators.

  Indicators follow the order of ``PERMISSIONS`` and then ``INTENTS``; each
  one is 1 when the corresponding name is contained in ``apk['permissions']``
  or ``apk['intents']``.
  """
  fv = [] #feature vector
  fv.extend(int(permission in apk['permissions']) for permission in PERMISSIONS)
  fv.extend(int(intent in apk['intents']) for intent in INTENTS)
  return fv

def prepare_dataset():
  """Collect one labelled record per report file of both dataset folders.

  Returns:
    A list of dicts with the keys ``'permissions'``, ``'intents'`` and
    ``'Malicious'``; the label is 0 for the benign folder and 1 for the
    malware folder. Benign records come first.
  """
  benign_dir = "./benign_2017_static/ApkMetaReport/"
  malware_dir = "./malware_2017_static/ApkMetaReport/"
  apks = []
  for label, directory in ((0, benign_dir), (1, malware_dir)):
    for filename in os.listdir(directory):
      permissions, intents = load_json(directory + filename)
      record = dict(permissions=permissions, intents=intents, Malicious=label)
      apks.append(record)
  return apks

def get_X_and_Y_matrices():
  """Build the feature matrix and label vector for the whole dataset.

  Calls ``prepare_dataset()`` and encodes every record with
  ``get_feature_vector``; progress messages are printed along the way.

  Returns:
    A ``(x, y)`` pair of numpy arrays: ``x`` holds one feature vector per
    record and ``y`` the matching ``'Malicious'`` labels, in dataset order.
  """
  print("Preparing dataset...")
  dataset = prepare_dataset()
  print("Dataset preparation completed.")
  print("Creating x and y matrices...")
  x = []
  y = []
  for apk in dataset:
    # Encode the record itself; re-locating it with dataset.index(apk) was a
    # linear scan per record (quadratic overall) that yielded an equal record.
    x.append(get_feature_vector(apk))
    y.append(apk['Malicious'])
  print("x and y matrices are created.")
  return np.array(x),np.array(y)

# Rebuild X and Y, this time encoded with the selected PERMISSIONS / INTENTS.
print("Fetching X and Y matrices...")
X, Y = get_X_and_Y_matrices()
print("X and Y matrices are fetched.")
print(len(Y))

#split the dataset for training and testing
print("Splitting the dataset...")
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
print(len(y_train))
print(len(y_test))

# Size the input layer from the data instead of hard-coding 2000, so the
# network always matches the number of features actually produced.
input_dim = X.shape[1]

model = Sequential()
model.add(Dense(30, activation='relu', input_dim=input_dim, kernel_initializer='lecun_uniform', kernel_constraint=maxnorm(2)))
model.add(Dropout(0.2))
model.add(Dense(1, kernel_initializer='lecun_uniform', activation='sigmoid'))
#optimizer = SGD(lr=0.001, momentum=0.6)
model.compile(optimizer='rmsprop',
            loss='binary_crossentropy',
            metrics=['accuracy'])
model.fit(x_train, y_train, epochs=100, batch_size=20)

_, accuracy = model.evaluate(x_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

# Threshold the sigmoid outputs at 0.5 and flatten them to a 1-D label vector
# (one 0/1 entry per test row), the shape y_test has as well.
predictions = (model.predict(x_test)>0.5).astype("int32").ravel()
print("Accuracy: "+str(metrics.accuracy_score(y_test, predictions)*100)+"%")
print("Precision: "+str(metrics.precision_score(y_test, predictions)*100)+"%")
print("Recall: "+str(metrics.recall_score(y_test, predictions)*100)+"%")
print("F1-Score: "+str(metrics.f1_score(y_test, predictions)*100)+"%")

Save the model

In [None]:
# Reset the model's metric state before exporting it.
model.reset_metrics()
# Export the model to a SavedModel
# (written to the 'SavedModel' path that the following cells load from).
model.save('SavedModel', save_format='tf')

Convert the model to tflite format

In [None]:
import tensorflow as tf

# Convert the exported SavedModel into a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model('SavedModel')
tflite_model = converter.convert()
# Write through a context manager so the file is flushed and closed even if
# the write fails.
with open("converted_model.tflite", "wb") as tflite_file:
  tflite_file.write(tflite_model)

### **Test the model**

The following script can be used to test the model using a random sample from the dataset.

In [None]:
import os
import json
import random
import numpy as np
from selected_features import PERMISSIONS
from selected_features import INTENTS

from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def load_json(fp):
  """Parse the JSON report at ``fp`` and pick out two of its entries.

  Args:
    fp: path of the JSON file to read.

  Returns:
    A ``(permissions, intents)`` tuple taken from the document's top-level keys.
  """
  with open(fp) as source:
    document = json.load(source)
  return tuple(document[key] for key in ("permissions", "intents"))

def get_feature_vector(apk):
  """Build the 0/1 feature list for one APK record.

  One entry is produced for each name in ``PERMISSIONS`` and then for each
  name in ``INTENTS``; an entry is 1 when the name is contained in
  ``apk['permissions']`` / ``apk['intents']`` and 0 otherwise.
  """
  fv = [] #feature vector
  # Same membership test for both groups: (known names, key of the record).
  for names, key in ((PERMISSIONS, 'permissions'), (INTENTS, 'intents')):
    for name in names:
      fv.append(1 if name in apk[key] else 0)
  return fv

def get_apk(path):
  """Load one randomly chosen report from the folder ``path``.

  Any file of the folder can be picked. The previous
  ``random.randrange(20, 50, 3)`` index reached only ten fixed positions and
  failed on folders with fewer than 48 entries.

  Args:
    path: folder containing the JSON reports.

  Returns:
    A dict with the chosen report's ``'permissions'`` and ``'intents'``.

  Raises:
    IndexError: if the folder contains no files.
  """
  # Sort so the candidate list does not depend on the listing order.
  files = sorted(os.listdir(path))
  if not files:
    raise IndexError("no report files found in " + str(path))
  file = random.choice(files)
  # os.path.join also copes with a `path` given without a trailing separator.
  filepath = os.path.join(path, file)
  apk = {}
  apk['permissions'],apk['intents'] = load_json(filepath)
  return apk

def get_X():
  """Pick a random report and return its feature vector.

  One of the two dataset folders is chosen at random, its true class is
  printed, a report is fetched from it with ``get_apk`` and encoded with
  ``get_feature_vector``.

  Returns:
    The feature vector produced by ``get_feature_vector`` for that report.
  """
  print("Fetching apk...")
  # Folder -> true class name. The folder is always one of these two keys, so
  # no fallback message is needed.
  classes = {
    "./benign_2017_static/ApkMetaReport/": "Goodware",
    "./malware_2017_static/ApkMetaReport/": "Malware",
  }
  path = random.choice(list(classes))
  print("Original: " + classes[path])
  apk = get_apk(path)
  print("Fetched apk.")
  print("Creating feature vector...")
  x = get_feature_vector(apk)
  print("Feature vector is created.")
  return x

# Encode one random report as a single-row batch and classify it.
x = np.array(get_X())
x=x.reshape(1,-1)
model = keras.models.load_model('SavedModel')
prediction = (model.predict(x)>0.5).astype("int32")
# Reduce the prediction array to a plain 0/1 label rather than relying on the
# truthiness of an array in `if`.
# NOTE(review): assumes the loaded model emits one value per input row — confirm.
label = int(prediction.ravel()[0])
# After thresholding, the label can only be 0 or 1.
if(label == 1):
  print("Prediction: Malware!!!")
else:
  print("Prediction: Goodware :)")