In [2]:
import sys
from scipy import io
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from torch_geometric.utils import from_scipy_sparse_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
import os.path as osp
import os
from tsnecuda import TSNE
import json
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append('../')
from src.models import *

In [3]:
fp = '/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data'
g1 = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_1.mat'))
g2 = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_2.mat'))
post_indx = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_1.mat'))['post_indx'].reshape(-1,)

y = g1['post_label'].reshape(-1,)

In [4]:
g1_feature = np.hstack((np.array(g1['P'].sum(1)), np.array(g1['A'].T.sum(1))))
g1_feature = StandardScaler().fit_transform(g1_feature)
g2_feature = np.array(g2['P'].sum(1))
g2_feature = StandardScaler().fit_transform(g2_feature)
baseline_feature = get_baseline_feature(fp)

In [5]:
g1_node2vec = np.load(osp.join(fp, 'processed', 'node2vec', 'graph_1.npy'))
g1_infomax = np.load(osp.join(fp, 'processed', 'infomax', 'graph_1.npy'))
g1_metapath2vec =  np.load(osp.join(fp, 'processed', 'metapath2vec', 'graph_1.npy'))

g2_node2vec = np.load(osp.join(fp, 'processed', 'node2vec', 'graph_2.npy'))
g2_infomax = np.load(osp.join(fp, 'processed', 'infomax', 'graph_2.npy'))
g2_metapath2vec =  np.load(osp.join(fp, 'processed', 'metapath2vec', 'graph_2.npy'))

In [6]:
def evaluate_emb(X_train, X_test, y_train, y_test, clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_true = y_test.copy()
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    acc = metrics.accuracy_score(y_true, y_pred)
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label = 1)
    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    return tn, fp, fn, tp, acc, auc, precision, recall

In [12]:
def evaluate_with_baseline(train_mask, test_mask, feature_1, feature_2, feature_3, y, clfs):
    if isinstance(feature_2, type(None)) and isinstance(feature_2, type(None)):
        X = feature_1
    else:
        if isinstance(feature_3, type(None)):
            X = np.hstack((feature_1, feature_2))
        else:
            X = np.hstack((feature_1, feature_2, feature_3))
    X_train, X_test, y_train, y_test = X[train_mask, :], X[test_mask, :], y[train_mask], y[test_mask]
    res = {}
    for clf in clfs:
        res[clf.__class__.__name__] = list(evaluate_emb(X_train, X_test, y_train, y_test, clf))
    res = pd.DataFrame(res).T
    res.columns = ['tn', 'fp', 'fn', 'tp', 'acc', 'auc', 'precision', 'recall']
    return res

In [13]:
train_mask, test_mask = train_test_split(np.arange(len(y)), test_size=0.2)

In [14]:
clfs = [LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8, solver = 'lbfgs'),
        RandomForestClassifier(class_weight = 'balanced', n_jobs=8),
        GradientBoostingClassifier()
       ]

baseline

In [15]:
res = evaluate_with_baseline(train_mask, test_mask, baseline_feature, None, None, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15659.0,4147.0,232.0,645.0,0.78828,0.76304,0.134599,0.735462
RandomForestClassifier,17743.0,2063.0,535.0,342.0,0.87439,0.642903,0.142204,0.389966
GradientBoostingClassifier,19745.0,61.0,775.0,102.0,0.95958,0.556613,0.625767,0.116306


g1 node2vec

In [16]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_node2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15557.0,4249.0,239.0,638.0,0.78301,0.756475,0.13055,0.72748
RandomForestClassifier,19779.0,27.0,834.0,43.0,0.958372,0.523834,0.614286,0.049031
GradientBoostingClassifier,19753.0,53.0,783.0,94.0,0.95958,0.552254,0.639456,0.107184


g1 infomax

In [17]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_infomax, None, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15874.0,3932.0,248.0,629.0,0.797902,0.759346,0.137908,0.717218
RandomForestClassifier,19489.0,317.0,745.0,132.0,0.948653,0.567254,0.293987,0.150513
GradientBoostingClassifier,19727.0,79.0,776.0,101.0,0.958662,0.555588,0.561111,0.115165


g1 metapath2vec

In [18]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_metapath2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15517.0,4289.0,245.0,632.0,0.780786,0.752044,0.128429,0.720639
RandomForestClassifier,19781.0,25.0,839.0,38.0,0.958227,0.521034,0.603175,0.04333
GradientBoostingClassifier,19748.0,58.0,787.0,90.0,0.959145,0.549847,0.608108,0.102623


g2 node2vec

In [19]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_node2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15560.0,4246.0,235.0,642.0,0.783349,0.758831,0.131342,0.732041
RandomForestClassifier,19775.0,31.0,828.0,49.0,0.958468,0.527154,0.6125,0.055872
GradientBoostingClassifier,19743.0,63.0,785.0,92.0,0.959,0.550861,0.593548,0.104903


g2 infomax

In [20]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_infomax, None, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15339.0,4467.0,208.0,669.0,0.773969,0.768645,0.130257,0.762828
RandomForestClassifier,19169.0,637.0,703.0,174.0,0.935212,0.583121,0.21455,0.198404
GradientBoostingClassifier,19730.0,76.0,760.0,117.0,0.95958,0.564786,0.606218,0.133409


g2 metapath2vec

In [21]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_metapath2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15544.0,4262.0,232.0,645.0,0.78272,0.760137,0.131445,0.735462
RandomForestClassifier,19778.0,28.0,840.0,37.0,0.958033,0.520388,0.569231,0.042189
GradientBoostingClassifier,19749.0,57.0,785.0,92.0,0.95929,0.551013,0.61745,0.104903
