## Startup

In [None]:
# Fetch the code2vec repository; its code2vec/code2vec.py script is run in a later cell.
!git clone https://github.com/tech-srl/code2vec

In [None]:
# Download the java14m model archive and unpack it into the current directory.
!wget https://s3.amazonaws.com/code2vec/model/java14m_model.tar.gz
!tar -xvzf java14m_model.tar.gz

In [None]:
# Load the released model and write tokens.txt via --save_w2v; the file is read back
# below with load_word2vec_format(binary=False), i.e. as word2vec text format.
# NOTE(review): presumably these are the token embeddings - confirm against the code2vec README.
!python3 code2vec/code2vec.py --load ./models/java14_model/saved_model_iter8.release --save_w2v ./models/java14_model/tokens.txt

In [4]:
from gensim.models import KeyedVectors as word2vec

# Path of the vectors file produced by the --save_w2v export cell.
vectors_text_path = 'models/java14_model/tokens.txt'
# binary=False: parse the file as plain text. `model` is read as a global by code2vec().
model = word2vec.load_word2vec_format(vectors_text_path, binary=False)

## Functions

In [5]:
import re

def camel_case_split(identifier):
    """Split an identifier into pieces that each start at an ASCII letter.

    A piece begins with a letter and runs up to (not including) the next
    uppercase ASCII letter, e.g. ``"camelCase"`` -> ``["camel", "Case"]``.
    Characters in front of the first letter are dropped.
    """
    piece_pattern = re.compile('[a-zA-Z][^A-Z]*')
    return [match.group(0) for match in piece_pattern.finditer(identifier)]

In [6]:
def camel_to_snake(name):
    """Convert a camelCase / PascalCase identifier to lower snake_case.

    Underscores are inserted in two passes: first in front of every
    capitalised word (an uppercase letter followed by lowercase letters),
    then between a lowercase letter or digit and a following uppercase
    letter. The result is lowercased.
    """
    # Pass 1: "getHTTPResponse" -> "getHTTP_Response"
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: "getHTTP_Response" -> "get_HTTP_Response"
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

In [7]:
def get_iss_by_id(id, dicts):
    """Return the first entry of ``dicts`` whose ``"id"`` equals ``id``.

    Returns ``None`` when no entry matches.
    """
    matches = (entry for entry in dicts if entry["id"] == id)
    return next(matches, None)

In [8]:
import numpy as np
from gensim.utils import tokenize

def code2vec(code):
    """Embed a piece of source code as the mean of its token vectors.

    The text is tokenized with gensim's ``tokenize`` (``deacc=True``). Every
    token is looked up lowercased in the global ``model``. A token that is
    not in the vocabulary is split on underscores / camelCase humps via
    ``camel_to_snake`` and each hump found in the vocabulary contributes its
    vector instead.

    Args:
        code: Source text to embed.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean
        of all collected vectors, or an all-zero vector when no token (or
        hump) was found in the vocabulary or the mean contains NaN.
    """
    vecs = []
    for token in tokenize(code, deacc=True):
        # model is only trained on lowercase
        lowered = token.lower()
        # Membership is tested on the KeyedVectors object itself instead of
        # `.vocab`, which gensim 4 removed; `in` works in both versions.
        if lowered in model:
            vecs.append(model[lowered])
            continue
        # split the token based on underscore and camelcase to prevent oov
        for hump in camel_to_snake(token).split("_"):
            # again, lowercase only
            hump = hump.lower()
            if hump in model:
                vecs.append(model[hump])

    # Nothing matched: return zeros directly rather than taking the mean of
    # an empty list, which emits a RuntimeWarning and yields NaN.
    if not vecs:
        return np.zeros(model.vector_size)

    # average the vecs for the word pieces in this piece of code
    mean = np.mean(vecs, axis=0)
    if np.isnan(np.sum(mean)):
        return np.zeros(model.vector_size)
    return mean

In [9]:
# line-wise
def array2vec(taints, sourcesinkonly = False):
    """Line-wise embedding: embed each entry's ``'code'`` and average the vectors.

    NOTE: a later cell of this notebook defines another ``array2vec``
    (block-wise); whichever of the two cells ran last is the one bound to
    the name.

    Args:
        taints: Sequence of dicts, each with a ``'code'`` entry passed to
            ``code2vec``.
        sourcesinkonly: If true, only the first and the last vector are
            combined, weighted 0.75 and 0.25 respectively.

    Returns:
        The single vector when there is exactly one entry; otherwise the
        (optionally weighted) average along axis 0.
    """
    line_vecs = [code2vec(taint['code']) for taint in taints]

    if len(line_vecs) == 1:
        return line_vecs[0]
    if not sourcesinkonly:
        return np.average(line_vecs, axis=0)

    # source and sink only: first entry weighted 0.75, last entry 0.25
    endpoints = np.asarray([line_vecs[0], line_vecs[-1]], dtype=np.float32)
    return np.average(endpoints, axis=0, weights=[0.75, 0.25])

In [10]:
# block-wise
def array2vec(snippet, sourcesinkonly = False):
    """Block-wise embedding: join all entries' ``"code"`` with newlines and embed once.

    NOTE: this definition rebinds the name ``array2vec`` used by the
    line-wise variant defined in an earlier cell.

    Args:
        snippet: Sequence of dicts, each with a ``"code"`` string.
        sourcesinkonly: Accepted but ignored by this variant.

    Returns:
        The vector returned by ``code2vec`` for the joined text.
    """
    joined_code = "\n".join(entry["code"] for entry in snippet)
    return code2vec(joined_code)

## Calculating Embeddings

In [None]:
import json

# Load the extracted issue features; the loop below iterates `feats` and
# treats every element as a dict.
with open('../output/java_features.json', encoding="utf-8") as json_file:
    feats = json.load(json_file)

# Attach an embedding to every issue with a non-empty "cleared" entry.
# Issues without one are skipped and end up with no 'embedding' key.
for iss in feats:
    if "cleared" not in iss or len(iss["cleared"]) == 0:
        continue
    vec_cleared = array2vec(iss["cleared"])
    iss['embedding'] = vec_cleared

## Evaluation 4
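Compare each query issue a0 against all other issues. Take the |A| − 1 most similar results, where |A| is the size of the query's own category A, and assign the query the category that holds the majority inside that window.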

In [None]:
import torch
from torch import nn
import operator
import copy
from scipy import spatial

# compare all issues to a query. first similar should be from same category.
def evaluate_categories_3(categories, feats, category_check=True):
    """Label every issue by majority vote among its most similar issues.

    Each issue in turn acts as the query. All other issues are ranked by the
    cosine similarity between their ``'embedding'`` and the query's. The top
    ``len(category) - 1`` results (as many as the query has peers in its own
    category) form the window, and the category index occurring most often
    inside the window is the predicted label. On a tie the category whose
    first hit ranks highest wins.

    Args:
        categories: List of categories, each a list of issue ids.
        feats: List of issue dicts, searched with ``get_iss_by_id``. Every id
            in ``categories`` must resolve to a dict with an ``'embedding'``.
        category_check: Unused; kept so existing callers keep working.

    Returns:
        One ``{"tp": int, "fp": int, "fn": int}`` dict per category, in the
        order of ``categories``. A correct prediction adds a ``tp`` to the
        query's category; a wrong one adds an ``fn`` there and an ``fp`` to
        the predicted category. A query with an empty window (its category
        has no other member) adds an ``fn`` only, since nothing is predicted.
    """
    # measures per category
    measures = [{"tp": 0, "fp": 0, "fn": 0} for _ in categories]

    # loop through all categories
    for i_cat, category in enumerate(categories):
        # we only care about the results inside the bucket size
        window = len(category) - 1

        for i_query, query_id in enumerate(category):
            query = get_iss_by_id(query_id, feats)

            # compare to all other issues
            results = []
            for i_check, check_cat in enumerate(categories):
                for i_check_iss, check_iss in enumerate(check_cat):
                    # Skip the query itself by position. This replaces
                    # deep-copying `categories` for every single query.
                    if i_check == i_cat and i_check_iss == i_query:
                        continue
                    check = get_iss_by_id(check_iss, feats)
                    sim = 1 - spatial.distance.cosine(check['embedding'], query['embedding'])
                    # The similarity can be NaN (e.g. for an all-zero
                    # embedding). NaN has no defined sort position, so rank
                    # such pairs last.
                    if sim != sim:
                        sim = float("-inf")
                    results.append({
                        "bucket": i_check,
                        "similarity": sim
                    })

            # sort results by similarity score and keep the window
            results.sort(key=lambda k: k['similarity'], reverse=True)
            top = results[:window]

            # check majority category of results
            counts = dict()
            for hit in top:
                counts[hit['bucket']] = counts.get(hit['bucket'], 0) + 1

            if not counts:
                # Empty window: max() would raise on an empty dict. No label
                # can be predicted, so the query counts as a miss.
                measures[i_cat]['fn'] += 1
                continue

            majority = max(counts.items(), key=operator.itemgetter(1))[0]

            if i_cat == majority:
                measures[i_cat]['tp'] += 1
            else:
                measures[i_cat]['fn'] += 1
                measures[majority]['fp'] += 1

    return measures

In [None]:
with open('../data/label_java_sqli.json') as json_file:
    categories = json.load(json_file)

measures = evaluate_categories_3(categories, feats)
for measure in measures:
    # Every ratio falls back to 0 when its denominator would be zero.
    predicted = measure['tp'] + measure['fp']
    actual = measure['tp'] + measure['fn']
    precision = measure['tp'] / predicted if predicted else 0
    recall = measure['tp'] / actual if actual else 0

    beta = 2
    if precision + recall:
        f1 = 2 * ((precision * recall) / (precision + recall))
        # f_beta is computed for inspection but not printed below.
        f_beta = (1 + beta**2) * ((precision * recall) / (beta**2 * precision + recall))
    else:
        f1 = 0
        f_beta = 0

    print(f"F1: {f1}")
    print("-----------")

## Evaluation 5
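Compare each query issue a0 against all other issues. The query counts as correctly labelled if at least one of the `threshold` most similar issues (only rank 1 by default) belongs to the same category.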

In [None]:
from torch import nn
import operator
import copy

# compare all issues to a query. first similar should be from same category.
def evaluate_categories_4(categories, feats, threshold = 1):
    """Count, per category, the queries that have a same-category issue among their top matches.

    Each issue in turn acts as the query. All other issues are ranked by the
    cosine similarity between their ``'embedding'`` and the query's. The
    query is a hit when at least one of the ``threshold`` best-ranked issues
    comes from the query's own category.

    Args:
        categories: List of categories, each a list of issue ids.
        feats: List of issue dicts, searched with ``get_iss_by_id``. Every id
            in ``categories`` must resolve to a dict with an ``'embedding'``.
        threshold: Number of top-ranked results inspected per query. Values
            larger than the number of other issues inspect all of them.

    Returns:
        One ``{"true": int, "false": int}`` dict per category, in the order
        of ``categories``, counting hits and misses.
    """
    cos = nn.CosineSimilarity(dim=0)

    # measures per category
    measures = [{"true": 0, "false": 0} for _ in categories]

    # loop through all categories
    for i_cat, category in enumerate(categories):
        for i_query, query_id in enumerate(category):
            query = get_iss_by_id(query_id, feats)
            # Build the query tensor once instead of once per comparison.
            query_vec = torch.Tensor(query['embedding'])

            # compare to all other issues
            results = []
            for i_check_cat, check_cat in enumerate(categories):
                for i_check_iss, check_iss in enumerate(check_cat):
                    # Skip the query itself by position. This replaces
                    # deep-copying `categories` for every single query.
                    if i_check_cat == i_cat and i_check_iss == i_query:
                        continue
                    check = get_iss_by_id(check_iss, feats)
                    sim = cos(torch.Tensor(check['embedding']), query_vec)
                    results.append({
                        "bucket": i_check_cat == i_cat,
                        # plain float: cheaper to sort than 0-dim tensors
                        "similarity": sim.item()
                    })

            # sort results by similarity score
            results.sort(key=lambda k: k['similarity'], reverse=True)

            # Slice instead of indexing results[i]: a threshold larger than
            # the number of other issues no longer raises IndexError.
            top = results[:max(threshold, 0)]
            if any(hit["bucket"] for hit in top):
                measures[i_cat]["true"] += 1
            else:
                measures[i_cat]["false"] += 1

    return measures

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

with open('../data/label_java_xss.json') as json_file:
    categories = json.load(json_file)

# Upper bound for a full threshold sweep (see the note on the loop below).
limit = 20
# One list per category; initialised here but not filled in this cell.
accuracies = [ [] for _ in range(len(categories)) ]

# To sweep every threshold instead, iterate over range(1, limit).
for threshold in [1, 3, 5]:
    print(f"Threshold: {threshold}")
    measures = evaluate_categories_4(categories, feats, threshold)
    for i_cat, cat in enumerate(measures):
        print(f"Kategorie {i_cat + 1}")
        # An empty category has no queries; report 0 instead of dividing by
        # zero, matching the guard used for precision/recall.
        total = cat['true'] + cat['false']
        print(cat['true'] / total if total else 0)