## Startup

In [None]:
!pip install transformers
!pip install torch

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, RobertaTokenizer, RobertaModel, RobertaConfig

# Checkpoint used for all embeddings. GraphCodeBERT is the active choice; the
# earlier CodeBERT variant loaded "microsoft/codebert-base" through
# RobertaTokenizer / RobertaModel with the same kind of config.
MODEL_NAME = "microsoft/graphcodebert-base"

# output_hidden_states=True makes the forward pass expose `hidden_states`,
# which code2vec reads.
config = RobertaConfig.from_pretrained(MODEL_NAME, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, config=config)

# Run on the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Inference only: switch off dropout and other train-time behaviour.
model.eval()

In [None]:
# Bind `device` to the GPU when CUDA is usable, otherwise to the CPU, and
# print which one was picked.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## Helper Functions

In [None]:
def code2vec(code):
    """Embed a code string with the globally loaded tokenizer and model.

    The text is tokenized (special tokens included, truncated to at most 512
    token ids), run through ``model`` without gradient tracking, and the
    hidden states of the last four layers are added together.

    Args:
        code: Source text to embed.

    Returns:
        The element-wise sum of the last four entries of the model's
        ``hidden_states``. The input is fed as a batch of one sequence.
    """
    token_ids = tokenizer.encode(
        code,
        add_special_tokens=True,  # wrap the sequence in the tokenizer's start/end tokens
        max_length=512,
        truncation=True,
    )
    # Add a batch dimension of size 1 and move the ids onto `device`.
    batch = torch.LongTensor(token_ids).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_ids=batch)

    last_four_layers = outputs.hidden_states[-4:]
    return torch.stack(last_four_layers).sum(0)

In [None]:
def array2vec(snippet, sourcefirst=True):
    """Embed a list of code fragments as a single mean-pooled vector.

    Args:
        snippet: Sequence of mappings, each providing a "code" entry.
        sourcefirst: When equal to True, the fragment order is reversed
            before joining. NOTE(review): presumably the input is stored
            sink-first so that reversing puts the source first -- confirm
            against the data.

    Returns:
        The ``code2vec`` embedding of the newline-joined fragments, with the
        first and last token positions dropped, averaged over the remaining
        positions and with the batch dimension squeezed away.
    """
    fragments = [entry["code"] for entry in snippet]
    ordered = fragments[::-1] if sourcefirst == True else fragments
    token_vectors = code2vec("\n".join(ordered))
    # Positions 0 and -1 hold the special tokens added during encoding;
    # leave them out of the average.
    inner = token_vectors[:, 1:-1, :]
    return torch.mean(inner, 1).squeeze(0)

In [None]:
def get_iss_by_id(id, dicts):
    """Return the first entry of ``dicts`` whose "id" value equals ``id``.

    Args:
        id: Identifier to look for.
        dicts: Iterable of mappings that each have an "id" key.

    Returns:
        The first matching entry (the same object, not a copy).

    Raises:
        StopIteration: If no entry matches.
    """
    for candidate in dicts:
        if candidate["id"] == id:
            return candidate
    raise StopIteration

In [None]:
def tensor_to_fix_length(input, length):
    """Truncate or zero-pad ``input`` along dim 0 to exactly ``length`` rows.

    Args:
        input: Tensor whose first dimension should be fixed.
        length: Desired size of dim 0.

    Returns:
        The first ``length`` rows of ``input`` when it is longer than
        ``length``; otherwise ``input`` followed by enough all-zero rows to
        reach ``length``.
    """
    rows = input.shape[0]
    if rows > length:
        return torch.narrow(input, 0, 0, length)

    # Derive the padding from `input` itself so that it matches the trailing
    # dimensions, dtype and device. A fixed CPU float tensor of width 768
    # cannot be concatenated with a CUDA tensor or one of another width.
    padding = input.new_zeros((length - rows, *input.shape[1:]))
    return torch.cat((input, padding), 0)

## Calculating Embeddings

In [None]:
def file_to_embeddings(file):
    """Load a JSON list of issues and attach an embedding where possible.

    Every issue that has a non-empty "cleared" entry gets an "embedding" key
    holding ``array2vec(issue["cleared"], True)``; all other issues are left
    unchanged.

    Args:
        file: Path of a UTF-8 encoded JSON file.

    Returns:
        The parsed issues, updated in place.
    """
    with open(file, encoding="utf-8") as handle:
        issues = json.load(handle)

    for issue in issues:
        if "cleared" not in issue or len(issue["cleared"]) == 0:
            continue
        issue['embedding'] = array2vec(issue["cleared"], True)
    return issues

In [None]:
import json

# Input files: the issues used as queries and the issues that have a patch.
CLEANED_ISSUES_JSON = '../output/java_taints_cleaned.json'
PATCHED_ISSUES_JSON = '../output/java_taints_patched.json'

test_set = file_to_embeddings(CLEANED_ISSUES_JSON)
patch_set = file_to_embeddings(PATCHED_ISSUES_JSON)

## Patch

In [None]:
from scipy import spatial
from torch import nn
import hashlib

# Cosine similarity computed along dim 0 of the two embeddings.
cos = nn.CosineSimilarity(dim=0)

def get_target(query_set, query_id, patch_set):
    """Rank the issues in ``patch_set`` by similarity to one query issue.

    Args:
        query_set: Issues to look the query up in.
        query_id: "id" of the query issue inside ``query_set``.
        patch_set: Candidate issues; those without an "embedding" are skipped.

    Returns:
        ``None`` when the query issue has no "embedding". Otherwise a list of
        shallow copies of the candidate issues, each with an added
        "similarity" score, sorted from most to least similar. The entries of
        ``patch_set`` themselves are not modified, so result lists from
        earlier calls keep their own scores.

    Raises:
        StopIteration: Propagated from ``get_iss_by_id`` when ``query_id`` is
            not present in ``query_set``.
    """
    query = get_iss_by_id(query_id, query_set)
    if "embedding" not in query:
        return None
    query_embedding = query["embedding"]

    results = []
    for issue in patch_set:
        if "embedding" not in issue:
            continue
        cos_sim = cos(query_embedding, issue["embedding"]).item()
        # Piecewise-linear rescale that is continuous at 0.8:
        # cosine 0.8 maps to 0.25 on both branches and cosine 1.0 maps to 1.0,
        # so the (0.8, 1.0] band is stretched over (0.25, 1.0].
        if cos_sim > 0.8:
            similarity = 3.75 * cos_sim - 2.75
        else:
            similarity = 0.3125 * cos_sim

        # Copy instead of writing into the shared patch_set dict; otherwise a
        # later query would overwrite the scores held by earlier results.
        results.append({**issue, "similarity": similarity})
    return sorted(results, key=lambda k: k['similarity'], reverse=True)

def print_target(issue):
    """Print links and the score for one ranked patch candidate.

    Prints the SonarCloud URL of the issue, the repository it comes from and
    its similarity score, then for every taint the file, its line range and
    a GitHub commit-diff URL anchored at those lines.

    Args:
        issue: Mapping with "id", "owner", "repo", "similarity", "patchHash"
            and "taints"; each taint has a "file" of the form
            ``<project key>:<path>`` and "lines" such as ``"453-455"``.
    """
    # Split on the LAST colon, like print_query does, so that a project key
    # which itself contains ':' no longer breaks the two-element unpack.
    project, _, _ = issue['taints'][0]['file'].rpartition(":")
    sc_target = f"https://sonarcloud.io/project/issues?id={project}&issues={issue['id']}&open={issue['id']}"

    print(f"Target Issue SC: {sc_target}")
    print(f"Most Similar Issue from: {issue['owner']}/{issue['repo']}")
    print(f"Score: {issue['similarity']}")

    for taint in issue['taints']:
        _, _, component = taint['file'].rpartition(":")
        # The diff anchor is built from the SHA-256 hex digest of the path.
        sha = hashlib.sha256(component.encode("utf-8")).hexdigest()
        # "453-455" -> "L453-L455"; a single line "453" -> "L453".
        bounds = taint["lines"].split("-")
        anchor = f"L{bounds[0]}"
        if len(bounds) > 1:
            anchor += f"-L{bounds[1]}"
        diff = f"https://github.com/{issue['owner']}/{issue['repo']}/commit/{issue['patchHash']}#diff-{sha}{anchor}"
        print(f"In file: {component}, Lines: {taint['lines']}")
        print(diff)

def print_query(query):
    """Print the SonarCloud URL of a query issue.

    The project key is everything before the last ':' in the "file" value of
    the issue's first taint (empty when there is no ':').

    Args:
        query: Mapping with "id" and a non-empty "taints" list whose first
            entry has a "file" string.
    """
    first_file = query['taints'][0]['file']
    project = first_file.rpartition(":")[0]
    issue_id = query['id']
    sc_query = f"https://sonarcloud.io/project/issues?id={project}&issues={issue_id}&open={issue_id}"

    print(f"Query Issue SC: {sc_query}")

In [None]:
"""
SQLi Examples: AXRL2ByGEX4aK-xQ7L6A, AXhcFNE6aAnXFglb3e4u, AXd3C_p6zPaAchWyLZF1, AXhcFNFUaAnXFglb3e5C
XSS Examples: AXOdOdf-GR10QreeQhOl, AXdUSsfa7ZW8oBO85-B4 top3, AXZEAzWg-NnJaEJMI8Wa top2
XSS counter example: AXbyMnmVFBBWxU1WkZtW (patch, but not applicable)
"""

# Look up one query issue and show its three best-matching patched issues.
id = "AXRL2ByGEX4aK-xQ7L6A"
query = get_iss_by_id(id, test_set)

issues = get_target(test_set, id, patch_set)

print_query(query)
# get_target returns None when the query has no embedding and may return
# fewer than three candidates; slicing handles both without raising.
for match in (issues or [])[:3]:
    print()
    print_target(match)

## Demo

In [None]:
# Demo: one of the SQLi example ids -> its single best-matching patched issue.
id = "AXRL2ByGEX4aK-xQ7L6A"
query, issues = get_iss_by_id(id, test_set), get_target(test_set, id, patch_set)

print_query(query)
print_target(issues[0])

In [None]:
# Demo: one of the XSS example ids -> its single best-matching patched issue.
id = "AXOdOdf-GR10QreeQhOl"
query, issues = get_iss_by_id(id, test_set), get_target(test_set, id, patch_set)

print_query(query)
print_target(issues[0])