In [1]:
import json
import sys
import os

sys.path.append('../../../')
sys.path.append('../code')
sys.path.append('../../../Char_data')
sys.path.append('../../../python_parser')
retval = os.getcwd()

import csv
import pandas as pd
import logging
import argparse
import warnings
import pickle
import copy
import torch
import multiprocessing
import time
import re
from tqdm import tqdm
from beamAttack import Beam_Atack
import numpy as np
import Levenshtein
from run_parser import get_identifiers, get_example
from model import Model
from utils import set_seed
from utils import Recorder, is_valid_variable_java
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from attacker import get_code_pairs, convert_examples_to_features
from transformers import RobertaForMaskedLM
from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.simplefilter(action='ignore', category=FutureWarning)  # Only report warning

MODEL_CLASSES = {
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)
}

In [3]:
def convert_examples_to_features(code1_tokens, code2_tokens, label, url1, url2, tokenizer, cache):
    # source
    code1_tokens = code1_tokens[:block_size - 2]
    code1_tokens = [tokenizer.cls_token] + code1_tokens + [tokenizer.sep_token]
    code2_tokens = code2_tokens[:block_size - 2]
    code2_tokens = [tokenizer.cls_token] + code2_tokens + [tokenizer.sep_token]

    code1_ids = tokenizer.convert_tokens_to_ids(code1_tokens)
    padding_length = block_size - len(code1_ids)
    code1_ids += [tokenizer.pad_token_id] * padding_length

    code2_ids = tokenizer.convert_tokens_to_ids(code2_tokens)
    padding_length = block_size - len(code2_ids)
    code2_ids += [tokenizer.pad_token_id] * padding_length

    source_tokens = code1_tokens + code2_tokens
    source_ids = code1_ids + code2_ids
    return InputFeatures(source_tokens, source_ids, label, url1, url2)

def get_example1(item):
    url1, url2, label, tokenizer, cache, url_to_code = item
    if url1 in cache:
        code1 = cache[url1].copy()
    else:
        try:
            code = ' '.join(url_to_code[url1].split())
        except:
            code = ""
        code1 = tokenizer.tokenize(code)
    if url2 in cache:
        code2 = cache[url2].copy()
    else:
        try:
            code = ' '.join(url_to_code[url2].split())
        except:
            code = ""
        code2 = tokenizer.tokenize(code)

    return convert_examples_to_features(code1, code2, label, url1, url2, tokenizer, cache)

class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=512, pool=None):
        postfix = file_path.split('/')[-1].split('.txt')[0]
        self.examples = []
        index_filename = file_path
        logger.info("Creating features from index file at %s ", index_filename)
        url_to_code = {}
        with open('/'.join(index_filename.split('/')[:-1]) + '/data.jsonl') as f:
            for line in f:
                line = line.strip()
                js = json.loads(line)
                url_to_code[js['idx']] = js['func']

        data = []
        cache = {}
        with open(index_filename) as f:
            for line in f:
                line = line.strip()
                url1, url2, label = line.split('\t')
                if url1 not in url_to_code or url2 not in url_to_code:
                    continue
                if label == '0':
                    label = 0
                else:
                    label = 1
                data.append((url1, url2, label, tokenizer, args, cache, url_to_code))
        # if 'test' not in postfix:
        #     data = random.sample(data, int(len(data) * 0.1))

        self.examples = pool.map(get_example1, tqdm(data, total=len(data)))
        if 'train' in postfix:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info("idx: {}".format(idx))
                logger.info("label: {}".format(example.label))
                logger.info("input_tokens: {}".format([x.replace('\u0120', '_') for x in example.input_tokens]))
                logger.info("input_ids: {}".format(' '.join(map(str, example.input_ids))))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):

        return torch.tensor(self.examples[item].input_ids), torch.tensor(self.examples[item].label)

def tokenize_with_camel_case(token):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', token)
    return [m.group(0) for m in matches]


def tokenize_with_snake_case(token):
    return token.split('_')

def is_camel_case(s):
    transfer = tokenize_with_camel_case(s)[0]
    return False if transfer == s else True

def is_snake_case(s):
    return True if '_' in s else False

def get_advs(split_identifiers, sub_tokens, nodigit=True):
    adv_vars = []
    replace_tokens = []
    for token in sub_tokens:
        similars = []
        num = 0
        distence = 1
        for x in split_identifiers:
            if Levenshtein.distance(token, x) <= distence and num < 30:
                num += 1
                similars.append(x)
        while num < 30:
            distence += 1
            for x in split_identifiers:
                if Levenshtein.distance(token, x) <= distence and num < 30:
                    num += 1
                    similars.append(x)
        similars = [token] if len(similars) == 0 else similars
        if token.isdigit():
            similars = [token]
        replace_tokens.append(similars)
    result = [[]]
    for list_pool in replace_tokens:
        lis = []
        for i in result:
            for j in list_pool:
                lis.append(i + [j])
        result = lis
    for adv_var in result:
        if nodigit:
            adv_var_snake = [i.lower() for i in adv_var]
            adv_vars.append('_'.join(adv_var_snake))
        adv_var_camel = [i.title() for i in adv_var]
        adv_vars.append(''.join(adv_var_camel))
        adv_var_pascal = [i.title() for i in adv_var]
        adv_var_pascal[0] = adv_var_pascal[0].lower()
        adv_vars.append(''.join(adv_var_pascal))
    return adv_vars

def get_new_substituions(split_identifiers, identifiers):
    new_substituions = {}
    for var in identifiers:
        adv_vars = []
        if is_camel_case(var):
            sub_tokens = tokenize_with_camel_case(var)
            adv_vars = get_advs(split_identifiers, sub_tokens)

        elif is_snake_case(var):
            sub_tokens = tokenize_with_snake_case(var)
            adv_vars = get_advs(split_identifiers, sub_tokens)
        elif bool(re.search(r'\d', var)):
            sub_tokens = re.findall(r'\d+|(?:[^\w\s]|_)+|[^\W\d_]+', var)
            adv_vars = get_advs(split_identifiers, sub_tokens, nodigit=False)

        org_similars = []
        num = 0
        distence = 1
        for x in split_identifiers:
            if Levenshtein.distance(var, x) <= distence and num < 30:
                num += 1
                org_similars.append(x)
        while num < 30:
            distence += 1
            for x in split_identifiers:
                if Levenshtein.distance(var, x) <= distence and num < 30:
                    num += 1
                    org_similars.append(x)
        adv_vars1 = [var] if len(org_similars) == 0 else org_similars
        adv_vars = adv_vars + adv_vars1

        adv_vars = [var_tmp for var_tmp in adv_vars if var_tmp != var]
        adv_vars = list(set(adv_vars[:30]))
        if len(adv_vars) > 0:
            new_substituions[var] = adv_vars
    return new_substituions

In [None]:
args = {}
args[block_size] = 512

In [8]:
device = torch.device("cuda")
config = RobertaConfig.from_pretrained("microsoft/codebert-base")
config.num_labels = 2
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", do_lower_case=False, cache_dir=None)
block_size = min(512, tokenizer.max_len_single_sentence)
model = RobertaModel.from_pretrained("microsoft/codebert-base",
                                            from_tf=bool('.ckpt' in "microsoft/codebert-base"),
                                            config=config,
                                            cache_dir=None)
model = Model(model, config, tokenizer)
checkpoint_prefix = 'checkpoint-best-f1/model.bin'
output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
model.load_state_dict(torch.load(output_dir))
model.to(device)
logger.info("reload model from {}".format(output_dir))

TypeError: __init__() missing 1 required positional argument: 'args'