In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import json
import copy
import torch
import pickle
import random
import tokenize
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool

# import logging
# logger = logging.getLogger('root')

import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

import time
# Timestamp of the previous tprint() report. A module-level name needs no
# `global` statement; only the function that rebinds it does.
last_time = time.time()
def tprint(s, verbose = False):
    """Optionally report the time elapsed since the previous report.

    Timing output is switched off by default, so existing call sites stay
    silent; pass ``verbose=True`` to print the elapsed seconds followed by
    the label `s` and to restart the timer.

    Args:
        s: label printed next to the elapsed time.
        verbose: when False (default) nothing is printed and the timer is
            left untouched.

    Returns:
        Always 0.
    """
    global last_time
    if not verbose:
        return 0
    print('using time =', round(time.time() - last_time, 2), '----', s)
    last_time = time.time()
    return 0

In [2]:
def delete(path):
    """Remove `path` from disk, whatever kind of entry it is.

    A symbolic link is unlinked (its target is left alone), a directory is
    emptied recursively and then removed, and a regular file is removed.
    Any other kind of path trips the assertion.
    """
    if os.path.islink(path):
        os.unlink(path)
        return
    if not os.path.isdir(path):
        assert os.path.isfile(path)
        os.remove(path)
        return
    # Directory: remove every entry first, then the directory itself.
    for entry in os.listdir(path):
        delete(f'{path}/{entry}')
    os.rmdir(path)

def clear(path):
    """Prune `path` so that only non-hidden ``.py`` files (and the directories
    that contain them) remain, and report how many such files were kept.

    Symbolic links, hidden entries (names starting with '.') and files whose
    path does not end in ``.py`` are deleted. A directory that ends up
    holding no kept file is deleted as well.

    Returns:
        Number of ``.py`` files left under `path`.
    """
    if os.path.islink(path):
        delete(path)
        return 0

    if os.path.isdir(path):
        kept = 0
        for entry in os.listdir(path):
            child = path + '/' + entry
            if entry.startswith('.'):
                delete(child)
            else:
                kept += clear(child)
        if kept == 0:
            delete(path)
        return kept

    assert os.path.isfile(path)
    if path.endswith('.py'):
        return 1
    delete(path)
    return 0

def _download_repo(repo_full_name, token, save_path):
    """Shallow-clone one repository and prune the checkout with clear().

    Args:
        repo_full_name: repository name as used in the clone URL
            (e.g. ``owner/name``).
        token: access token embedded in the clone URL.
        save_path: directory under which ``<save_path>/<repo_full_name>``
            is created.

    The clone is best-effort: a failed clone is not reported to the caller.
    """
    import subprocess  # local import: the notebook's import cell does not load it

    target = f"{save_path}/{repo_full_name}"
    url = f"https://{token}@github.com.cnpmjs.org/{repo_full_name}"
    try:
        # An argument list (no shell) keeps repository names and paths from
        # being interpreted by the shell.
        subprocess.run(['git', 'clone', url, target, '--quiet', '--depth', '1'],
                       check = False)
    except OSError:
        # git could not be started; treated like any other failed clone.
        pass
    # clear() asserts on a path that does not exist, which is what a failed
    # clone leaves behind, so only prune when there is something on disk.
    if os.path.lexists(target):
        clear(target)

def download_repos(repo_full_names, tokens, save_path, num_process = 1):
    """Clone every repository in `repo_full_names` with a pool of processes.

    Each job gets a token drawn at random from `tokens`. The call returns
    once all jobs have finished; job results (and errors) are not collected.
    """
    workers = Pool(num_process)
    for full_name in repo_full_names:
        job_args = (full_name, random.choice(tokens), save_path)
        workers.apply_async(_download_repo, args = job_args)
    # No more jobs will be submitted; wait for the queued ones to drain.
    workers.close()
    workers.join()

# def download_repos(repo_full_names, tokens, save_path, num_process = 1):
#     for repo_full_name in tqdm(repo_full_names):
#         token = random.choice(tokens)
#         _download_repo(repo_full_name, token, save_path)

In [3]:
def _read_code_tokens_and_functions(path):
    """Tokenize a Python source file and locate its ``def``/``class`` blocks.

    Args:
        path: file to read; it is opened in binary mode and fed to
            ``tokenize.tokenize``.

    Returns:
        A pair ``(code_tokens, code_functions)``:

        * ``code_tokens`` - strings of the NAME, NUMBER, STRING and OP tokens
          (plus f-string piece tokens where the interpreter emits them), in
          source order. Comments, newlines and indentation tokens are dropped.
        * ``code_functions`` - ``[name, start, end, indent]`` lists in the
          order the blocks are closed. ``start`` is the index of the
          ``def``/``class`` keyword in ``code_tokens``, ``end`` is
          ``len(code_tokens)`` when the closing DEDENT is seen, and ``indent``
          is the indentation depth at which the keyword appeared.

    Raises:
        ValueError: on a token type that is neither skipped nor collected.
        Errors raised by ``tokenize`` itself propagate unchanged.
    """
    # Token types are referenced by name: their numeric values are not stable
    # across Python releases, so hard-coded integers silently select the
    # wrong token kinds on a different interpreter.
    skipped_types = {
        tokenize.ENDMARKER, tokenize.ERRORTOKEN, tokenize.COMMENT,
        tokenize.ENCODING, tokenize.NEWLINE, tokenize.NL,
    }
    code_types = {tokenize.NAME, tokenize.NUMBER, tokenize.STRING, tokenize.OP}
    # Interpreters that split f-/t-strings into pieces expose these constants.
    for optional in ('FSTRING_START', 'FSTRING_MIDDLE', 'FSTRING_END',
                     'TSTRING_START', 'TSTRING_MIDDLE', 'TSTRING_END'):
        if hasattr(tokenize, optional):
            code_types.add(getattr(tokenize, optional))

    code_tokens = []
    function_stack = []   # blocks opened but not closed yet
    code_functions = []
    indent_number = 0
    with open(path, 'rb') as input_file:
        for token in tokenize.tokenize(input_file.readline):
            if token.type in skipped_types:
                continue
            if token.type == tokenize.INDENT:
                indent_number += 1
            elif token.type == tokenize.DEDENT:
                indent_number -= 1
                # A block ends when indentation returns to the depth of its
                # keyword. Only the innermost open block is checked per DEDENT.
                if function_stack and indent_number == function_stack[-1][3]:
                    finished = function_stack.pop()
                    finished[2] = len(code_tokens)
                    code_functions.append(finished)
            elif token.type in code_types:
                code_tokens.append(token.string)
                if token.type == tokenize.NAME and token.string in ('def', 'class'):
                    # 0 is the placeholder for "name not seen yet".
                    function_stack.append([0, len(code_tokens) - 1, 0, indent_number])
                elif function_stack and function_stack[-1][0] == 0:
                    # The first collected token after the keyword is the name.
                    function_stack[-1][0] = token.string
            else:
                raise ValueError('unexpected token type %r in %s' % (token.type, path))
    # NOTE: blocks still open here (e.g. a one-line ``def f(): ...`` that is
    # never followed by a DEDENT back to its depth) are not reported.
    return code_tokens, code_functions

def _read_codes(repo_path, current_path = '.'):
    """Recursively tokenize every ``.py`` file below ``repo_path/current_path``.

    Args:
        repo_path: root directory of the repository.
        current_path: path relative to `repo_path`; child paths are built as
            ``current_path + '/' + name``, so keys look like ``./pkg/mod.py``.

    Returns:
        Dict mapping each relative file path to the
        ``(code_tokens, code_functions)`` pair produced by
        ``_read_code_tokens_and_functions``. Paths that are neither a
        directory nor a ``.py`` file contribute nothing.

    A file that fails to tokenize is skipped and its path is appended to
    ``badfile.txt`` in the current working directory.
    """
    path = os.path.join(repo_path, current_path)
    if os.path.isdir(path):
        data = {}
        for x in os.listdir(path):
            data.update(_read_codes(repo_path, current_path + '/' + x))
        return data
    if not os.path.isfile(path) or not path.endswith('.py'):
        return {}
    try:
        return {current_path: _read_code_tokens_and_functions(path)}
    except Exception:
        # Best-effort: record the unreadable file and move on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # stop the run instead of being logged as a bad file.
        with open('badfile.txt', 'a') as f:
            print(path, file = f)
        return {}

def read_repos(repos, filepath, num_process = 1):
    """Tokenize several repositories in parallel.

    One ``_read_codes`` job is submitted per entry of `repos`, rooted at
    ``filepath + '/' + repo``.

    Returns:
        List with one ``_read_codes`` result per repository, in the same
        order as `repos`.
    """
    workers = Pool(processes = num_process)
    pending = [
        workers.apply_async(_read_codes, (filepath + '/' + repo, '.'))
        for repo in repos
    ]
    workers.close()
    workers.join()
    # All jobs are done after join(); get() only unwraps the stored results.
    return [job.get() for job in pending]

# def read_repos(repos, filepath, num_process = 1):
#     dataset = []
#     for _, repo in enumerate(repos):
#         dataset.append(_read_codes(filepath + '/' + repo, '.'))
#     return dataset

In [4]:
class CodeFeatureWorker():
    """Computes one embedding vector per repository stored under ``save_path``.

    Calling the worker with a list of repository names tokenizes their
    ``.py`` files, embeds every function/class found with the configured
    code model and averages those embeddings per repository.

    Expected ``cfg`` keys: ``save_path``, ``keep_code``, ``github_tokens``,
    ``num_process``, ``device`` and ``model_cfg`` (itself holding
    ``code_model``, ``hidden_dim`` and ``code_trainable``).
    """

    # Repositories with more functions than this get the sentinel -1.
    MAX_FUNCTIONS = 1600
    # Number of code tokens of a function kept as its context string.
    MAX_CONTEXT_TOKENS = 512

    def __init__(self, cfg):
        """Store the settings from `cfg`, create ``save_path`` and load the model."""
        self.save_path = cfg['save_path']
        # makedirs also creates missing parent directories, which os.mkdir cannot.
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path, exist_ok = True)

        self.keep_code = cfg['keep_code']
        self.github_tokens = cfg['github_tokens']
        self.num_process = cfg['num_process']
        self.device = cfg['device']

        self._init_model(cfg['model_cfg'])

    def _init_model(self, cfg):
        """Load the pretrained config, tokenizer and model named by
        ``cfg['code_model']`` and create the working copies.

        The ``*0`` attributes are kept untouched so that `_restart_model`
        can restore a fresh state later.
        """
        self.code_trainable = cfg['code_trainable']
        self.hidden_dim = cfg['hidden_dim']
        self.model_config0 = RobertaConfig.from_pretrained(cfg['code_model'])
        self.tokenizer0 = RobertaTokenizer.from_pretrained(cfg['code_model'])
        self.code_model0 = RobertaModel.from_pretrained(cfg['code_model'])

        self._copy_pristine_model()

    def _copy_pristine_model(self):
        """(Re)build the working config/tokenizer/model from the pristine copies."""
        self.model_config = copy.deepcopy(self.model_config0)
        self.tokenizer = copy.deepcopy(self.tokenizer0)
        self.code_model = copy.deepcopy(self.code_model0).to(self.device)
        for param in self.code_model.parameters():
            param.requires_grad = self.code_trainable

    def _restart_model(self):
        """Discard the working copies and replace them with fresh ones.

        The old objects are deleted before the new copies are created.
        """
        del self.model_config
        del self.tokenizer
        del self.code_model
        self._copy_pristine_model()

    def _if_downloaded(self, repo_names):
        """Return a boolean numpy array: True where ``save_path/<name>`` exists."""
        return np.array([
            os.path.exists(
                os.path.join(self.save_path, repo_name)
            ) for repo_name in repo_names
        ])

    def _embed_functions(self, context_tokens, batch_size = 32):
        """Run the code model over `context_tokens` in batches.

        Args:
            context_tokens: mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries that can be sliced along their
                first dimension.
            batch_size: number of rows fed to the model per forward pass.

        Returns:
            The ``'pooler_output'`` of every batch concatenated along dim 0.
            Gradients are not tracked.
        """
        func_embs = []
        total_size = len(context_tokens['input_ids'])
        for i in range(0, total_size, batch_size):
            end_index = min(total_size, i + batch_size)
            with torch.no_grad():
                func_embs.append(
                    self.code_model(input_ids = context_tokens['input_ids'][i : end_index],
                                    attention_mask = context_tokens['attention_mask'][i : end_index])
                    ['pooler_output'])
        func_embs = torch.cat(func_embs, dim = 0)
        return func_embs

    def _compute_embs(self, repo_codes):
        """Compute one embedding per repository.

        Args:
            repo_codes: iterable of dicts mapping a file path to a
                ``(tokens, functions)`` pair, where each function is
                ``(name, start, end, indent)``.

        Returns:
            List with one entry per repository: the embedding as a list of
            floats, or -1 when the repository has no functions or more than
            ``MAX_FUNCTIONS`` of them.
        """
        repo_embs = []
        for repo_code in tqdm(repo_codes):
            func_names = []
            func_contexts = []
            for file in repo_code:
                tokens, functions = repo_code[file]
                for func_name, S, T, _ in functions:
                    func_names.append(func_name)
                    # Context = the function's tokens, capped at MAX_CONTEXT_TOKENS.
                    func_contexts.append(' '.join(tokens[S : min(T, S + self.MAX_CONTEXT_TOKENS)]))
            if (len(func_names) == 0 or len(func_names) > self.MAX_FUNCTIONS):
                repo_embs.append(-1)
                continue
            tprint('s3.0 funcs got : %d' % len(func_names))

            # Start from pristine weights and vocabulary for every repository.
            self._restart_model()
            tprint('s3.05 restart done')

            # Register the function names as tokens and grow the embedding
            # matrix to match the enlarged vocabulary.
            _ = self.tokenizer.add_tokens(func_names)
            self.code_model.resize_token_embeddings(len(self.tokenizer))
            tprint('s3.1 tokenize')
            context_tokens = self.tokenizer(func_contexts, return_tensors="pt",
                                            truncation=True, padding='max_length').to(self.device)
            tprint('s3.1 tokenize done')
            # First pass: embed each function and store the result as the
            # word embedding of that function's name token.
            # NOTE(review): func_names may contain repeated names; which
            # context's embedding ends up in the shared row is not controlled
            # here - TODO confirm this is acceptable.
            func_embs = self._embed_functions(context_tokens)
            func_token_ids = self.tokenizer.convert_tokens_to_ids(func_names)
            with torch.no_grad():
                self.code_model.embeddings.word_embeddings.weight[func_token_ids] = func_embs
            tprint('s3.3 first run & weighted')

            # Second pass with the updated name embeddings; the repository
            # embedding is the mean over its functions.
            repo_embs.append(self._embed_functions(context_tokens).mean(0).to('cpu').tolist())
            tprint('s3.4 second run\n')

        return repo_embs

    def __call__(self, repo_names):
        """Embed the repositories named in `repo_names`.

        Returns:
            List of ``[repo_name, embedding]`` pairs in input order, where
            ``embedding`` is what `_compute_embs` produced for that repository.
        """
        repo_names = np.array(repo_names)
        # Step 1 (currently disabled):
        # Check whether the repos are already under self.save_path.
        # If not, download them.
#         tprint('step1')
#         download_repo_names = repo_names[~self._if_downloaded(repo_names)]
#         download_repos(download_repo_names, self.github_tokens, self.save_path, self.num_process)

        # Step 2:
        # Convert repo files to tokens
        tprint('step2')
        codes = read_repos(repo_names, self.save_path, self.num_process)

        # Step 3:
        # Use the model to output code feature embs
        tprint('step3')
        repo_embs = self._compute_embs(codes)

        if self.keep_code == False:
            import shutil  # local import: not part of the notebook's import cell
            for repo_name in repo_names:
                # rmtree instead of a shell `rm -r -f` string: repository
                # names are never interpreted by a shell.
                shutil.rmtree(self.save_path + '/' + repo_name, ignore_errors = True)
            clear(self.save_path)

        return [[x, y] for x, y in zip(repo_names, repo_embs)]

In [5]:
# if __name__ == '__main__':
# GitHub tokens are credentials and must not be stored in the notebook.
# Supply them through the GITHUB_TOKENS environment variable as a
# comma-separated list. Tokens that were committed in earlier revisions of
# this cell should be treated as leaked and revoked.
github_tokens = [
    token.strip()
    for token in os.environ.get('GITHUB_TOKENS', '').split(',')
    if token.strip()
]

cfg = {
    'save_path': './repos/',
    'keep_code': True,
    'github_tokens': github_tokens,
    'num_process': 10,
    # Fall back to the CPU when no CUDA device is available.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'model_cfg': {
        'hidden_dim': 768,
        'code_model': 'microsoft/codebert-base',
        'code_trainable': False,
    }
}
worker = CodeFeatureWorker(cfg)
# result = worker(['AnonymousWorld123/Q-Layer'])

In [6]:
def save_result(result, save_dir, save_file):
    """Write `result` to `save_file` as a single JSON document.

    Args:
        result: JSON-serializable object.
        save_dir: directory created (including missing parents) when it
            does not exist yet.
        save_file: path of the file to write; an existing file is overwritten.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok = True)
    # The context manager closes the file even if serialization fails.
    with open(save_file, 'w') as f:
        json.dump(result, fp = f)

# Embed the first 2500 repositories of each selected topic file and store
# the [repo_name, embedding] pairs under ../data.
for root, dirs, files in os.walk('../github-topic/'):
    for file in files:
        if file not in ['android.pk', 'deep-learning.pk']:
            continue
        print('[' + file + ']')
        # os.path.join keeps the path correct for files found in
        # sub-directories, where `root` has no trailing slash.
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # topic files from a trusted source.
        with open(os.path.join(root, file), 'rb') as topic_file:
            items = pickle.load(topic_file)
        repo_full_names = [x['full_name'] for x in items]

        result = worker(repo_full_names[0 : 2500])

        # NOTE: despite the .jsonl extension, save_result writes one JSON array.
        save_result(result, '../data', '../data/' + file[ : -3] + '.jsonl')

[deep-learning.pk]


100%|█████████████████████████████████████| 2500/2500 [1:28:04<00:00,  2.11s/it]


[android.pk]


100%|█████████████████████████████████████| 2500/2500 [1:24:01<00:00,  2.02s/it]
