In [7]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import json
import torch
import pickle
import random
import tokenize
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool

# import logging
# logger = logging.getLogger('root')

import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoConfig

In [8]:
def delete(path):
    """Remove `path` from disk, whatever kind of entry it is.

    Symbolic links are unlinked without following them, directories are
    emptied recursively and then removed, and anything else must be a
    regular file (asserted) and is removed.
    """
    # A link is checked first so that a link pointing at a directory is
    # removed itself instead of having its target emptied.
    if os.path.islink(path):
        os.unlink(path)
        return

    if not os.path.isdir(path):
        assert os.path.isfile(path)
        os.remove(path)
        return

    for entry in os.listdir(path):
        delete(path + '/' + entry)
    os.rmdir(path)

def clear(path):
    """Prune everything under `path` that is not a Python source file.

    Symbolic links, entries whose name starts with '.', files that do not
    end in '.py', and directories left without any '.py' file are deleted.

    Returns the number of '.py' files kept under `path`.
    """
    if os.path.islink(path):
        delete(path)
        return 0

    if not os.path.isdir(path):
        assert os.path.isfile(path)
        if path.endswith('.py'):
            return 1
        delete(path)
        return 0

    kept = 0
    for entry in os.listdir(path):
        child = path + '/' + entry
        if entry[0] == '.':
            # Hidden entries (e.g. '.git') are dropped without inspection.
            delete(child)
        else:
            kept += clear(child)
    # A directory with no surviving Python file is removed as well.
    if kept == 0:
        delete(path)
    return kept

def _download_repo(repo_full_name, token, save_path):
    """Shallow-clone one repository and prune it down to its Python files.

    Args:
        repo_full_name: repository name in 'owner/repo' form.
        token: access token embedded in the clone URL.
        save_path: directory under which '<save_path>/<repo_full_name>' is
            created.

    Returns:
        None. Failures are reported on stdout instead of being raised, so
        one bad repository does not abort a batch.
    """
    # Local import keeps this block self-contained.
    import subprocess

    target = f"{save_path}/{repo_full_name}"
    url = f"https://{token}@github.com.cnpmjs.org/{repo_full_name}"

    # An argument list (no shell) means repository names and paths are never
    # interpreted by a shell, unlike the former os.system() string.
    try:
        status = subprocess.run(
            ["git", "clone", "--quiet", "--depth", "1", url, target],
            check = False,
        ).returncode
    except OSError as error:
        # Typically: the git executable is not available.
        print(f"git clone could not be started for {repo_full_name}: {error}")
        return

    if status != 0:
        # The URL is deliberately not printed: it contains the token.
        print(f"git clone failed for {repo_full_name} (exit status {status})")

    # Only prune when there is something on disk; clear() asserts on a
    # missing path, which used to raise after every failed clone.
    if os.path.lexists(target):
        clear(save_path + '/' + repo_full_name)

def download_repos(repo_full_names, tokens, save_path, num_process = 1):
    """Download several repositories in parallel worker processes.

    Args:
        repo_full_names: iterable of 'owner/repo' names.
        tokens: non-empty sequence of access tokens; one is picked at random
            for every repository.
        save_path: directory the repositories are cloned into.
        num_process: number of worker processes in the pool.

    Returns:
        None. Blocks until every scheduled download has finished.

    Raises:
        IndexError: from random.choice() when `tokens` is empty and at least
            one repository is requested.
    """
    def _report_failure(error):
        # Invoked in the parent process; without it an exception raised in a
        # worker is dropped because the AsyncResult is never inspected.
        print(f'Repository download failed: {error!r}')

    pool = Pool(num_process)
    try:
        for repo_full_name in repo_full_names:
            token = random.choice(tokens)
            pool.apply_async(
                _download_repo,
                args = (repo_full_name, token, save_path),
                error_callback = _report_failure,
            )
        pool.close()
        pool.join()
    finally:
        # Releases the workers even when scheduling itself raised; harmless
        # after a normal close()/join().
        pool.terminate()

# def download_repos(repo_full_names, tokens, save_path, num_process = 1):
#     for repo_full_name in tqdm(repo_full_names):
#         token = random.choice(tokens)
#         _download_repo(repo_full_name, token, save_path)

In [10]:
def _read_code_tokens_and_functions(path):
    code_tokens = []
    function_stack = []
    code_functions = []
    indent_number = 0
    print(path)
    with open(path, 'rb') as input_file:
        tokenGenerator = tokenize.tokenize(input_file.readline)
        for token in tokenGenerator:
            if (token.type in [0, 59, 60, 62]): # COMMENT
                pass
            elif (token.type in [4, 61]): # NEWLINE
                pass
            elif (token.type == 5): # INDENT
                code_tokens.append('{')
                indent_number += 1
            elif (token.type == 6): # DEDENT
                code_tokens.append('}')
                indent_number -= 1
                if (function_stack != [] and indent_number == function_stack[-1][3]):
                    code_functions.append(function_stack.pop())
                    code_functions[-1][2] = len(code_tokens) + 1
            elif (token.type in [1, 2, 3, 54]): # NAME NUMBER STRING OP
                code_tokens.append(token.string)
                if (token.string in ['def', 'class']):
                    function_stack.append([0, len(code_tokens) - 1, 0, indent_number])
                    print(token)
                elif (function_stack != [] and function_stack[-1][0] == 0):
                    function_stack[-1][0] = token.string
            else:
                assert(False)
    return code_tokens, code_functions

def _read_codes(repo_path, current_path = '.'):
    data = {}
    path = os.path.join(repo_path, current_path)
    if (os.path.isdir(path)):
        for x in os.listdir(path):
            data.update(_read_codes(repo_path, current_path + '/' + x))
    elif os.path.isfile(path):
        if (path[-3 :] != '.py'):
            return {}
        try:
            data[current_path] = _read_code_tokens_and_functions(path)
        except:
            f = open('badfile.txt', 'a')
            print(path, file = f)
            f.close()
            return {}
    else:
        return {}
    return data

# TODO
# def read_repos(repos, filepath, num_process = 1):
#     dataset = []
#     pool = Pool(processes = num_process)
#     for _, repo in enumerate(repos):
#         dataset.append(pool.apply_async(_read_codes, (filepath + '/' + repo, '.')))
#     pool.close()
#     pool.join()
#     for i in range(len(dataset)):
#         dataset[i] = dataset[i].get()
#     return dataset

def read_repos(repos, filepath, num_process = 1):
    """Parse every repository in `repos` found under `filepath`.

    Repositories are processed one after another; `num_process` is accepted
    but not used by this sequential implementation.

    Returns a list with one dict per repository, in the order of `repos`,
    as produced by _read_codes.
    """
    return [_read_codes(filepath + '/' + repo_name, '.') for repo_name in repos]

In [11]:
class CodeFeatureWorker():
    """Reads downloaded repositories and (eventually) embeds their code.

    The worker is configured from a dict with the keys 'save_path',
    'keep_code', 'github_tokens', 'num_process' and 'device' (plus
    'model_cfg' once model initialisation is enabled).

    NOTE(review): model initialisation is currently disabled in __init__ and
    __call__ returns early with [0]; the embedding path below that return is
    not exercised.
    """

    def __init__(self, cfg):
        """Store the configuration and make sure the download directory exists."""
        self.save_path = cfg['save_path']
        # makedirs also creates missing parent directories and tolerates the
        # directory appearing between the check and the call.
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path, exist_ok = True)

        self.keep_code = cfg['keep_code']
        self.github_tokens = cfg['github_tokens']
        self.num_process = cfg['num_process']
        self.device = cfg['device']

#         self._init_model(cfg['model_cfg']) # TODO

    def _init_model(self, cfg):
        """Load the pretrained code model/tokenizer and build the LSTM aggregator.

        Args:
            cfg: dict with 'hidden_dim', 'code_model' (pretrained model name),
                'code_trainable' and 'rnn_trainable' (requires_grad flags).
        """
        self.hidden_dim = cfg['hidden_dim']
        self.model_config = AutoConfig.from_pretrained(cfg['code_model'])
        self.tokenizer = AutoTokenizer.from_pretrained(cfg['code_model'])
        self.code_model = AutoModel.from_pretrained(cfg['code_model']).to(self.device)
        for param in self.code_model.parameters():
            param.requires_grad = cfg['code_trainable']

        self.rnn = nn.LSTM(self.hidden_dim, self.hidden_dim, num_layers = 1, batch_first = True).to(self.device)
        for param in self.rnn.parameters():
            param.requires_grad = cfg['rnn_trainable']

    def _if_downloaded(self, repo_names):
        """Return a boolean numpy array: True where '<save_path>/<repo_name>' exists."""
        return np.array([
            os.path.exists(
                os.path.join(self.save_path, repo_name)
            ) for repo_name in repo_names
        ])

    def _aggregate_code_embs(self, file_names, code_embs):
        """Aggregate per-file embeddings into repository embeddings.

        Args:
            file_names: currently unused.
            code_embs: list of tensors, one per repository, that are padded
                to a common length and fed to the LSTM.

        Returns:
            The LSTM output at the last position of each padded sequence.
        """
        code_embs = nn.utils.rnn.pad_sequence(code_embs, batch_first = True)
        res, (_, _) = self.rnn(code_embs)
        return res[:, -1, :]

    def _compute_embs(self, repo_codes):
        """Compute one embedding per repository.

        Args:
            repo_codes: list of dicts (one per repository) mapping file names
                to file contents.

        Returns:
            A list with one entry per repository: a flat list of floats, or
            the placeholder 0 for repositories with no files or more than
            2000 files.

        NOTE(review): read_repos yields (code_tokens, code_functions) tuples
        as dict values; confirm the tokenizer call below accepts that input.
        """
        repo_embs = []
        for repo in repo_codes:
            keys, values = list(repo.keys()), list(repo.values())
            if (len(values) == 0 or len(values) > 2000):
                repo_embs.append(0)
                continue
            # compute code emb from files
            tokens = self.tokenizer(values, return_tensors="pt", truncation=True, padding='max_length').to(self.device)

            # The model is run on one file at a time (slices of size 1)
            # instead of on the whole batch of files at once.
            code_embs = []
            for i in range(tokens['input_ids'].size(0)):
                code_embs.append(self.code_model(input_ids = tokens['input_ids'][i : i + 1],
                                                 attention_mask = tokens['attention_mask'][i : i + 1])['pooler_output'])
            code_embs = [torch.cat(code_embs, dim = 0)]

            # aggregate embs of all files to obtain repo emb
            repo_embs.append(self._aggregate_code_embs(keys, code_embs).view(-1).tolist())

        return repo_embs

    def __call__(self, repo_names):
        """Process the given repositories.

        Args:
            repo_names: sequence of 'owner/repo' names located under
                `save_path`.

        Returns:
            Currently always [0] (see the early return below); the code after
            it would return the list produced by _compute_embs.
        """
        repo_names = np.array(repo_names)
        # Step 1:
        # Check whether we have the repos in self.keep_code.
        # If not, download repos
#         download_repo_names = repo_names[~self._if_downloaded(repo_names)]
#         download_repos(download_repo_names, self.github_tokens, self.save_path, self.num_process)

        # Step 2:
        # Convert repo files to tokens
        codes = read_repos(repo_names, self.save_path, self.num_process)

        # NOTE(review): temporary short-circuit. Everything below is
        # unreachable until this return is removed and _init_model is enabled.
        return [0]

        # Step 3:
        # Use the model to output code feature embs
        embs = self._compute_embs(codes)

        if self.keep_code == False:
            # rmtree instead of a shell 'rm -r -f' string: repository names
            # are never passed through a shell.
            import shutil
            for repo_name in repo_names:
                shutil.rmtree(self.save_path + '/' + repo_name, ignore_errors = True)

        return embs

In [12]:
# if __name__ == '__main__':
# Worker configuration.
# GitHub tokens are credentials and must not be stored in the notebook. They
# are read from the GITHUB_TOKENS environment variable as a comma-separated
# list. The tokens that used to be written here were exposed in the file and
# should be revoked.
cfg = {
    'save_path': './repos/',
    'keep_code': True,
    'github_tokens': [
        token.strip()
        for token in os.environ.get('GITHUB_TOKENS', '').split(',')
        if token.strip()
    ],
    'num_process': 5,
    'device': 'cuda',
    'model_cfg': {
        'hidden_dim': 768,
        'code_model': 'microsoft/codebert-base',
        'code_trainable': False,
        'rnn_trainable': True,
    }
}
worker = CodeFeatureWorker(cfg)
# tmp = worker(['AnonymousWorld123/Q-Layer'])

In [13]:
def save_result(result, save_dir, save_file):
    """Write `result` as JSON to `save_file`, creating `save_dir` if needed.

    Args:
        result: JSON-serialisable object.
        save_dir: directory to create when it does not exist yet (missing
            parent directories are created too).
        save_file: path of the file to write; it is overwritten if present.

    Raises:
        TypeError: from json.dump when `result` is not JSON-serialisable.
        OSError: when the directory or file cannot be created.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok = True)
    # `with` closes the handle even when json.dump raises.
    with open(save_file, 'w') as f:
        json.dump(result, fp = f)

# Run the worker on the repositories listed in the selected topic pickles.
for root, dirs, files in os.walk('../github-topic/'):
    for file in files:
        # Only these topic files are processed; everything else is skipped.
        if file not in ['android.pk', 'deep-learning.pk']:
            continue
        print('[' + file + ']')
        # os.path.join keeps the path valid for files found in
        # subdirectories, where `root` has no trailing separator.
        # NOTE: pickle.load can execute arbitrary code; only load topic
        # files that come from a trusted source.
        with open(os.path.join(root, file), 'rb') as topic_file:
            items = pickle.load(topic_file)
        repo_full_names = [x['full_name'] for x in items]

# TODO
#         result = []
#         for i in tqdm(range(250)):
#             result.extend(worker(repo_full_names[i * 10 : i * 10 + 10]))

        result = worker(repo_full_names[5 : 6])
#         save_result(result, '../data', '../data/' + file[ : -3] + '.jsonl')

[deep-learning.pk]
./repos//nikitasrivatsan/DeepLearningVideoGames/./deep_q_network.py
TokenInfo(type=1 (NAME), string='def', start=(25, 0), end=(25, 3), line='def weight_variable(shape):\n')
['weight_variable', 64, 92, 0]
['def', 'weight_variable', '(', 'shape', ')', ':', '{', 'initial', '=', 'tf', '.', 'truncated_normal', '(', 'shape', ',', 'stddev', '=', '0.01', ')', 'return', 'tf', '.', 'Variable', '(', 'initial', ')', '}']
TokenInfo(type=1 (NAME), string='def', start=(29, 0), end=(29, 3), line='def bias_variable(shape):\n')
['bias_variable', 91, 119, 0]
['def', 'bias_variable', '(', 'shape', ')', ':', '{', 'initial', '=', 'tf', '.', 'constant', '(', '0.01', ',', 'shape', '=', 'shape', ')', 'return', 'tf', '.', 'Variable', '(', 'initial', ')', '}']
TokenInfo(type=1 (NAME), string='def', start=(33, 0), end=(33, 3), line='def conv2d(x, W, stride):\n')
['conv2d', 118, 158, 0]
['def', 'conv2d', '(', 'x', ',', 'W', ',', 'stride', ')', ':', '{', 'return', 'tf', '.', 'nn', '.', 'conv2d', 

./repos//kivy/buildozer/./buildozer/libs/__init__.py
./repos//kivy/buildozer/./buildozer/__init__.py
TokenInfo(type=1 (NAME), string='class', start=(75, 0), end=(75, 5), line='class ChromeDownloader(FancyURLopener):\n')
['ChromeDownloader', 298, 313, 0]
['class', 'ChromeDownloader', '(', 'FancyURLopener', ')', ':', '{', 'version', '=', '(', "'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '", "'(KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36'", ')', '}']
TokenInfo(type=1 (NAME), string='class', start=(84, 0), end=(84, 5), line='class BuildozerException(Exception):\n')
['BuildozerException', 319, 330, 0]
['class', 'BuildozerException', '(', 'Exception', ')', ':', '{', "'''\n    Exception raised for general situations buildozer cannot process.\n    '''", 'pass', '}']
TokenInfo(type=1 (NAME), string='class', start=(91, 0), end=(91, 5), line='class BuildozerCommandException(BuildozerException):\n')
['BuildozerCommandException', 329, 340, 0]
['class', 'BuildozerCommandException', 