In [1]:
import os
import json
import random
import tokenize
from tqdm import tqdm
import multiprocessing

from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

topics = ['algorithm', 'aws', 'bitcoin', 'bot', 'compiler', 'covid-19',
          'learning', 'django', 'git', 'google', 'graphql', 'kubernetes']

class arguments(object):
    def __init__(self):
        pass
args = arguments()
args.total_length = 512

In [2]:
def cnt_python(path):
    if (os.path.isdir(path)):
        ans = 0
        for x in os.listdir(path):
            ans += cnt_python(path + '/' + x)
        return ans
    else:
        assert(os.path.isfile(path))
        return 0 + (path[-3 :] == '.py')

def get_repos(filepath, itempath, structurepath):
    repos = []
    for dir1 in os.listdir(filepath):
        assert(os.path.isdir(filepath + '/' + dir1))
        if (not os.path.isdir(itempath + '/' + dir1)):
            continue
        for dir2 in os.listdir(filepath + '/' + dir1):
            assert(os.path.isdir(filepath + '/' + dir1 + '/' + dir2))
            if (not os.path.isdir(itempath + '/' + dir1 + '/' + dir2)):
                continue
            cnt = cnt_python(filepath + '/' + dir1 + '/' + dir2)
            if (cnt > 1000):
                continue
            repos.append(dir1 + '/' + dir2)
    return repos

filepath = '../data/github-repos/files'
itempath = '../data/github-repos/item'
structurepath = '../data/github-repos/structure'
repos = get_repos(filepath, itempath, structurepath)

In [3]:
def read_label(repos, filepath):
    labels = []
    for repo in tqdm(repos):
        assert(os.path.isfile(filepath + '/' + repo + '/' + 'item.jsonl'))
        f = open(filepath + '/' + repo + '/' + 'item.jsonl', 'r')
        item = json.loads(f.readline())
        labels.append(item['topics'])
        f.close()
    return labels

labels = read_label(repos, '../data/github-repos/item')
for i in range(len(labels)):
    labels[i] = ['algorithm' if x == 'algorithms' else x for x in labels[i]]
    labels[i] = ['learning' if 'learn' in x else x for x in labels[i]]
    labels[i] = list(set(labels[i]))
labels01 = [[x in label for x in topics] for label in labels]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23088/23088 [00:01<00:00, 13156.71it/s]


In [4]:
repos = [x for x, y in zip(repos, labels01) if sum(y) != 0]
labels = read_label(repos, '../data/github-repos/item')
for i in range(len(labels)):
    labels[i] = ['algorithm' if x == 'algorithms' else x for x in labels[i]]
    labels[i] = ['learning' if 'learn' in x else x for x in labels[i]]
    labels[i] = list(set(labels[i]))
labels01 = [[x in label for x in topics] for label in labels]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12272/12272 [00:00<00:00, 13217.44it/s]


In [5]:
def label_cnter(labels):
    cnter = dict()
    for x in labels:
        for y in x:
            if (y in cnter):
                cnter[y] += 1
            else:
                cnter[y] = 1
    for x in topics:
        print(x, cnter[x])
# label_cnter(labels)

In [6]:
def get_tokens(code, do_remove, tmp_file_name, path):
    if (do_remove):
        try:
            code = remove_comments_and_docstrings(code, 'python')
        except:
            pass
    output_file = open(tmp_file_name, 'w')
    print(code, file = output_file)
    output_file.close()
    
    tokens = []
    f = open(tmp_file_name, 'rb')
    tokenGenerator = tokenize.tokenize(f.readline)
    for token in tokenGenerator:
        if (token.type in [0, 59, 60, 62]): # COMMENT
            pass
        elif (token.type in [4, 61]): # NEWLINE
            pass
        elif (token.type == 5): # INDENT
            pass
        elif (token.type == 6): # DEDENT
            pass
        elif (token.type in [1, 2, 3, 54]): # NAME NUMBER STRING OP
            tokens.append(token.string)
        else:
            assert(False)
        if (len(tokens) > 1.5 * args.total_length):
            break
    f.close()
    return tokens

def tokenized_code(tokens):
    if (tokens != []):
        tokens = [tokenizer.tokenize(tokens[0])] \
               + [tokenizer.tokenize('@ ' + x)[1 :] for x in tokens[1 :]]
        tokens = [y for x in tokens for y in x]
        tokens = tokens[: args.total_length - 2]
    code_ids = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens)
    padding_length = args.total_length - len(code_ids)
    code_ids += [tokenizer.pad_token_id] * padding_length
    return code_ids

def search(path, idx):
    data = []
    if (os.path.isdir(path)):
        for x in os.listdir(path):
            data.extend(search(path + '/' + x, idx))
    else:
        assert(os.path.isfile(path))
        if (path[-3 :] != '.py'):
            return []
        input_file = open(path, 'r')
        try:
            code = input_file.read()
        except:
            f = open('../data/badfile.txt', 'a')
            print(path, file = f)
            f.close()
            return []
        tmp_file_name = str(idx) + '.py'
        try:
            tokens = get_tokens(code, True, tmp_file_name, path)
        except:
#             tokens = get_tokens(code, False, tmp_file_name, path)
            try:
                tokens = get_tokens(code, False, tmp_file_name, path)
            except:
                print(path)
                os.system('rm ./' + tmp_file_name)
                return []
        os.system('rm ./' + tmp_file_name)
        if (tokens != []):
            data.append(tokenized_code(tokens))
        input_file.close()
    return data

def read_data(repos, filepath):
    print(len(repos))
    dataset = []
    pool = multiprocessing.Pool(processes = 50)
    for idx, repo in enumerate(repos):
        dataset.append(pool.apply_async(search, (filepath + '/' + repo, idx, )))
    pool.close()
    pool.join()
    for i in range(len(dataset)):
        dataset[i] = dataset[i].get()
    return dataset
    
#     dataset = []
#     for idx, repo in enumerate(tqdm(repos)):
#         data = search(filepath + '/' + repo, idx)
#         dataset.append(data)
#     return dataset

In [7]:
def get_graph(path):
    nodes = {}
    file = open(path + '/node.txt')
    context = file.read()
    context = context.strip().split('\n')
    file.close()
    for x in context[1 :]:
        try:
            node_id = int(x[0 : x.find(',')])
            node_type = x[x.find(',') + 1 : x.find(',', x.find(',') + 1)]
            node_name = x[x.find(',', x.find(',') + 1) + 1 : x.rfind(',')]
            node_father = int(x[x.rfind(',') + 1 :])
        except:
            print(x)
            print(x[0 : x.find(',')])
            print(x[x.find(',') + 1 : x.find(',', x.find(',') + 1)])
            print(x[x.find(',', x.find(',') + 1) + 1 : x.rfind(',')])
            print(x[x.rfind(',') + 1 :])
            assert(False)
        assert(node_id not in nodes)
        if (node_name.find('data/github-repos/files') != -1):
            node_name = node_name[node_name.find('data/github-repos/files') + 24 :]
        nodes[node_id] = (node_type, node_name, node_father)
    
    edges = []
    file = open(path + '/edge.txt')
    context = file.read()
    context = context.strip().split('\n')
    file.close()
    for x in context[1 :]:
        source = int(x[: x.find(',')])
        target = int(x[x.find(',') + 1 : x.find(',', x.find(',') + 1)])
        if ((source in nodes) and (target in nodes)):
            edges.append([source, target])
    for node_id in nodes:
        node_father = nodes[node_id][2]
        if (node_father in nodes):
            edges.append([node_father, node_id])
        else:
            assert(node_father == -1)
    return nodes, edges

def cut_graph(nodes, edges):
    keep = set()
    left = set()
    E = {}
    for x in nodes:
        E[x] = []
        left.add(x)
    for u, v in edges:
        E[u].append(v)
        E[v].append(u)
    
    while (len(keep) < min(200, len(nodes))):
        root = random.sample(left, 1)[0]
        keep.add(root)
        left.remove(root)
        flag = True
        while (flag):
            flag = False
            consider = set()
            for u in keep:
                for v in E[u]:
                    consider.add(v)
            consider = consider & left
            consider = random.sample(consider, min(len(consider), 200 - len(keep)))
            for x in consider:
                keep.add(x)
                left.remove(x)
                flag = True
    
    new_nodes = []
    new_edges = []
    idmap = {}
    for idx, x in enumerate(keep):
        new_nodes.append(nodes[x][1])
        idmap[x] = idx
    for (u, v) in edges:
        if ((u in idmap) and (v in idmap)):
            new_edges.append([idmap[u], idmap[v]])
    return new_nodes, new_edges

def read_graph(repos, filepath):
    graphs = []
    for repo in tqdm(repos):
        nodes = []
        edges = []
        if (os.path.isfile(structurepath + '/' + repo + '/node.txt')):
            assert(os.path.isfile(structurepath + '/' + repo + '/edge.txt'))
            nodes, edges = get_graph(structurepath + '/' + repo)
            nodes, edges = cut_graph(nodes, edges)
        graphs.append([nodes, edges])
    return graphs

In [8]:
def read_description(repos, filepath):
    description = []
    for repo in tqdm(repos):
        if (not os.path.isfile(filepath + '/' + repo + '/' + 'description.md')):
            context = ''
        else:
            f = open(filepath + '/' + repo + '/' + 'description.md', 'r')
            context = f.read()
            f.close()
        description.append(sentence_model.encode(context).tolist())
    return description

def read_readme(repos, filepath):
    readme = []
    for repo in tqdm(repos):
        if (not os.path.isfile(filepath + '/' + repo + '/' + 'README.md')):
            context = ''
        else:
            try:
                f = open(filepath + '/' + repo + '/' + 'README.md', 'r')
                context = f.read()
                f.close()
            except:
                context = ''
        readme.append(sentence_model.encode(context).tolist())
    return readme

In [9]:
dataset = read_data(repos, '../data/github-repos/files')
graphs = read_graph(repos, '../data/github-repos/item')
description = read_description(repos, '../data/github-repos/item')
readme = read_readme(repos, '../data/github-repos/files')

12272
../data/github-repos/files/Snivyn/raffle-scripts/GetMyLevisJordans%20-%20Levis%20x%20Air%20Jordan%204%20-Black-%20-White/main.py
../data/github-repos/files/SoumanRoy/MachineLearning/Linear_Regression_Gradient_Descent/gradient_descent.py
../data/github-repos/files/hariharanragothaman/problemsets/leetcode/0509_fibonacci_number.py
../data/github-repos/files/ojasvin/Data-Structures-and-Algorithms-codes/CS_LAB_MA252/graphalgorithms/Bipartite/Bipartite.py
../data/github-repos/files/Mendes1302/Python-Learning/Learn/WAGE ADJUSTMENT.py
../data/github-repos/files/Block-Chen/blocksdk-python/BlockSDK/market.py
../data/github-repos/files/vwang0/Leetcode_Solutions/Algorithms_easy/0136. SingleNumber.py
../data/github-repos/files/CUBigDataClass/Indian-Premier-League/code/player_birth_info.py
../data/github-repos/files/Badhansen/python-working-dictionary/Lynda%20Python%203/18%20Debugging/saytime-errors.py
../data/github-repos/files/sk-g/Leetcode/python/191. Number of 1 Bits.py
../data/github-repo

../data/github-repos/files/liadbiz/Leetcode-Solutions/src/python/dynamic_programming/guess_number_higher_or_lower_2.py
../data/github-repos/files/jseric/stem_games-fesbovci/source_code/level_01/pillars/pillars.py
../data/github-repos/files/yennanliu/CS_basics/leetcode_python/Dynamic_Programming/coin-change.py
../data/github-repos/files/jbrower95/crop/ropgadget/ropchain/arch/ropmaker.py
../data/github-repos/files/yennanliu/CS_basics/leetcode_python/Design/design_search_autocomplete_system.py
../data/github-repos/files/yennanliu/CS_basics/workspace/Pipeline/gameApp/etl/batch/load.py
../data/github-repos/files/AbhiSaphire/Competitive-Programming-Solutions/Coding%20Club%20India/Asked%20Amazon%20Interview%20Questions/MinimumPlatforms.py
../data/github-repos/files/DeachSword/CHRLINE/examples/remove_e2ee_key.py
../data/github-repos/files/DeepNinja07x/Python_Scripts/Algorithms/kadaneAlgorithm.py
../data/github-repos/files/DeepNinja07x/Python_Scripts/Basic%20Scripts/CurrencyConverter.py
../data

../data/github-repos/files/fsiddh/Python-and-Django-Full-Stack-Web-Developer-Bootcamp/Section%201%20-%20Course%20Introduction/DJANGO_COURSE_1.xx/Python_Level_Two/Part4_Errors_and_Exceptions.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/dialogs.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/downloads.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/users.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/account.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/chats.py
../data/github-repos/files/duncanmichel/Programming-Problem-Solutions/LeetCode/TwoCityScheduling.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/auth.py
../data/github-repos/files/DedSecInside/Awesome-Scripts/APIs/Telegram%20API/telethon/client/up

since Python 3.9 and will be removed in a subsequent version.
  root = random.sample(left, 1)[0]
since Python 3.9 and will be removed in a subsequent version.
  consider = random.sample(consider, min(len(consider), 200 - len(keep)))
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12272/12272 [03:03<00:00, 66.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12272/12272 [01:26<00:00, 141.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12272/12272 [02:39<00:00, 76.72it/s]


In [10]:
f = open('../data/repos.jsonl', 'w')
json.dump(repos, fp = f)
f.close()
f = open('../data/dataset.jsonl', 'w')
json.dump(dataset, fp = f)
f.close()
f = open('../data/graphs.jsonl', 'w')
json.dump(graphs, fp = f)
f.close()
f = open('../data/labels01.jsonl', 'w')
json.dump(labels01, fp = f)
f.close()
f = open('../data/description.jsonl', 'w')
json.dump(description, fp = f)
f.close()
f = open('../data/readme.jsonl', 'w')
json.dump(readme, fp = f)
f.close()