In [1]:
import pickle

## Licenses

In [3]:
with open('./javascript_licenses.pkl', 'rb') as f:
    licenses = pickle.load(f)
licenses

{'cheshire137/gulp-ruby-haml': [('cheshire137/gulp-ruby-haml/LICENSE.txt',
   'The MIT License (MIT)\n\nCopyright (c) 2014 Sarah Vessels, Sindre Sorhus\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY

## Dedupe definitions


In [13]:
with open('./javascript_dedupe_definitions_v2.pkl', 'rb') as f:
    dedupe_definitions = pickle.load(f)
dedupe_definitions

[{'nwo': 'Microsoft/vscode',
  'sha': '693a13cd32c5be798051edc0cb43e1e39fc456d9',
  'path': 'extensions/json/build/update-grammars.js',
  'language': 'javascript',
  'identifier': 'adaptJSON',
  'parameters': '(grammar, replacementScope)',
  'argument_list': '',
  'return_statement': '',
  'docstring': '',
  'docstring_summary': '',
  'docstring_tokens': [],
  'function': "function adaptJSON(grammar, replacementScope) {\n\tgrammar.name = 'JSON with comments';\n\tgrammar.scopeName = `source${replacementScope}`;\n\n\tvar fixScopeNames = function(rule) {\n\t\tif (typeof rule.name === 'string') {\n\t\t\trule.name = rule.name.replace(/\\.json/g, replacementScope);\n\t\t}\n\t\tif (typeof rule.contentName === 'string') {\n\t\t\trule.contentName = rule.contentName.replace(/\\.json/g, replacementScope);\n\t\t}\n\t\tfor (var property in rule) {\n\t\t\tvar value = rule[property];\n\t\t\tif (typeof value === 'object') {\n\t\t\t\tfixScopeNames(value);\n\t\t\t}\n\t\t}\n\t};\n\n\tvar repository = gra

## Dataset

In [2]:
import glob

In [9]:
type = 'test'
files = glob.glob(f'./javascript/final/jsonl/test/javascript_{type}_*.jsonl.gz')
files

['./javascript/final/jsonl/test\\javascript_test_0.jsonl.gz']

In [13]:
import sys
import time
import torch
import random
import tarfile
import gzip
import numpy as np
from tqdm import tqdm


def get_json_iterator_from_tar_file(file_paths, shuffle=False, progress=False, group_by=None, buffer=True, truncate=-1,
                                    seed=42):
  """读取 jsonl.gz 包中的 jsonl 文件，并置入迭代器"""

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  if isinstance(file_paths, str):
    file_paths = [file_paths]

  if shuffle:
    np.random.shuffle(file_paths)

  count = 0

  for file_path in file_paths:
    payloads = []
    t1 = time.time()
    with gzip.open(file_path, 'r') as f:
      files = [x.name for x in f.getmembers() if x.name.endswith('.jsonl')]
      # if shuffle:
      #     np.random.shuffle(files)

      if progress:
        file_iter = tqdm(files, file=sys.stdout)
      else:
        file_iter = files

      for filename in file_iter:
        jsonl_file = f.extractfile(filename)
        if jsonl_file is not None:
          if group_by is None:
            for line_no, tree_encoding_line in enumerate(jsonl_file):
              count += 1
              if truncate != -1 and count > truncate:
                break
              payload = tree_encoding_line, dict(file_name=filename, line_num=line_no)
              if buffer:
                payloads.append(payload)
              else:
                yield payload

          elif group_by == 'binary_file':
            lines = [(l.decode().strip(), dict(file_name=filename, line_num=line_no))
                     for line_no, l in enumerate(jsonl_file)]
            yield lines

        if truncate != -1 and count > truncate:
          break

    if shuffle:
      np.random.shuffle(payloads)

    print(f'load shard {file_path} took {time.time() - t1:.4f}s, length={len(payloads)}', file=sys.stderr)

    for payload in payloads:
      yield payload

In [14]:
json_iter = get_json_iterator_from_tar_file(files[0])

In [16]:
next(json_iter)

AttributeError: 'GzipFile' object has no attribute 'getmembers'

In [59]:
import json
with gzip.open(files[0], 'rb') as f:
    item = json.loads(f.readline().decode())
    print(item.keys())

dict_keys(['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'])
