In [1]:
from datasets import load_dataset_builder

# Hub identifier of the dataset whose builder is resolved below.
DATASET_ID = "code-search-net/code_search_net"

# NOTE(review): trust_remote_code=True lets the dataset's own loading script
# run locally -- only keep it enabled for repositories you trust.
builder = load_dataset_builder(DATASET_ID, trust_remote_code=True)

# Fetch the raw files and build the local prepared copy used by later cells.
builder.download_and_prepare()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inspection only: print the builder's class signature and docstring.
help(builder)

Help on CodeSearchNet in module datasets_modules.datasets.code-search-net--code_search_net.8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1.code_search_net object:

class CodeSearchNet(datasets.builder.GeneratorBasedBuilder)
 |  CodeSearchNet(
 |      cache_dir: Optional[str] = None,
 |      dataset_name: Optional[str] = None,
 |      config_name: Optional[str] = None,
 |      hash: Optional[str] = None,
 |      base_path: Optional[str] = None,
 |      info: Optional[datasets.info.DatasetInfo] = None,
 |      features: Optional[datasets.features.features.Features] = None,
 |      token: Union[bool, str, NoneType] = None,
 |      repo_id: Optional[str] = None,
 |      data_files: Union[str, list, dict, datasets.data_files.DataFilesDict, NoneType] = None,
 |      data_dir: Optional[str] = None,
 |      storage_options: Optional[dict] = None,
 |      writer_batch_size: Optional[int] = None,
 |      **config_kwargs
 |  )
 |
 |  "CodeSearchNet corpus: proxy dataset for seman

In [4]:
# Split names requested from the builder; later cells iterate over this list.
splits = ["train", "validation", "test"]

# Map each split name to the dataset object the builder returns for it.
datasets = {}
for split in splits:
    split_dataset = builder.as_dataset(split=split)  # type: ignore
    datasets[split] = split_dataset
    print(f"[{split}] number of samples {len(split_dataset)}")


[train] number of samples 1880853
[validation] number of samples 89154
[test] number of samples 100529


In [5]:
# Display the `func_code_string` field of the first training record.
datasets['train'][0]["func_code_string"]

'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    """\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        ├── <person1>/\n        │   ├── <somename1>.jpeg\n        │   ├── <somename2>.jpeg\n        │   ├── ...\n        ├── <person2>/\n        │   ├── <somename1>.jpeg\n        │   └── <somename2>.jpeg\n        └── ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree\n    :param verbose: verbosity of training\n    :return: returns knn classifier that was trained on 

In [8]:
# keep only python samples
def _is_python(example):
    """Return True when the record's `language` field equals "python"."""
    return example["language"] == "python"


filtered_datasets = {}
for split in splits:
    python_only = datasets[split].filter(_is_python)
    filtered_datasets[split] = python_only
    print(f"[{split}] filtered number of samples {len(python_only)}")

Filter: 100%|██████████| 1880853/1880853 [11:11<00:00, 2802.87 examples/s]


[train] filtered number of samples 412178


Filter: 100%|██████████| 89154/89154 [00:29<00:00, 3033.63 examples/s]

[validation] filtered number of samples 23107
[test] filtered number of samples 22176





In [None]:
# Upper bound (inclusive) on the number of entries in `func_code_tokens`.
MAX_LENGTH = 100


def _within_max_length(example):
    """Return True when the record has at most MAX_LENGTH code tokens."""
    return len(example["func_code_tokens"]) <= MAX_LENGTH


# NOTE(review): the output recorded for this cell reports the same counts as
# the language filter before it, i.e. nothing was removed -- it looks stale;
# re-run and confirm the threshold actually drops rows.
filtered_datasets_by_size = {}
for split in splits:
    short_enough = filtered_datasets[split].filter(_within_max_length)
    filtered_datasets_by_size[split] = short_enough
    print(f"[{split}] filtered number of samples by size {len(short_enough)}")

Filter: 100%|██████████| 412178/412178 [03:03<00:00, 2248.53 examples/s]


[train] filtered number of samples by size 412178


Filter: 100%|██████████| 23107/23107 [00:09<00:00, 2328.82 examples/s]


[validation] filtered number of samples by size 23107


Filter: 100%|██████████| 22176/22176 [00:09<00:00, 2293.33 examples/s]

[test] filtered number of samples by size 22176





In [34]:
import random

# Fixed seed so the same rows are drawn on every Restart & Run All; without it
# the exported subset differs between runs.
SEED = 42

# Number of rows to draw per split, positionally aligned with `splits`.
K = [10000, 2000, 2000]  # train, val, test

# zip() below would silently stop at the shorter list, so fail loudly instead.
assert len(K) == len(splits), "need exactly one sample size per split"

# A private generator keeps the draw independent of the global `random` state
# (which other cells or libraries may have advanced).
rng = random.Random(SEED)

sampled_data = {}

for k_sample, split in zip(K, splits):
    population_size = len(filtered_datasets_by_size[split])
    # Sampling is without replacement; random.sample raises ValueError when
    # k_sample exceeds population_size.
    indices = rng.sample(range(population_size), k=k_sample)
    sampled_data[split] = filtered_datasets_by_size[split].select(indices)
    print(f"[{split}] sampled {len(sampled_data[split])}")

[train] sampled 10000
[validation] sampled 2000
[test] sampled 2000


In [36]:
# Display the first record of the sampled training split with all its fields.
sampled_data['train'][0]

{'repository_name': 'pydata/xarray',
 'func_path_in_repository': 'xarray/core/accessors.py',
 'func_name': '_access_through_cftimeindex',
 'whole_func_string': 'def _access_through_cftimeindex(values, name):\n    """Coerce an array of datetime-like values to a CFTimeIndex\n    and access requested datetime component\n    """\n    from ..coding.cftimeindex import CFTimeIndex\n    values_as_cftimeindex = CFTimeIndex(values.ravel())\n    if name == \'season\':\n        months = values_as_cftimeindex.month\n        field_values = _season_from_months(months)\n    else:\n        field_values = getattr(values_as_cftimeindex, name)\n    return field_values.reshape(values.shape)',
 'language': 'python',
 'func_code_string': 'def _access_through_cftimeindex(values, name):\n    """Coerce an array of datetime-like values to a CFTimeIndex\n    and access requested datetime component\n    """\n    from ..coding.cftimeindex import CFTimeIndex\n    values_as_cftimeindex = CFTimeIndex(values.ravel())\n

In [48]:

# Columns carried into the export; every other column is left behind.
keep = ["func_code_string", "func_code_url", "split_name"]

# Per-split view of the sampled data restricted to the `keep` columns.
data_selected_columns = {}
for split in splits:
    trimmed = sampled_data[split].select_columns(keep)
    data_selected_columns[split] = trimmed
    print(f"[{split}] cleaned {len(trimmed)}")

[train] cleaned 10000
[validation] cleaned 2000
[test] cleaned 2000


In [None]:
import pandas as pd

In [49]:
# Convert each split to a pandas frame, in the order given by `splits`.
split_frames = [data_selected_columns[split].to_pandas() for split in splits]

# Stack the per-split frames into one table with a fresh 0..n-1 index.
dfs = pd.concat(split_frames, ignore_index=True)

In [50]:
# Preview the first rows of the combined frame.
dfs.head()

Unnamed: 0,func_code_string,func_code_url,split_name
0,"def _access_through_cftimeindex(values, name):...",https://github.com/pydata/xarray/blob/6d93a95d...,train
1,"def main():\n """"""Start the bot.""""""\n # B...",https://github.com/balemessenger/bale-bot-pyth...,train
2,"def insert(self, nodes, pos):\n # TODO:...",https://github.com/openego/ding0/blob/e2d6528f...,train
3,"def auth_complete(self, *args, **kwargs):\n ...",https://github.com/troeger/opensubmit/blob/384...,train
4,"def open(filename, frame='unspecified'):\n ...",https://github.com/BerkeleyAutomation/autolab_...,train


In [51]:
# Relabel rows whose split_name is 'valid' as 'val'. The assignment is scoped to
# the split_name column: a frame-wide mask (dfs == 'valid') would also rewrite
# any code string or URL cell that happens to equal 'valid'.
dfs.loc[dfs["split_name"] == "valid", "split_name"] = "val"

In [52]:
# Write the combined frame to "dataset.parquet" (path is relative to the
# kernel's current working directory).
dfs.to_parquet("dataset.parquet")