In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, concatenate_datasets
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Load variables from a .env file into os.environ; override=True lets values from
# the .env file replace variables that are already set in the environment.
load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
from utils.get_complexity import get_complexity_label, get_complexity_score

In [3]:
# Read the Hugging Face token from the environment (KeyError if HF_TOKEN is unset).
hf_token = os.environ["HF_TOKEN"]
# Authenticate with the Hub and request that the token also be stored in
# git's credential helper.
login(token=hf_token, add_to_git_credential=True)

Token has not been saved to git credential helper.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [4]:
# Load every split of the JavaScript code/docstring dataset from the Hub.
dataset_dict = load_dataset(path="semeru/code-text-javascript")
# Show the available splits with their columns and row counts.
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 58025
    })
    validation: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 3885
    })
    test: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 3291
    })
})


In [5]:
def transform_data(datapoint):
    """Derive the complexity fields for a single dataset row.

    Reads the row's "code" entry (an empty string is used when the key is
    absent), scores it with ``get_complexity_score`` and turns that score
    into a label with ``get_complexity_label``.

    Returns:
        A dict with two keys: "complexity" (the score) and "label" (the
        label derived from that score).
    """
    source_code = datapoint.get("code", "")
    complexity = get_complexity_score(source_code)
    label = get_complexity_label(complexity)
    return {"complexity": complexity, "label": label}

# Score every row of every split (4 worker processes), then keep only the
# columns needed downstream.
processed_ds = (
    dataset_dict
    .map(transform_data, num_proc=4)
    .select_columns(["repo", "code", "complexity", "label"])
)

In [6]:
# Sanity check: first processed training row, then the per-split summary.
print(processed_ds["train"][0], processed_ds, sep="\n")

{'repo': 'ciena-blueplanet/bunsen-core', 'code': 'function (state, action) {\n    return _.defaults({\n      isValidating: action.isValidating,\n      lastAction: IS_VALIDATING\n    }, state)\n  }', 'complexity': 1.3, 'label': 'Simple'}
DatasetDict({
    train: Dataset({
        features: ['repo', 'code', 'complexity', 'label'],
        num_rows: 58025
    })
    validation: Dataset({
        features: ['repo', 'code', 'complexity', 'label'],
        num_rows: 3885
    })
    test: Dataset({
        features: ['repo', 'code', 'complexity', 'label'],
        num_rows: 3291
    })
})


In [None]:
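# Publishing step is currently disabled; the output shown below is from an earlier run.
# Uncomment the next line to upload the processed splits to the Hugging Face Hub.
# processed_ds.push_to_hub("aslam-naseer/js-function-complexity")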

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  6.73ba/s]
Processing Files (1 / 1): 100%|██████████| 15.5MB / 15.5MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.38s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 77.81ba/s]
Processing Files (1 / 1): 100%|██████████| 1.02MB / 1.02MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.35s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 111.38ba/s]
Processing Files (1 / 1): 100%|██████████|  980kB /  980kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.11s/ shards]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/aslam-naseer/js-function-complexity/commit/baf5eeb356e51789300ecc00d5b01f0e06db9929', commit_message='Upload dataset', commit_description='', oid='baf5eeb356e51789300ecc00d5b01f0e06db9929', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/aslam-naseer/js-function-complexity', endpoint='https://huggingface.co', repo_type='dataset', repo_id='aslam-naseer/js-function-complexity'), pr_revision=None, pr_num=None)