In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk

In [2]:
# Directory handed to `load_dataset` as its `cache_dir`.
cache_dir = "./datasets"

# Load the "b-mc2/sql-create-context" dataset, using `cache_dir` as the
# cache location.
dataset = load_dataset(
    path="b-mc2/sql-create-context",
    cache_dir=cache_dir,
)

In [3]:
# Display the loaded DatasetDict (split names, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 78577
    })
})

In [4]:
# Number of rows kept from the full "train" split. The train/val/test splits
# built later in the notebook are all carved out of this subset.
SUBSET_SIZE = 2000

# NOTE: despite its name, `train_data` is the pool that is later divided into
# train, val and test -- it is not the final training set.
# NOTE(review): `take` keeps rows in stored order without shuffling -- confirm
# that an unshuffled head of the dataset is the intended sample.
train_data = dataset['train'].take(SUBSET_SIZE)

# Peek at one record to check the schema (answer / question / context).
train_data[0]

{'answer': 'SELECT COUNT(*) FROM head WHERE age > 56',
 'question': 'How many heads of the departments are older than 56 ?',
 'context': 'CREATE TABLE head (age INTEGER)'}

In [5]:
# Split parameters, kept in one place so the comments below cannot drift
# from the code.
HOLDOUT_FRACTION = 0.25       # share of `train_data` held out for val + test
TEST_SHARE_OF_HOLDOUT = 0.5   # share of the held-out rows that become "test"
SPLIT_SEED = 42               # fixed seed so both splits are reproducible

# First split: 75% train, 25% held out (validation + test combined).
train_testval_split = train_data.train_test_split(
    test_size=HOLDOUT_FRACTION, seed=SPLIT_SEED
)

# Second split: halve the held-out rows into validation and test
# (50% of 25% = 12.5% of `train_data` each).
test_val_split = train_testval_split["test"].train_test_split(
    test_size=TEST_SHARE_OF_HOLDOUT, seed=SPLIT_SEED
)

# Combine splits into a DatasetDict
split_dataset = DatasetDict({
    "train": train_testval_split["train"],
    "val": test_val_split["train"],   # "train" side of the second split serves as validation
    "test": test_val_split["test"],
})

# Sanity check: the row counts of the three splits add up to the source subset.
assert sum(len(part) for part in split_dataset.values()) == len(train_data)

In [6]:
# Display the resulting train/val/test splits and their row counts.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [7]:
# Disabled. An identical save is executed in a later cell, after the "eval"
# split has been added to `split_dataset`; running it here would write only
# train/val/test.
# save_path = "./datasets/sql-create-context-split"
# split_dataset.save_to_disk(save_path)

In [8]:
# Disabled. Would replace `split_dataset` with the copy stored at the path
# used by `save_to_disk` instead of rebuilding the splits.
# NOTE(review): presumably kept for resuming from a saved copy -- confirm or remove.
# split_dataset = load_from_disk("./datasets/sql-create-context-split")

In [9]:
# Re-display the splits. The two preceding cells are commented out, so this
# is the same object that was shown earlier.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [10]:
# Map the split name "eval" to its local JSON file, then load it with the
# "json" builder.
eval_files = {'eval': "./datasets/sql_eval_dataset.json"}
eval_dataset = load_dataset("json", data_files=eval_files)

# Show the features and row count of the loaded eval split.
eval_dataset

DatasetDict({
    eval: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 250
    })
})

In [11]:
# Attach the separately loaded eval split under the "eval" key (mutates
# `split_dataset` in place).
# Note: its columns differ from train/val/test -- it additionally carries
# 'db_name' and 'query_category', as the display below shows.
split_dataset["eval"] = eval_dataset["eval"]
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    eval: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 250
    })
})

In [12]:
# Inspect the first record of the eval split.
split_dataset["eval"][0]

{'question': 'Which authors have written publications in both the domain "Machine Learning" and the domain "Data Science"?',
 'answer': "SELECT {author.name,author.aid} FROM author WHERE author.aid IN (SELECT domain_author.aid FROM domain_author WHERE domain_author.did IN (SELECT domain.did FROM DOMAIN WHERE domain.name IN ('Machine Learning', 'Data Science') ) GROUP BY 1 HAVING COUNT(DISTINCT domain_author.did) = 2);",
 'db_name': 'academic',
 'context': 'CREATE TABLE public.author (aid BIGINT NOT NULL, homepage TEXT, name TEXT, oid BIGINT);\nCREATE TABLE public.cite (cited BIGINT, citing BIGINT);\nCREATE TABLE public.conference (cid BIGINT NOT NULL, homepage TEXT, name TEXT);\nCREATE TABLE public.domain (did BIGINT NOT NULL, name TEXT);\nCREATE TABLE public.domain_author (aid BIGINT NOT NULL, did BIGINT NOT NULL);\nCREATE TABLE public.domain_conference (cid BIGINT NOT NULL, did BIGINT NOT NULL);\nCREATE TABLE public.domain_journal (did BIGINT NOT NULL, jid BIGINT NOT NULL);\nCREATE T

In [13]:
# Persist the whole DatasetDict (train, val, test and eval) to disk.
# NOTE(review): `load_from_disk(save_path)` is presumably the matching reader -- confirm.
save_path = "./datasets/sql-create-context-split"
split_dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/1500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]