In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk, concatenate_datasets, disable_caching
disable_caching()

In [2]:
# Directory passed to `load_dataset` as its cache location (where the library
# reads/writes the downloaded dataset files).
cache_dir = "./datasets"

# Load the dataset by its Hub id ("b-mc2/sql-create-context"), using `cache_dir`
# as the local cache. NOTE(review): this is not a `load_from_disk` of a saved
# split -- on a machine without the cached files it presumably needs network access.
dataset = load_dataset("b-mc2/sql-create-context", cache_dir=cache_dir)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 78577
    })
})

In [4]:
# Work on a 2000-row subset of the 78577-row train split. `take(2000)` keeps the
# first 2000 rows in stored order -- it is not a random sample.
train_data = dataset['train'].take(2000)
# Show the first example to check the record layout (answer / question / context).
train_data[0]

{'answer': 'SELECT COUNT(*) FROM head WHERE age > 56',
 'question': 'How many heads of the departments are older than 56 ?',
 'context': 'CREATE TABLE head (age INTEGER)'}

In [5]:
# Hold out 25% of the 2000 sampled rows; the remaining 75% (1500 rows) is the training split.
train_testval_split = train_data.train_test_split(test_size=0.25, seed=42)

# Split the held-out 25% in half: validation and test each get 12.5% of the rows (250 each).
test_val_split = train_testval_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
split_dataset = DatasetDict({
    "train": train_testval_split["train"],
    "val": test_val_split["train"],
    "test": test_val_split["test"],
})

In [6]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [7]:
# save_path = "./datasets/sql-create-context-split"
# split_dataset.save_to_disk(save_path)

In [8]:
# split_dataset = load_from_disk("./datasets/sql-create-context-split")

In [9]:
# split_dataset

In [10]:
# Load the local JSON evaluation set with the generic "json" builder; the
# `data_files` mapping puts all of its rows into a single split named 'eval'.
eval_dataset = load_dataset("json", data_files={'eval':"./datasets/sql_eval_dataset.json"})
eval_dataset

DatasetDict({
    eval: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 250
    })
})

In [11]:
# Split the eval set 60/40 with a fixed seed: the "train" part is later mixed
# into the training data, the "test" part stays held out.
eval_train_test = eval_dataset["eval"].train_test_split(test_size=0.4, seed=42)
eval_train_test

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
})

In [12]:
# Reduce the eval training rows to the three columns shared with the
# sql-create-context splits (dropping db_name and query_category) and store the
# result as an extra "train_append" split so it can be concatenated with them.
append_columns = ['answer', 'question', 'context']
eval_train_test["train_append"] = eval_train_test["train"].select_columns(append_columns)
eval_train_test

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
    train_append: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 150
    })
})

In [13]:
# Build the merged training split: 750 rows of sql-create-context data plus the
# 150-row eval subset repeated 5 times (750 + 150 * 5 = 1500 rows), then shuffle
# with a fixed seed so the repeated eval rows are not clustered at the end.
#
# The base rows are read from `train_testval_split["train"]` (the same Dataset
# that was originally stored under split_dataset["train"]) rather than from
# `split_dataset["train"]`, which this cell overwrites. That keeps the cell
# idempotent: re-running it rebuilds the same split instead of taking 750 rows
# from the already merged and shuffled data.
base_train = train_testval_split["train"].take(750)
eval_upsampled = [eval_train_test["train_append"]] * 5
merged_train = concatenate_datasets([base_train] + eval_upsampled)
split_dataset["train"] = merged_train.shuffle(seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [14]:
# Store the full-schema eval splits (all five columns) alongside the training
# data under the keys "eval_train" and "eval_test".
for source_split, target_split in (("train", "eval_train"), ("test", "eval_test")):
    split_dataset[target_split] = eval_train_test[source_split]
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    eval_train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    eval_test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
})

In [15]:
# Persist all five splits (train, val, test, eval_train, eval_test) to disk.
# NOTE(review): the folder name appears to encode the training mix built above
# (150 eval rows x 5 copies + 750 base rows) -- keep it in sync if that changes.
save_path = "./datasets/train_merge_150x5_750"
split_dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/1500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/150 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
# Spot-check the last row of the shuffled, merged training split.
split_dataset["train"][-1]

{'answer': "SELECT name FROM restaurant WHERE food_type ILIKE '%Italian%' OR city_name ILIKE '%New York%' ORDER BY name NULLS LAST;",
 'question': 'Which restaurants serve Italian cuisine or are located in New York? Order the results by the restaurant name.',
 'context': 'CREATE TABLE public.geographic (city_name text, county text, region text);\n\nCREATE TABLE public.location (restaurant_id bigint, house_number bigint, street_name text, city_name text);\n\nCREATE TABLE public.restaurant (id bigint, name text, food_type text, city_name text, rating real);'}