In [98]:
from datasets import load_dataset, DatasetDict, load_from_disk, concatenate_datasets, disable_caching
disable_caching()

In [99]:
# Directory handed to `load_dataset` as its cache location (downloaded/prepared
# files are stored here and reused on later runs).
cache_dir = "./datasets"

# Load the "b-mc2/sql-create-context" dataset, caching it under `cache_dir`.
# NOTE(review): this looks like a Hugging Face Hub repo id, so a first run presumably
# needs network access — confirm if the notebook must run offline.
dataset = load_dataset("b-mc2/sql-create-context", cache_dir=cache_dir)

In [100]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 78577
    })
})

In [101]:
# Work with a 2000-row subset of the train split. `take(2000)` keeps the first
# 2000 rows in their stored order (no shuffling), so this is a prefix, not a random sample.
train_data = dataset['train'].take(2000)
# Peek at one record to see the answer / question / context layout.
train_data[0]

{'answer': 'SELECT COUNT(*) FROM head WHERE age > 56',
 'question': 'How many heads of the departments are older than 56 ?',
 'context': 'CREATE TABLE head (age INTEGER)'}

In [102]:
# Stage 1: keep 75% of the subset for training and hold out the remaining 25%.
train_testval_split = train_data.train_test_split(seed=42, test_size=0.25)

# Stage 2: cut the held-out 25% in half, giving validation and test
# 12.5% of the subset each (50% of 25%).
test_val_split = train_testval_split["test"].train_test_split(seed=42, test_size=0.5)

# Gather the three partitions under the names used by the rest of the notebook.
split_dataset = DatasetDict(
    train=train_testval_split["train"],
    val=test_val_split["train"],
    test=test_val_split["test"],
)

In [103]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1500
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [104]:
# save_path = "./datasets/sql-create-context-split"
# split_dataset.save_to_disk(save_path)

In [105]:
# split_dataset = load_from_disk("./datasets/sql-create-context-split")

In [106]:
# split_dataset

In [107]:
# Local JSON file holding the evaluation examples, exposed as a single "eval" split.
eval_files = {"eval": "./datasets/sql_eval_dataset.json"}
eval_dataset = load_dataset("json", data_files=eval_files)
eval_dataset

DatasetDict({
    eval: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 250
    })
})

In [108]:
# Split the evaluation examples 60/40 (test_size=0.4) with a fixed seed so the
# partition is reproducible; the result has "train" and "test" splits.
eval_train_test = eval_dataset["eval"].train_test_split(test_size=0.4, seed=42)
eval_train_test

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
})

In [109]:
# Project the eval "train" split down to the three columns of the base dataset
# (same order: answer, question, context) so it can be concatenated onto the
# main training split later. The extra db_name / query_category columns are dropped
# only in this "train_append" copy; the original "train" split is left intact.
eval_train_test["train_append"] = eval_train_test["train"].select_columns(['answer', 'question', 'context'])
eval_train_test

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
    train_append: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 150
    })
})

In [110]:
# Number of copies of the eval-derived training rows mixed into the training split
# (oversampling, since they are far fewer than the base rows).
EVAL_OVERSAMPLE_FACTOR = 3

# Build the augmented training split from the pristine base split
# (`train_testval_split["train"]`) rather than from `split_dataset["train"]` itself.
# Reading and overwriting the same key made this cell non-idempotent: every re-run
# appended another batch of eval rows. On a fresh run the result is unchanged.
split_dataset["train"] = concatenate_datasets(
    [train_testval_split["train"]]
    + [eval_train_test["train_append"]] * EVAL_OVERSAMPLE_FACTOR
).shuffle(seed=42)  # fixed seed keeps the row order reproducible
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1950
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
})

In [111]:
# Attach the full-schema evaluation splits (including db_name / query_category)
# alongside the train/val/test splits.
# NOTE: the rows of "eval_train" were also concatenated into "train" above, so
# "eval_train" is not held out from training — only "eval_test" is unseen.
split_dataset["eval_train"] = eval_train_test["train"]
split_dataset["eval_test"] = eval_train_test["test"]
split_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 1950
    })
    val: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 250
    })
    eval_train: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 150
    })
    eval_test: Dataset({
        features: ['question', 'answer', 'db_name', 'context', 'query_category'],
        num_rows: 100
    })
})

In [112]:
# save_path = "./datasets/sql-create-context-split"
# split_dataset.save_to_disk(save_path)

In [113]:
split_dataset["train"][-1]

{'answer': 'SELECT CAST(COUNT(DISTINCT CASE WHEN keyphrase_count > 1 THEN subquery.paperid END) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN keyphrase_count =1 THEN subquery.paperid END), 0) AS ratio FROM (SELECT paperkeyphrase.paperid, COUNT(paperkeyphrase.keyphraseid) AS keyphrase_count FROM paperkeyphrase GROUP BY paperkeyphrase.paperid) AS subquery;',
 'question': 'What is the ratio of papers that have more than 1 keyphrases to papers that have 1 keyphrase?',
 'context': 'CREATE TABLE public.author (authorid bigint NOT NULL, authorname text);\n\nCREATE TABLE public.cite (citingpaperid bigint NOT NULL, citedpaperid bigint NOT NULL);\n\nCREATE TABLE public.dataset (datasetid bigint NOT NULL, datasetname text);\n\nCREATE TABLE public.field (fieldid bigint);\n\nCREATE TABLE public.journal (journalid bigint NOT NULL, journalname text);\n\nCREATE TABLE public.keyphrase (keyphraseid bigint NOT NULL, keyphrasename text);\n\nCREATE TABLE public.paper (paperid bigint NOT NULL, title text, ven