In [2]:
import tiktoken
import pandas as pd
import getpass
import openai

from opengpt.config import Config
from opengpt.dataset_utils import split_csv_by_max_len, create_dataset

## Prompt and domain-data configuration

Make sure the prompts and domain-data are configured properly. The domain-data is defined in the `datasets` part of the `config` file.

In [20]:
# Load the example dataset-creation config. The cells below read the teacher
# settings (`config.teacher.model`, `config.teacher.max_len`), the dataset list
# (`config.datasets`) and `config.base_path` from this object.
config = Config(yaml_path='../configs/example_config_for_detaset_creation.yaml')

In [None]:
# Pick the tiktoken encoding that matches the teacher model named in the config;
# it is passed to `split_csv_by_max_len` in the next cell.
# Based on the teacher and model, you might need to change the tokenizer
# (`encoding_for_model` raises a KeyError for model names tiktoken does not know).
tokenizer = tiktoken.encoding_for_model(config.teacher.model)

In [22]:
# Split the CSVs listed in `config.datasets` according to the teacher's
# `max_len`, using the tokenizer chosen above and the configured `base_path`.
split_csv_by_max_len(
    config.datasets,
    max_len=config.teacher.max_len,
    tokenizer=tokenizer,
    base_path=config.base_path,
)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

nhs_conditions_small_sample:   0%|          | 0/10 [00:00<?, ?it/s]




## Send data to the Teacher and create an instruction-based dataset

If an example occasionally fails, that is fine. But if no examples pass, have a look at the parser and the prompt you are using; they may not be aligned.

If the connection breaks or the generation stops for some reason, that is fine: the scripts below make checkpoints and will continue generation. Every time we run the script, it will also try to redo the examples that failed in the previous run.

In [None]:
# Ask for the API key interactively (getpass does not echo the input) so the
# key is never written into the notebook source.
openai.api_key = getpass.getpass("Your OPENAI_API_KEY: ")

In [8]:
# With the example config, the same dataset is sent to the teacher for two
# languages (English, French), two runs (0 and 1) and two different prompts.
# Returns two DataFrames: `raw_data` (the teacher's raw outputs) and
# `prepared_data` (the prepared text examples), both previewed below.
raw_data, prepared_data = create_dataset(config)

Starting prompts: ['f53cf99826', 'f4df95ec69']
Run: 0
Language: English


nhs_conditions_small_sample:   0%|          | 0/10 [00:00<?, ?it/s]

Starting prompts: ['f53cf99826', 'f4df95ec69']
Run: 0
Language: French


nhs_conditions_small_sample:   0%|          | 0/10 [00:00<?, ?it/s]

Starting prompts: ['f53cf99826', 'f4df95ec69']
Run: 1
Language: English


nhs_conditions_small_sample:   0%|          | 0/10 [00:00<?, ?it/s]

Starting prompts: ['f53cf99826', 'f4df95ec69']
Run: 1
Language: French


nhs_conditions_small_sample:   0%|          | 0/10 [00:00<?, ?it/s]



In [31]:
# Preview the prepared examples (`text`); `raw_data_id` presumably points back
# to the `id` column of `raw_data` -- confirm in opengpt.dataset_utils.
prepared_data.head()

Unnamed: 0,text,raw_data_id
0,<|user|> What is considered a high blood press...,0
1,<|user|> What are the risks of having high blo...,0
2,<|user|> What lifestyle changes can help preve...,0
3,<|user|> What kind of medicines can help contr...,0
4,<|user|> What are some things that can increas...,0


In [30]:
# Preview the raw teacher outputs together with their provenance columns
# (dataset, language, run, prompt hashes and the source context).
raw_data.head()

Unnamed: 0,id,raw_output,dataset,language,run,prompt_hash,prompt_text_hash,context
0,0,ID;Question;Answer\n1;What is considered a hig...,nhs_conditions_small_sample,English,0,f53cf99826,a886c127f3f267e647b41b8f12caf0fac51bec9ff2d908...,Overview\nHigh blood pressure (hypertension)\n...
1,1,ID;Question;Answer\n1;What is bronchiolitis an...,nhs_conditions_small_sample,English,0,f53cf99826,ca0cc4cd9acd981724d8478f85610d37679383bbb330cb...,Bronchiolitis\nBronchiolitis is a common chest...
2,2,"Patient: Hi, I've been coughing for a week now...",nhs_conditions_small_sample,English,0,f4df95ec69,2bfdb5c693f89fec690df0b78321945e82d630f0dabf98...,Bronchitis\nBronchitis is inflammation of the ...
3,3,"Patient: Hi, I have been experiencing joint pa...",nhs_conditions_small_sample,English,0,f4df95ec69,14142c96ed663d1313091800a7778df104e50577c26f23...,"Steroids\nSteroids, also called corticosteroid..."
4,4,ID;Question;Answer\n1;What are the symptoms of...,nhs_conditions_small_sample,English,0,f53cf99826,ed53b30b98e2f13a1b120f459ff0ca70d3e8f305bc8855...,Overview\nCreutzfeldt-Jakob disease\nCreutzfel...
