In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import pickle as pkl

  from .autonotebook import tqdm as notebook_tqdm


### Loading the tokenizer

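We use the tokenizer of the instruct version of the base model, whose chat template is applied below to format the conversations. The code also works if the QA pairs are instead formatted with `#Question` and `#Answer` tags.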

In [7]:
# Tokenizer of the instruct checkpoint; its chat template is what
# convert_to_chat uses to render each exchange as text.
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceTB/SmolLM-135M-Instruct",
)

### Loading the main dataset

In [8]:
# Load the conversations dataset; the individual splits are looked up by
# name ('train_sft', 'test_sft') further down.
ds = load_dataset(
    "HuggingFaceTB/everyday-conversations-llama3.1-2k",
)

### Convert the dataset to chat format

In [9]:
def convert_to_chat(dataset):
    """Flatten multi-turn conversations into single-exchange chat samples.

    Every user message that is immediately followed by an assistant message
    becomes one sample. The pair is rendered to a plain string with the chat
    template of the module-level ``tokenizer``, which must be defined before
    this function is called.

    Args:
        dataset: Iterable of records, each exposing a ``'messages'`` list of
            message dicts that carry a ``'role'`` key.

    Returns:
        A ``Dataset`` with a single ``'text'`` column holding one formatted
        (user, assistant) exchange per row.
    """
    samples = []
    for record in dataset:
        messages = record['messages']
        # Walk over adjacent message pairs: (m0, m1), (m1, m2), ...
        for prompt, reply in zip(messages, messages[1:]):
            # Keep only genuine user -> assistant exchanges. A user turn
            # followed by anything else (e.g. another user turn) would
            # otherwise be emitted as a malformed training sample.
            if prompt['role'] == 'user' and reply['role'] == 'assistant':
                rendered = tokenizer.apply_chat_template(
                    [prompt, reply], tokenize=False
                )
                samples.append({'text': rendered})
    return Dataset.from_list(samples)

In [10]:
# Convert both SFT splits (train first, then test) into flat chat samples.
train, test = (
    convert_to_chat(ds[split_name])
    for split_name in ('train_sft', 'test_sft')
)

In [11]:
# Show the summary (features and row counts) of both converted splits.
(train, test)

(Dataset({
     features: ['text'],
     num_rows: 8625
 }),
 Dataset({
     features: ['text'],
     num_rows: 455
 }))

In [13]:
# Spot-check one formatted exchange; print() is used so that the newlines
# inside the rendered template are shown as real line breaks.
print(train[10]["text"])

<|im_start|>user
That makes sense. Why is biodiversity important?<|im_end|>
<|im_start|>assistant
Biodiversity is important because it helps maintain healthy ecosystems, supports the food chain, and provides many benefits to humans, such as clean air and water, food, and medicine.<|im_end|>



### Dumping the dataset to disk

In [17]:
# Pickle each split to disk. The context managers make sure every file
# handle is flushed and closed, even if pickling raises part-way through.
# NOTE: open(..., "wb") does not create directories, so "Dataset/" must
# already exist.
with open("Dataset/ChatTrain.pkl", "wb") as train_file:
    pkl.dump(train, train_file)

with open("Dataset/ChatTest.pkl", "wb") as test_file:
    pkl.dump(test, test_file)