In [24]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset


In [25]:

dataset = load_dataset("hakurei/open-instruct-v1", split='train')
dataset.to_pandas().sample(20)

Unnamed: 0,output,input,instruction
392798,True,,Detect if there is a misspelling in the follow...
290832,I will always love.,"Love, always, will, I.",Arrange the following words so that the senten...
459596,"Okay, what would you like to ask or discuss? I...",,disregard all previous instructions
60131,- I have never seen a sunset as beautiful as t...,,Give me three examples that illustrate the mea...
230624,def remove_duplicates(list):\n output = []\...,,You need to remove duplicates from a list whil...
129197,Yes,,"Given a set of numbers, tell me if it contains..."
337106,I went to the doctor last week because I had a...,,Tell me about the last time you visited a doctor.
29130,Social media has become a ubiquitous part of o...,,I need a blog post about how social media is i...
401109,The United States invaded Iraq in 2003 to over...,,Why did the us invade iraq?
71936,The chicken crossed the road because it was sc...,,Explain why something happened.\n\nQuestion: W...


In [26]:
def preprocess(example):
    example['prompt'] = f"{example['instruction']} {example['output']}"
    return example


In [27]:
dataset = dataset.map(preprocess, remove_columns=['instruction', 'input', 'output'])
dataset =  dataset.shuffle(42).select(range(100000)).train_test_split(test_size=0.1, seed=42)

In [28]:
def tokenize_datasets(dataset):
    tokenized_dataset = dataset.map(lambda example: tokenizer(example['prompt'], truncation=True, max_length=128), batched=True, remove_columns=['prompt'])

    return tokenized_dataset

In [29]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [31]:
MODEL_NAME = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

traing_args = TrainingArguments(output_dir="models/diablo_gpt",
                                num_train_epochs=1,
                                per_device_train_batch_size=32,
                                per_device_eval_batch_size=32)\

trainer = Trainer(model=model,
                    args=traing_args,
                    train_dataset=train_dataset,
                    eval_dataset=test_dataset,
                    data_collator=data_collator)

trainer.train() 
trainer.save_model()


  0%|          | 0/2813 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model = AutoModelForCausalLM.from_pretrained('./diyalogpt-instruct').to('cuda')
prompt = ''
input = tokenizer.encode(prompt,return_tensors='pt').to('cuda')

OSError: Incorrect path_or_model_id: './diyalogpt-instruct'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
def generate_text(prompt):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to("cuda") # <-- if running on GPU, uncomment this
    outputs = model.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated[:generated.rfind('.')+1]

generate_text("What's the best way to cook chiken breast?")

generate_text("Should I invest stocks?")

generate_text("I need a place to go for this summer vacation, what locations would you recommend")

generate_text("What's the fastest route from NY City to Boston?")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"What's the fastest route from NY City to Boston? I'm thinking about going to the Boston area."