In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
!pip install trl

In [None]:
from transformers import TrainingArguments

In [None]:
from trl import SFTTrainer

# Part 1: Supervised fine-tuning on Dolly-15k

In [None]:
# Settings for the supervised fine-tuning run.
training_args = TrainingArguments(
    # Where things are written
    output_dir="sft_checkpoint/",
    logging_dir="./logs",
    # How often to log / how to checkpoint
    logging_steps=100,
    save_strategy="steps",
    # Training schedule
    num_train_epochs=4,
    per_device_train_batch_size=4,
)

In [None]:
# Training split of the Dolly-15k dataset.
dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")

# The model is given by name, so the trainer loads it itself.
trainer = SFTTrainer(
    model="openai-community/gpt2-medium",
    train_dataset=dataset,
    args=training_args,
    packing=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Run the fine-tuning loop of the SFT trainer built in the previous cell.
trainer.train()

Step,Training Loss
100,2.7688
200,2.6859
300,2.6619
400,2.5925
500,2.5966
600,2.5794
700,2.4768
800,2.3933
900,2.3874
1000,2.4053


Step,Training Loss
100,2.7688
200,2.6859
300,2.6619
400,2.5925
500,2.5966
600,2.5794
700,2.4768
800,2.3933
900,2.3874
1000,2.4053


TrainOutput(global_step=2576, training_loss=2.3608005772466245, metrics={'train_runtime': 2205.5697, 'train_samples_per_second': 4.668, 'train_steps_per_second': 1.168, 'total_flos': 1.9123804701720576e+16, 'train_loss': 2.3608005772466245, 'epoch': 4.0})

In [None]:
# Save the fine-tuned model under the relative directory 'SFT_GPT-2M_Dolly15k'.
trainer.save_model('SFT_GPT-2M_Dolly15k')

In [None]:
# Part 2: reward model

In [None]:
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
python examples/scripts/reward_modeling.py \
    --model_name_or_path=facebook/opt-350m \
    --output_dir="reward_modeling_anthropic_hh" \
    --per_device_train_batch_size=16 \
    --num_train_epochs=1 \
    --gradient_accumulation_steps=2 \
    --gradient_checkpointing=True \
    --learning_rate=1.41e-5 \
    --report_to="wandb" \
    --remove_unused_columns=False \
    --optim="adamw_torch" \
    --logging_steps=10 \
    --evaluation_strategy="steps" \
    --eval_steps=500 \
    --max_length=512 \
"""
import warnings

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser

from trl import ModelConfig, RewardConfig, RewardTrainer, get_kbit_device_map, get_peft_config, get_quantization_config


tqdm.pandas()


if __name__ == "__main__":
    import sys

    parser = HfArgumentParser((RewardConfig, ModelConfig))

    # Inside a Jupyter/Colab kernel `sys.argv` holds the kernel's own launch flags
    # rather than training options, so parsing it cannot produce a usable config.
    # In that case the options from the usage example in the docstring above are
    # used instead. `--report_to=wandb` from that example is left out because it
    # needs the wandb package and a login; add it back if W&B logging is wanted.
    notebook_args = [
        "--model_name_or_path=facebook/opt-350m",
        "--output_dir=reward_modeling_anthropic_hh",
        "--per_device_train_batch_size=16",
        "--num_train_epochs=1",
        "--gradient_accumulation_steps=2",
        "--gradient_checkpointing=True",
        "--learning_rate=1.41e-5",
        "--remove_unused_columns=False",
        "--optim=adamw_torch",
        "--logging_steps=10",
        "--evaluation_strategy=steps",
        "--eval_steps=500",
        "--max_length=512",
    ]
    running_in_kernel = "ipykernel" in sys.modules
    # `args=None` makes the parser read `sys.argv`, i.e. the behaviour of a plain script run.
    config, model_config = parser.parse_args_into_dataclasses(
        args=notebook_args if running_in_kernel else None
    )
    config.gradient_checkpointing_kwargs = dict(use_reentrant=False)

    ################
    # Model & Tokenizer
    ################
    # Resolve the dtype option: "auto" and None are passed through as-is, any other
    # value is looked up as an attribute of the `torch` module.
    torch_dtype = (
        model_config.torch_dtype
        if model_config.torch_dtype in ["auto", None]
        else getattr(torch, model_config.torch_dtype)
    )
    quantization_config = get_quantization_config(model_config)
    model_kwargs = dict(
        revision=model_config.model_revision,
        trust_remote_code=model_config.trust_remote_code,
        # Forward the resolved dtype; without this entry the option is computed
        # above but never reaches `from_pretrained`.
        torch_dtype=torch_dtype,
        # A device map is only requested when a quantization config is present.
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
    # num_labels=1: the classification head produces a single output per sequence.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_config.model_name_or_path, num_labels=1, **model_kwargs
    )

    if model_config.lora_task_type != "SEQ_CLS":
        warnings.warn(
            "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs"
            " Make sure to pass --lora_task_type SEQ_CLS when using this script."
        )

    ################
    # Dataset
    ################
    # No `split=` is given; the result is indexed by split name ("train"/"test") further down,
    # and its "chosen"/"rejected" columns are what the preprocessing reads.
    raw_datasets = load_dataset("Anthropic/hh-rlhf")
    # Tokenize chosen/rejected pairs of inputs
    # Adapt this section to your needs for custom datasets

    def preprocess_function(examples):
        """Tokenize the chosen/rejected texts of one batch.

        Args:
            examples: batch mapping with parallel "chosen" and "rejected" sequences.

        Returns:
            A dict with the token ids and attention masks of both sides, one list
            entry per chosen/rejected pair. Uses `tokenizer` from the enclosing scope.
        """
        ids_chosen, mask_chosen = [], []
        ids_rejected, mask_rejected = [], []

        for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
            chosen_encoding = tokenizer(chosen_text)
            rejected_encoding = tokenizer(rejected_text)

            ids_chosen.append(chosen_encoding["input_ids"])
            mask_chosen.append(chosen_encoding["attention_mask"])
            ids_rejected.append(rejected_encoding["input_ids"])
            mask_rejected.append(rejected_encoding["attention_mask"])

        return {
            "input_ids_chosen": ids_chosen,
            "attention_mask_chosen": mask_chosen,
            "input_ids_rejected": ids_rejected,
            "attention_mask_rejected": mask_rejected,
        }

    # Tokenize all splits, then keep only the pairs where neither side exceeds config.max_length
    def _within_max_length(example):
        """Return True when both tokenized sequences have at most config.max_length tokens."""
        longest = max(len(example["input_ids_chosen"]), len(example["input_ids_rejected"]))
        return longest <= config.max_length

    raw_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4).filter(_within_max_length)
    train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

    ################
    # Training
    ################
    trainer = RewardTrainer(
        model=model,
        tokenizer=tokenizer,
        args=config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=get_peft_config(model_config),
    )
    trainer.train()
    trainer.save_model(config.output_dir)

    # Evaluate before any upload so the metrics are still reported if the upload fails.
    metrics = trainer.evaluate()
    trainer.log_metrics("eval", metrics)
    print(metrics)

    # Uploading needs Hub credentials, so it only happens when explicitly requested
    # through the `push_to_hub` training argument.
    if config.push_to_hub:
        trainer.push_to_hub()

In [None]:
from transformers import GPT2LMHeadModel
from transformers import AutoTokenizer

In [None]:
# Reload the fine-tuned model from Google Drive.
# NOTE(review): this path is under /content/drive, but the `drive.mount(...)` cell appears
# further down in the notebook; on a fresh kernel the mount has to run before this cell.
model2 = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/NLP/SFT_GPT-2M_Dolly15k')

In [None]:
# Display the module structure of the reloaded model.
model2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [None]:
# Load the tokenizer from the same Drive directory the fine-tuned model was loaded from.
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/NLP/SFT_GPT-2M_Dolly15k')

In [None]:
from transformers import pipeline

In [None]:
# Text-generation pipeline built from the fine-tuned model and its tokenizer.
generator = pipeline('text-generation', model=model2, tokenizer=tokenizer)

In [None]:
# Sample one continuation of the prompt from the fine-tuned model.
# NOTE(review): the baseline cell below passes max_length=100 and num_return_sequences=5,
# so the two outputs are not produced under the same generation settings.
generator("The man worked as a", num_return_sequences=1)

[{'generated_text': 'The man worked as a journalist; he was elected MLA.\nThe men\'s swimming team that was also with him was undefeated, won eight medals.\n"He was a great athlete, and did a great job of organizing the men\'s team;'}]

In [None]:
# Baseline for comparison: the same base checkpoint the SFT run started from, without fine-tuning.
generator2 = pipeline('text-generation', model='openai-community/gpt2-medium')
# `truncation=True` is passed explicitly because `max_length` is set; leaving it out
# triggers the "Truncation was not explicitly activated" warning.
generator2("The man worked as a", max_length=100, num_return_sequences=5, truncation=True)



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The man worked as a foreman for a major company in San Jose, California. During his time on the job, he reported numerous incidents of harassment and assault against him and other female employees. In 2013 a supervisor at the company began sending sexual texts to one of the employees who had complained about the incidents. The former employee also reported it to her supervisor, who proceeded to file a complaint with the state police and later with the California Human Rights Commission. In April 2013, the state attorney general filed'},
 {'generated_text': 'The man worked as a security guard at the Grand Circus Hotel before becoming an officer in 2008, court was told.\n\nHe was also married and lived in a rented cottage in Longbridge.'},
 {'generated_text': 'The man worked as a salesman, but was employed by a nearby family that had been farming land in the area for decades.\n\nWhen the suspect shot him, police said, one of the bullets went into his back and his arm

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): cells higher up already load files from /content/drive/MyDrive/NLP; on a
# fresh kernel this mount has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved model directory into the Drive folder that the `from_pretrained` calls read from.
!cp -r /content/SFT_GPT-2M_Dolly15k /content/drive/MyDrive/NLP

In [None]:
!zip -r