<a href="https://colab.research.google.com/github/ChandlerU11/Hugging_ReST/blob/main/ReST_Method_Fine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]

In [2]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tqdm import tqdm
import torch
import pandas as pd
import statistics
import random
import time

# Device used when moving the model (and generated inputs) around: the first
# CUDA GPU when one is available, otherwise the CPU. Always assigning a value
# keeps later `.to(device)` calls from failing with a NameError on CPU-only
# runtimes.
device = 0 if torch.cuda.is_available() else "cpu"

def prepare_dataset(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Uses the module-level `tokenizer`. Both the 'input' and the 'response'
    fields of `examples` are tokenized with truncation at 128 tokens.

    Returns a dict with the prompt token ids under "input_ids" and the
    response token ids under "labels".
    """
    tokenize_options = {"truncation": True, "max_length": 128}
    prompt_encoding = tokenizer(examples['input'], **tokenize_options)
    target_encoding = tokenizer(examples["response"], **tokenize_options)
    return {
        "input_ids": prompt_encoding['input_ids'],
        "labels": target_encoding['input_ids'],
    }

def dummy_reward_model(df):
    """Score every response against the number requested in its prompt.

    `df` must provide parallel 'input' and 'response' sequences of strings.
    The first whitespace-separated token of each input is parsed as the
    requested count.

    Returns a list with one reward per row:
      * if every word of the response is exactly 'hugs' (an empty response
        also counts), the reward is minus the absolute difference between
        the requested count and the number of 'hugs';
      * otherwise the reward is a flat -3.
    A reward of 0 therefore means the response is exactly right.
    """
    rewards = []
    for prompt, reply in zip(df['input'], df['response']):
        requested = int(prompt.split()[0])
        words = reply.split()
        n_hugs = words.count('hugs')

        if len(words) == n_hugs:
            # Response consists solely of "hugs": penalize the count mismatch.
            rewards.append(-abs(requested - n_hugs))
        else:
            # Response contains something other than "hugs".
            rewards.append(-3)

    return rewards

def generate(df, gen_model, tokenizer, generation_kwargs, N):
    """Sample `N` responses for every prompt in `df['input']`.

    Args:
        df: DataFrame with an 'input' column of prompt strings.
        gen_model: model exposing `generate`, `device`, `training`, `eval`
            and `train` (e.g. a transformers seq2seq model).
        tokenizer: tokenizer used to encode the prompts and decode outputs.
        generation_kwargs: extra keyword arguments forwarded to
            `gen_model.generate`.
        N: number of generation rounds; each round produces one response
            per prompt.

    Returns:
        A new DataFrame made of `N` stacked copies of `df` (original index
        and row order preserved within each copy) with an added 'response'
        column holding the decoded generations. `df` itself is not modified.
    """
    # The prompts are identical in every round, so tokenize them only once.
    inputs = tokenizer(df['input'].tolist(), return_tensors="pt", padding=True, truncation=True)

    # Use the model's own device instead of a module-level global, and pass
    # the attention mask so that padded positions are ignored.
    model_device = gen_model.device
    input_ids = inputs["input_ids"].to(model_device)
    attention_mask = inputs["attention_mask"].to(model_device)

    # Sample in eval mode (so dropout, if the model has any, is inactive) and
    # restore the previous mode afterwards.
    # NOTE(review): assumes the model may be left in training mode by a prior
    # trainer.train() call -- confirm.
    was_training = gen_model.training
    gen_model.eval()

    responses = []
    try:
        for _ in range(N):
            output_ids = gen_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_kwargs,
            )
            responses.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
    finally:
        if was_training:
            gen_model.train()

    # Create preserved-order dataframe with prompt / response pairs
    gen_df = pd.concat([df.copy()] * N)
    gen_df['response'] = responses

    return gen_df

def ReST(D, Deval, G, I, N, model, tokenizer, generation_kwargs, training_args):
    """Fine-tune `model` with alternating Grow and Improve steps.

    Args:
        D: DataFrame of training prompts ('input' column).
        Deval: DataFrame of evaluation prompts ('input' column).
        G: number of Grow steps.
        I: iterable of reward thresholds; one Improve step is run per entry.
        N: number of responses generated per training prompt in a Grow step.
        model: seq2seq model that is generated from and fine-tuned in place.
        tokenizer: tokenizer passed to `generate` and to the trainer.
        generation_kwargs: keyword arguments forwarded to `generate`.
        training_args: arguments used to build each `Seq2SeqTrainer`.

    Each Grow step samples responses for `D` and scores them with
    `dummy_reward_model`. Each Improve step keeps the samples whose score is
    at or above the current threshold, trains on them, and keeps re-training
    on the same data for as long as the mean reward on `Deval` increases.
    If no sample reaches a threshold, the remaining Improve steps of that
    Grow step are skipped.

    Relies on the module-level `data_collator` and `prepare_dataset`.

    Returns:
        The same `model` object after fine-tuning.
    """

    def mean_eval_reward():
        # Generate one response for every sample in the eval set and average
        # the reward model's scores.
        eval_df = generate(Deval, model, tokenizer, generation_kwargs, 1)
        eval_df['scores'] = dummy_reward_model(eval_df)
        return statistics.mean(eval_df['scores'])

    for g in range(G):
        print('Grow Step ', g)

        # Grow: generate N responses per training sample and annotate them
        # with the reward model.
        Dg = generate(D, model, tokenizer, generation_kwargs, N)
        Dg['scores'] = dummy_reward_model(Dg)

        n_correct = len(Dg[Dg['scores'] == 0])
        print(n_correct, "generations out of ", len(Dg), "are the correct length.")
        print("Example output from model:")
        print(Dg.head(25))
        time.sleep(10)

        for steps, tau_i in enumerate(I):
            print('Improve Step: ', steps)
            print('Threshold: ', tau_i)

            # Keep only the samples at or above the threshold.
            at_or_above = Dg['scores'] >= tau_i
            Dg_filt = Dg.loc[at_or_above].copy()
            if not len(Dg_filt):
                print("NO SAMPLES ABOVE THRESHOLD")
                break

            Dg_filt = Dataset.from_pandas(Dg_filt)
            Dg_filt = Dg_filt.map(prepare_dataset, batched=True)

            # A fresh trainer is built for every newly filtered dataset.
            trainer = Seq2SeqTrainer(
                model=model,
                args=training_args,
                tokenizer=tokenizer,
                train_dataset=Dg_filt,
                data_collator=data_collator,
            )

            # First fine-tuning pass of this Improve step.
            trainer.train()

            # Keep fine-tuning on Dg_filt while the mean eval reward improves.
            # Starting the previous value at -5 lets the loop run at least
            # once whenever the first mean is above -5.
            previous_mean, current_mean = -5, mean_eval_reward()
            while current_mean > previous_mean:
                trainer.train()
                previous_mean, current_mean = current_mean, mean_eval_reward()

    print("Training Finished!!!")
    return model

def test_ReST(test_data, model, tokenizer, generation_kwargs):
    """Report how often `model` produces a perfectly scored response.

    Generates one response per row of `test_data`, scores the responses with
    `dummy_reward_model`, then prints the percentage of rows with a score of
    0 followed by the first 25 rows of the scored frame. Returns nothing.
    """
    scored = generate(test_data, model, tokenizer, generation_kwargs, 1)
    scored['scores'] = dummy_reward_model(scored)

    n_perfect = len(scored[scored['scores'] == 0])
    percent_perfect = n_perfect / len(scored) * 100
    print("The fine-tuned model now generates the correct number of hugs", percent_perfect, "percent of the time!" )
    print(scored.head(25))

# Prompts have the form "<k> hugs" with k drawn at random from 1..4.

# Generate training data (1000 prompts)
rand_data = [f"{random.randrange(1, 5)} hugs" for _ in range(1000)]

# Generate test data (100 prompts)
rand_test_data = [f"{random.randrange(1, 5)} hugs" for _ in range(100)]

# Single-column frames; the 'input' column is what `generate` reads.
train_df = pd.DataFrame({'input': rand_data})
test_df = pd.DataFrame({'input': rand_test_data})

# Keyword arguments forwarded to `model.generate` for every sampling call.
generation_kwargs = {
    # NOTE(review): -1 presumably means "no minimum length" -- confirm.
    "min_length": -1,
    # `top_k` is documented as an integer; 0 leaves top-k filtering off.
    "top_k": 0,
    # 1.0 keeps the full distribution (no nucleus truncation).
    "top_p": 1.0,
    # Sample instead of decoding greedily so the N generations can differ.
    "do_sample": True,
}

# Training configuration shared by every Seq2SeqTrainer built in ReST:
# training only (no evaluation pass), num_train_epochs=1, batches of 64 per
# device, outputs written under ./t5-small.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small",
    do_train=True,
    do_eval=False,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    learning_rate=3e-4,
)

# Load the pretrained T5-small tokenizer and model, then move the model to
# the selected device.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
model = model.to(device)

# Collator used by the trainer; label padding uses the id -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
)

# Hold out 10% of the training prompts as the evaluation set used in ReST.
D, Deval = train_test_split(train_df, test_size=0.1, random_state=42)

G = 3            # Number of Grow steps
I = [-2, -1, 0]  # One Improve step per entry; each entry is that step's reward threshold
N = 10           # Number of generations for each sample when creating Dg

fine_tuned_model = ReST(
    D=D,
    Deval=Deval,
    G=G,
    I=I,
    N=N,
    model=model,
    tokenizer=tokenizer,
    generation_kwargs=generation_kwargs,
    training_args=training_args,
)

# Report how often the fine-tuned model is exactly right on the test prompts.
test_ReST(test_df, fine_tuned_model, tokenizer, generation_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Grow Step  0




6 generations out of  9000 are the correct length.
Example output from model:
      input                 response  scores
716  2 hugs                  2 Eric2      -3
351  1 hugs                      The      -3
936  2 hugs                   2 hugs      -3
256  2 hugs                  2- hugs      -3
635  3 hugs                   3 hugs      -3
644  1 hugs      1oja Its had addict      -3
554  3 hugs                   3 hugs      -3
959  1 hugs                   1 hugs      -3
168  3 hugs                   3 hugs      -3
917  1 hugs                       ss      -3
528  3 hugs                   DayMay      -3
823  1 hugs             1 questions.      -3
985  2 hugs                      God      -3
816  4 hugs  # improvement, 4 emails      -3
86   3 hugs           3 e altes nice      -3
432  4 hugs                   4 hugs      -3
184  3 hugs                   3 hugs      -3
978  4 hugs                 4 4 hugs      -3
534  4 hugs               4 4 4 hugh      -3
294  2 hugs           

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




Step,Training Loss




Step,Training Loss




Step,Training Loss


Improve Step:  1
Threshold:  -1




Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss


Improve Step:  2
Threshold:  0




Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Step,Training Loss




Step,Training Loss




Step,Training Loss




Step,Training Loss




Grow Step  1
2234 generations out of  9000 are the correct length.
Example output from model:
      input        response  scores
716  2 hugs            hugs      -1
351  1 hugs            hugs       0
936  2 hugs            hugs      -1
256  2 hugs            hugs      -1
635  3 hugs       hugs hugs      -1
644  1 hugs            hugs       0
554  3 hugs            hugs      -2
959  1 hugs       hugs hugs      -1
168  3 hugs            hugs      -2
917  1 hugs           thugs      -3
528  3 hugs            hugs      -2
823  1 hugs            hugs       0
985  2 hugs       hugs hugs       0
816  4 hugs            hugs      -3
86   3 hugs            hugs      -2
432  4 hugs            hugs      -3
184  3 hugs            hugs      -2
978  4 hugs            hugs      -3
534  4 hugs            hugs      -3
294  2 hugs  hugs hugs hugs      -1
892  4 hugs            hugs      -3
425  4 hugs            hugs      -3
713  4 hugs            hugs      -3
260  2 hugs            hugs      -1
237  3

Map:   0%|          | 0/6646 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Step,Training Loss




Improve Step:  1
Threshold:  -1


Map:   0%|          | 0/4657 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Improve Step:  2
Threshold:  0


Map:   0%|          | 0/2234 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Grow Step  2




6353 generations out of  9000 are the correct length.
Example output from model:
      input             response  scores
716  2 hugs            hugs hugs       0
351  1 hugs                 hugs       0
936  2 hugs            hugs hugs       0
256  2 hugs            hugs hugs       0
635  3 hugs       hugs hugs hugs       0
644  1 hugs                 hugs       0
554  3 hugs            hugs hugs      -1
959  1 hugs                 hugs       0
168  3 hugs            hugs hugs      -1
917  1 hugs                 hugs       0
528  3 hugs  hugs hugs hugs hugs      -1
823  1 hugs                 hugs       0
985  2 hugs            hugs hugs       0
816  4 hugs       hugs hugs hugs      -1
86   3 hugs       hugs hugs hugs       0
432  4 hugs       hugs hugs hugs      -1
184  3 hugs       hugs hugs hugs       0
978  4 hugs  hugs hugs hugs hugs       0
534  4 hugs       hugs hugs hugs      -1
294  2 hugs            hugs hugs       0
892  4 hugs       hugs hugs hugs      -1
425  4 hugs      

Map:   0%|          | 0/8973 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss


Improve Step:  1
Threshold:  -1




Map:   0%|          | 0/8687 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Improve Step:  2
Threshold:  0


Map:   0%|          | 0/6353 [00:00<?, ? examples/s]

Step,Training Loss




Step,Training Loss




Step,Training Loss




Step,Training Loss




Training Finished!!!
The fine-tuned model now generates the correct number of hugs 100.0 percent of the time!
     input             response  scores
0   3 hugs       hugs hugs hugs       0
1   3 hugs       hugs hugs hugs       0
2   1 hugs                 hugs       0
3   3 hugs       hugs hugs hugs       0
4   1 hugs                 hugs       0
5   4 hugs  hugs hugs hugs hugs       0
6   4 hugs  hugs hugs hugs hugs       0
7   3 hugs       hugs hugs hugs       0
8   1 hugs                 hugs       0
9   1 hugs                 hugs       0
10  1 hugs                 hugs       0
11  4 hugs  hugs hugs hugs hugs       0
12  3 hugs       hugs hugs hugs       0
13  1 hugs                 hugs       0
14  2 hugs            hugs hugs       0
15  1 hugs                 hugs       0
16  4 hugs  hugs hugs hugs hugs       0
17  4 hugs  hugs hugs hugs hugs       0
18  3 hugs       hugs hugs hugs       0
19  1 hugs                 hugs       0
20  3 hugs       hugs hugs hugs       0
21  1 hugs