<a href="https://colab.research.google.com/github/EdBerg21/CodeT/blob/main/KEYDONEfinetune_llama_2_on_science_qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetune on ScienceQA
Let's use LLM Engine to fine-tune Llama-2 on ScienceQA!

# Packages Required
For this demo, we'll be using the `scale-llm-engine` package and `datasets` from Hugging Face. The data-preparation cell also imports `pandas` and `smart_open`.


In [2]:
!pip install scale-llm-engine
!pip install datasets

Collecting scale-llm-engine
  Downloading scale_llm_engine-0.0.0b19-py3-none-any.whl (23 kB)
Installing collected packages: scale-llm-engine
Successfully installed scale-llm-engine-0.0.0b19
Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Data Preparation
Let's load in the dataset using Huggingface and view the features.

In [3]:
from datasets import load_dataset
from smart_open import smart_open
import pandas as pd

# Load the `derek-thomas/ScienceQA` dataset, then display the feature schema
# of its train split as the cell output.
dataset = load_dataset("derek-thomas/ScienceQA")
train_split = dataset["train"]
train_split.features

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/377M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/126M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/122M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

{'image': Image(decode=True, id=None),
 'question': Value(dtype='string', id=None),
 'choices': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'answer': Value(dtype='int8', id=None),
 'hint': Value(dtype='string', id=None),
 'task': Value(dtype='string', id=None),
 'grade': Value(dtype='string', id=None),
 'subject': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'skill': Value(dtype='string', id=None),
 'lecture': Value(dtype='string', id=None),
 'solution': Value(dtype='string', id=None)}

Now, let's convert the dataset into the format LLM Engine accepts: a CSV file with `prompt` and `response` columns. Records whose `hint` is empty are skipped.

In [4]:
# Answer-choice labels: the 26 uppercase letters 'A' through 'Z'.
choice_prefixes = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
def format_options(options, choice_prefixes):
    """Render answer choices on one line, e.g. ``(A) first (B) second``.

    Each option is paired with the prefix at the same position; pairing stops
    at the end of the shorter of the two sequences.
    """
    labelled = []
    for prefix, option in zip(choice_prefixes, options):
        labelled.append(f'({prefix}) {option}')
    return ' '.join(labelled)

def format_prompt(r, choice_prefixes):
    """Build the prompt text for a single record.

    Reads ``r['hint']``, ``r['question']`` and ``r['choices']`` and returns a
    four-line string: context, question, lettered options, and a trailing
    ``Answer:`` cue with nothing after it.
    """
    rendered_choices = format_options(r['choices'], choice_prefixes)
    lines = [
        f'Context: {r["hint"]}',
        f'Question: {r["question"]}',
        f'Options:{rendered_choices}',  # note: no space follows 'Options:'
        'Answer:',
    ]
    return '\n'.join(lines)

def format_label(r, choice_prefixes):
    """Return the choice prefix selected by the record's ``answer`` field.

    ``r['answer']`` is used directly as an index into ``choice_prefixes``.
    """
    answer_index = r['answer']
    return choice_prefixes[answer_index]

def convert_dataset(ds):
    """Convert an iterable of ScienceQA records into a prompt/response table.

    Records whose ``hint`` is the empty string are skipped. Every remaining
    record contributes one row: the prompt from ``format_prompt`` and the
    answer letter from ``format_label``, both using the module-level
    ``choice_prefixes``.

    Args:
        ds: Iterable of records (mappings) with at least the ``hint``,
            ``question``, ``choices`` and ``answer`` keys.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prompt`` and ``response``.
    """
    prompts, labels = [], []
    # Walk the data once and fill both columns together. Two separate passes
    # fetch every record twice and silently yield mismatched columns if `ds`
    # can only be iterated once.
    # NOTE(review): the dataset's `image` feature is declared with decode=True,
    # so each extra pass presumably also re-decodes every image - confirm.
    for record in ds:
        if record['hint'] == '':
            continue
        prompts.append(format_prompt(record, choice_prefixes))
        labels.append(format_label(record, choice_prefixes))
    return pd.DataFrame({'prompt': prompts, 'response': labels})

# Set to True to upload freshly converted CSVs to your own S3 locations;
# otherwise the pre-processed public gists are used for training/validation.
save_to_s3 = False

# Convert the train split exactly once: it is uploaded below when
# save_to_s3 is set, and displayed at the end of the cell either way.
df_train = convert_dataset(dataset['train'])

if save_to_s3:
    # Placeholders - replace with writable S3 URLs before enabling save_to_s3.
    train_url = 's3://...'
    val_url = 's3://...'
    with smart_open(train_url, 'wb') as f:
        df_train.to_csv(f)

    df_val = convert_dataset(dataset['validation'])
    with smart_open(val_url, 'wb') as f:
        df_val.to_csv(f)
else:
    # Gists of the already processed datasets
    train_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_train.csv'
    val_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_val.csv'

df_train

Unnamed: 0,prompt,response
0,Context: The passage below describes an experi...,B
1,Context: The passage below describes an experi...,A
2,Context: This passage describes the myotonia c...,A
3,Context: The diagrams below show two pure samp...,C
4,Context: Below is a food web from an ocean eco...,A
...,...,...
6074,Context: The images below show two pairs of ma...,A
6075,Context: Select the better answer.\nQuestion: ...,A
6076,Context: Read the description of a trait.\nHan...,A
6077,Context: The objects are identical except for ...,A


# Fine-tune
Now, we can fine-tune the model using LLM Engine.

In [5]:
import os
from getpass import getpass

# Never commit an API key to the notebook. Use SCALE_API_KEY from the
# environment when it is already set; otherwise prompt for it without echoing.
# NOTE(review): a literal key was previously hardcoded in this cell and is in
# version-control history - it should be revoked and rotated.
if not os.environ.get('SCALE_API_KEY'):
    os.environ['SCALE_API_KEY'] = getpass('Scale API key: ')

from llmengine import FineTune

# Launch the fine-tuning job on the prepared train/validation CSVs.
response = FineTune.create(
    model="llama-2-7b",
    training_file=train_url,
    validation_file=val_url,
    hyperparameters={
        'lr':2e-4,
    },
    suffix='science-qa-llama'
)
# Keep the job id so later cells can poll the job's status.
run_id = response.id

We poll the job's status once a minute until it reports `SUCCESS`.

In [None]:
import time

# Statuses after which the job can no longer reach 'SUCCESS'. Without this
# check the loop below would spin forever on a failed or cancelled job.
# NOTE(review): names taken from LLM Engine's BatchJobStatus enum - confirm
# against the installed client version.
FAILED_STATUSES = ('FAILURE', 'CANCELLED', 'TIMEOUT')

while True:
    job_status = FineTune.get(run_id).status
    print(job_status)
    if job_status == 'SUCCESS':
        break
    if job_status in FAILED_STATUSES:
        raise RuntimeError(f'Fine-tune job {run_id} ended with status {job_status}')
    # Wait a minute between status checks.
    time.sleep(60)

# Name of the resulting model, used by the inference cell.
fine_tuned_model = FineTune.get(run_id).fine_tuned_model

BatchJobStatus.PENDING
BatchJobStatus.PENDING
BatchJobStatus.PENDING


# Inference and Evaluation
Let's evaluate the new fine-tuned model by running inference against it.

In [None]:
import pandas as pd
from llmengine import Completion

# Query the fine-tuned model for a single prompt, retrying on errors
def get_output(prompt: str, num_retry: int = 5):
    """Return the model's one-token completion for ``prompt``.

    Makes up to ``num_retry`` attempts. Any exception raised during an attempt
    is printed and the next attempt starts immediately. If no attempt
    succeeds, the empty string is returned.
    """
    for _attempt in range(num_retry):
        try:
            completion = Completion.create(
                model=fine_tuned_model,
                prompt=prompt,
                max_new_tokens=1,
                temperature=0.01
            )
            answer = completion.output.text
            return answer.strip()
        except Exception as err:
            print(err)
    # Every attempt failed (or num_retry was 0).
    return ""

# Load the evaluation prompts and reference answers from the validation CSV
test = pd.read_csv(val_url)

# One completion request per prompt; failed requests yield "" and count as wrong.
test["prediction"] = test["prompt"].apply(get_output)

# Share of rows whose predicted letter equals the reference letter, as a percentage.
accuracy = (test['response'] == test['prediction']).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")