# Finetuning Llama-2 models
- Initially fine-tuned via USC HPC clusters
- With this script, it is possible to fine-tune on an A100 or V100 GPU with a Google Colab Pro subscription or above

In [None]:
# Mount Google Drive at /content/drive; the paths used in the rest of this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Make the Finetuning folder on Drive the current working directory.
os.chdir('/content/drive/MyDrive/Finetuning')

# Print the working directory to confirm the change took effect.
print(os.getcwd())

# NOTE(review): this %cd targets the same folder as the os.chdir call above,
# so it looks redundant — confirm before removing either one.
%cd /content/drive/MyDrive/Finetuning

In [None]:
# Clone the LLaMA-Efficient-Tuning repository into the current working directory.
!git clone https://github.com/hiyouga/LLaMA-Efficient-Tuning.git

In [None]:
!pip install --upgrade pip
!pip install bitsandbytes>=0.39.0
!pip install -r /content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/requirements.txt
!pip install trl==0.7.2

In [None]:
import os
# Point the TRANSFORMERS_CACHE environment variable at a folder on Drive.
# NOTE(review): presumably so downloaded model files persist across Colab
# sessions — confirm.
os.environ["TRANSFORMERS_CACHE"] = "/content/drive/MyDrive/Finetuning/.cache/huggingface/"

In [None]:
# Location of the cloned LLaMA-Efficient-Tuning checkout on Drive.
efficient_finetuning_folder = "/content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning" #absolute path

# File names of the datasets used by this notebook.
train_gpt_4 = "train_gpt-4.json"
test = "test.json" # placeholder file, not the actual test set

In [None]:
import json
def add_json_file(efficient_finetuning_folder, json_file_name):
    """Register a dataset file in the checkout's ``data/dataset_info.json``.

    The registry is loaded, an entry ``{key: {"file_name": json_file_name}}``
    is added (replacing any existing entry with the same key), and the file
    is written back with 4-space indentation. A confirmation line is printed.

    Args:
        efficient_finetuning_folder: Path of the LLaMA-Efficient-Tuning
            checkout; the registry is ``<folder>/data/dataset_info.json``.
        json_file_name: Name of the dataset file. A trailing ``.json`` is
            dropped to form the registry key; any other name is used as-is.

    Raises:
        FileNotFoundError: If ``dataset_info.json`` does not exist.
        json.JSONDecodeError: If the registry is not valid JSON.
    """
    data_info_file = f"{efficient_finetuning_folder}/data/dataset_info.json"

    with open(data_info_file, 'r', encoding='utf-8') as f:
        data_info = json.load(f)

    # Strip only a trailing ".json". str.replace would also delete the
    # substring from the middle of a name and mangle "x.jsonl" into "xl".
    extension = '.json'
    if json_file_name.endswith(extension):
        new_key = json_file_name[:-len(extension)]
    else:
        new_key = json_file_name

    data_info[new_key] = {'file_name': json_file_name}

    with open(data_info_file, 'w', encoding='utf-8') as f:
        json.dump(data_info, f, indent=4)

    print(f'Added {new_key} to dataset_info.json')

# Register the training file and the placeholder test file in dataset_info.json.
add_json_file(efficient_finetuning_folder, train_gpt_4)
add_json_file(efficient_finetuning_folder, test)

In [None]:
# Log in to the Hugging Face Hub; replace the placeholder below with your own
# access token before running.
# NOTE(review): presumably required because the meta-llama checkpoints are
# access-gated — confirm.
!huggingface-cli login --token ### HUGGINGFACE API KEY ###
# Switch into the repository checkout; the prediction commands further down
# use the relative path src/train_bash.py.
%cd /content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning

# FACT-GPT training

In [None]:
def train_valid_llama(model_size, train_data):
    """Print the notebook shell command that runs LoRA SFT on a Llama-2 chat model.

    Nothing is executed here. The printed line starts with ``!`` so it can be
    pasted into a notebook cell and run from there.

    Args:
        model_size: Size tag inserted into the model id
            ``meta-llama/Llama-2-{model_size}-chat-hf`` (e.g. ``'7b'``).
        train_data: Dataset name passed to ``--dataset``; it is also part of
            the output directory name.
    """
    output_dir = f"/content/drive/MyDrive/Finetuning/train_valid_{model_size}_{train_data}"
    pieces = [
        "!CUDA_VISIBLE_DEVICES=0 python /content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/src/train_bash.py",
        "--stage sft",
        f'--model_name_or_path "meta-llama/Llama-2-{model_size}-chat-hf"',
        "--do_train",
        f'--dataset "{train_data}"',
        '--template "default"',
        '--finetuning_type "lora"',
        '--lora_target "q_proj,v_proj"',
        f'--output_dir "{output_dir}"',
        "--overwrite_cache",
        "--per_device_train_batch_size 4",
        "--gradient_accumulation_steps 4",
        '--lr_scheduler_type "cosine"',
        "--logging_steps 1",
        "--save_steps 61",
        "--val_size 0.2",
        "--evaluation_strategy steps",
        "--eval_steps 61",
        '--learning_rate "5e-5"',
        "--num_train_epochs 3.0",
        "--plot_loss True",
        "--fp16",
    ]

    # One space plus an eight-space indent between pieces: the spacing of a
    # backslash-continued, indented command collapsed onto a single line.
    separator = " " + " " * 8
    print(separator.join(pieces))

In [None]:
def test_llama(model_size, train_data):

    command = f"""!CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --stage sft \
    --model_name_or_path 'meta-llama/Llama-2-{model_size}-chat-hf' \
    --do_predict \
    --dataset 'test' \
    --template 'default' \
    --finetuning_type 'lora' \
    --checkpoint_dir '/content/drive/MyDrive/Finetuning/train_valid_{model_size}_{train_data}' \
    --output_dir '/content/drive/MyDrive/Finetuning/train_valid_{model_size}_{train_data}/test-endpoint' \
    --per_device_eval_batch_size 8 \
    --max_samples 10000 \
    --temperature 0.01 \
    --top_p 0.01 \
    --predict_with_generate"""

    print(command)

### Train

In [None]:
# Print the training commands for the 13b and 7b models on the train_gpt-4 dataset.
train_valid_llama('13b', 'train_gpt-4')
train_valid_llama('7b', 'train_gpt-4')

In [None]:
# Run fine-tuning for the 13b model and then the 7b model. These lines match
# the command text that train_valid_llama builds for ('13b', 'train_gpt-4')
# and ('7b', 'train_gpt-4').
!CUDA_VISIBLE_DEVICES=0 python /content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/src/train_bash.py         --stage sft         --model_name_or_path "meta-llama/Llama-2-13b-chat-hf"         --do_train         --dataset "train_gpt-4"         --template "default"         --finetuning_type "lora"         --lora_target "q_proj,v_proj"         --output_dir "/content/drive/MyDrive/Finetuning/train_valid_13b_train_gpt-4"         --overwrite_cache         --per_device_train_batch_size 4         --gradient_accumulation_steps 4         --lr_scheduler_type "cosine"         --logging_steps 1         --save_steps 61         --val_size 0.2         --evaluation_strategy steps         --eval_steps 61         --learning_rate "5e-5"         --num_train_epochs 3.0         --plot_loss True         --fp16
!CUDA_VISIBLE_DEVICES=0 python /content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/src/train_bash.py         --stage sft         --model_name_or_path "meta-llama/Llama-2-7b-chat-hf"         --do_train         --dataset "train_gpt-4"         --template "default"         --finetuning_type "lora"         --lora_target "q_proj,v_proj"         --output_dir "/content/drive/MyDrive/Finetuning/train_valid_7b_train_gpt-4"         --overwrite_cache         --per_device_train_batch_size 4         --gradient_accumulation_steps 4         --lr_scheduler_type "cosine"         --logging_steps 1         --save_steps 61         --val_size 0.2         --evaluation_strategy steps         --eval_steps 61         --learning_rate "5e-5"         --num_train_epochs 3.0         --plot_loss True         --fp16

In [None]:
# Print the prediction commands for the 13b and 7b models fine-tuned on train_gpt-4.
test_llama('13b', 'train_gpt-4')
test_llama('7b', 'train_gpt-4')

In [None]:
# Generate predictions on the 'test' dataset with the 13b and then the 7b
# checkpoint. src/train_bash.py is a relative path, so the working directory
# must be the LLaMA-Efficient-Tuning checkout.
!CUDA_VISIBLE_DEVICES=0 python src/train_bash.py     --stage sft     --model_name_or_path 'meta-llama/Llama-2-13b-chat-hf'     --do_predict     --dataset 'test'     --template 'default'     --finetuning_type 'lora'     --checkpoint_dir '/content/drive/MyDrive/Finetuning/train_valid_13b_train_gpt-4'     --output_dir '/content/drive/MyDrive/Finetuning/train_valid_13b_train_gpt-4/test-endpoint'     --per_device_eval_batch_size 8     --max_samples 10000     --temperature 0.01     --top_p 0.01     --predict_with_generate
!CUDA_VISIBLE_DEVICES=0 python src/train_bash.py     --stage sft     --model_name_or_path 'meta-llama/Llama-2-7b-chat-hf'     --do_predict     --dataset 'test'     --template 'default'     --finetuning_type 'lora'     --checkpoint_dir '/content/drive/MyDrive/Finetuning/train_valid_7b_train_gpt-4'     --output_dir '/content/drive/MyDrive/Finetuning/train_valid_7b_train_gpt-4/test-endpoint'     --per_device_eval_batch_size 8     --max_samples 10000     --temperature 0.01     --top_p 0.01     --predict_with_generate

# Put inference results into dataset

In [None]:
import pandas as pd

# Add each fine-tuned model's 'predict' column to the FACT-GPT dataset as a
# new column, then write the result back over the same CSV.
dataset_csv = '/content/drive/MyDrive/FACT-GPT dataset.csv'
df = pd.read_csv(dataset_csv)

# (suffix of the new column, training-set tag inside the run directory name)
label_sources = [
    ('gpt_4', 'gpt-4'),
    ('gpt_3_5', 'gpt-3_5'),
    ('70b', 'llama_2_70b'),
]

# NOTE(review): the run directories read here end in "_balanced", which the
# training commands in this notebook do not produce — confirm these outputs
# come from separate runs.
for size in ('13b', '7b'):
    for column_suffix, train_tag in label_sources:
        predictions_path = (
            '/content/drive/MyDrive/Finetuning/'
            f'train_valid_{size}_train_{train_tag}_balanced'
            '/test-endpoint/generated_predictions.jsonl'
        )
        predictions = pd.read_json(predictions_path, lines=True)
        df[f'{size}_finetuned_on_{column_suffix}'] = predictions['predict']

df.to_csv(dataset_csv, index=False)

# Unassign the Colab runtime

In [None]:
# Unassign the Colab runtime.
# NOTE(review): presumably done so the session stops using compute once the
# notebook has finished — confirm.
from google.colab import runtime
runtime.unassign()