ASC Fine-tuning Notebook

In [None]:
#This notebook fine-tunes the 'Orca-2-7b' model for the ASC pipeline.
#After this notebook has been run, the resulting model can be used in the main ASC notebook.

#large chunks of the code have been adapted from this notebook:
#https://colab.research.google.com/drive/1IqL0ay04RwNNcn5R7HzhgBqZ2lPhHloh?usp=sharing
#which comes from this video-tutorial https://www.youtube.com/watch?v=Q9zv369Ggfk

Load dependencies and modules

In [None]:
#mount Google Drive (the data paths used below live under this mount point); skip this cell if Drive is not needed
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#install dependencies

!pip install -Uqqq pip
!pip install -qqq bitsandbytes==0.39.0
!pip install -qqq torch==2.1.0
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71
!pip install -qqq datasets==2.12.0
!pip install -qqq loralib==0.1.1
!pip install -qqq einops==0.6.1
!pip install sentencepiece #if sentencepiece was not installed prior, the runtime has to be restarted

In [None]:
#import packages

import json
import os
from pprint import pprint

import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from pandas import DataFrame
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import sentencepiece

#restrict CUDA to the device with index 0
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

Create finetuning dataset

In [None]:
#load initial data
def _read_even_lines(path):
  """Return every second line of the text file at `path`, starting with the first one.

  Lines at even zero-based positions (0, 2, 4, ...) are kept with their
  trailing whitespace removed; the lines in between are skipped
  (presumably these hold the timestamps - confirm against the files).
  """
  #explicit encoding so that the result does not depend on the platform default
  with open(path, encoding="utf-8") as f:
    return [line.rstrip() for i, line in enumerate(f) if i % 2 == 0]

#provide path to the original Panopto transcription
RNN_1_Panopto_NO_TS = _read_even_lines("/content/drive/MyDrive/thesis data/Subtitles/RNN_1 Panopto - DL.txt")

#provide path to the manually created goldstandard transcription
RNN_1_Goldstandard_NO_TS = _read_even_lines("/content/drive/MyDrive/thesis data/Subtitles/RNN_1 Goldstandard - DL.txt")

In [None]:
#the dataset is constructed to train the model to give appropriate outputs and understand that only a single subtitle sequence is supposed to be output
#the model is supposed to only output the provided subtitles in a corrected way
#that's why the subtitles from the goldstandard are provided as the correct outputs, without additional text
#NOTE(review): retrieve_context is not defined anywhere in this notebook; it has to be defined
#before this cell is run (presumably it comes from the main ASC notebook - confirm)
subtitlecorpus = RNN_1_Panopto_NO_TS #provide panopto transcript with no timestamps
goldstandardcorpus = RNN_1_Goldstandard_NO_TS #provide goldstandard transcript with no timestamps
CHUNK_SIZE = 3 #number of subtitle lines that are joined into one training example
OUTPUT_CSV = '/content/drive/MyDrive/thesis data/finetuning/Finetuning_CSV.csv'

#only complete chunks are used, so up to CHUNK_SIZE-1 trailing lines are left out
#integer division keeps the progress output readable (e.g. "1/33" instead of "1/33.33...")
length = len(subtitlecorpus) // CHUNK_SIZE

records = [] #rows are collected here and turned into a dataframe once, after the loop
for chunk_no in range(1, length + 1):
    start, stop = (chunk_no - 1) * CHUNK_SIZE, chunk_no * CHUNK_SIZE
    subtitles = ' '.join(subtitlecorpus[start:stop]) #take corresponding subtitle sequence from panopto
    goldstandardanswer = ' '.join(goldstandardcorpus[start:stop]) #take corresponding subtitle sequence from goldstandard
    context_info = retrieve_context(subtitles)
    #prompt that will also be used in the main pipeline; the text is kept verbatim on purpose
    #so that the training prompts and the prompts of the main pipeline stay identical
    prompt = f"""
You are tasked with correcting subtitles, which where automatically generated and
therefore incorporate false transcriptions. Especially technical terms are often
incorrectly transcribed. Analyse the sentences and distill the incorrect words out of the sentences and replace
them with the correct terms. Do not make more changes.

Below is the subtitle passage you should correct now:
{subtitles}

Below here is some context information to understand the context of the false transcriptions better:
{context_info}

Please output ONLY the corrected subtitle passage
"""
    records.append({"prompt": prompt, "answer": str(goldstandardanswer)})
    print(f"{chunk_no}/{length} processed...")

#set up dataframe for data
RNN_1_finetuning_prompts = DataFrame(records, columns=["prompt", "answer"])

#save the training data dataframe (the target folder is created if it does not exist yet)
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
RNN_1_finetuning_prompts.to_csv(OUTPUT_CSV)

Load model and prepare it for training

In [None]:
#load model from huggingface
MODEL_NAME = "microsoft/Orca-2-7b" #huggingface hub id of the base model

#quantization settings: the model is loaded in 4bit to reduce computational load
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

#options that are passed on when the model is loaded into memory
model_loading_options = {
    "device_map": "auto",
    "trust_remote_code": True,
    "quantization_config": bnb_config,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_loading_options)


In [None]:
#load tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,use_fast=False)
#reuse the end-of-sequence token as padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model.gradient_checkpointing_enable() #enable gradient checkpointing, used here to prevent memory issues during training
model = prepare_model_for_kbit_training(model) #peft helper that prepares the 4bit-loaded model for training; `model` is rebound to its return value

In [None]:
#define the 'LoRA' config
#'LoRA' is the parameter-efficient finetuning method used here, it will reduce training time greatly
lora_hyperparameters = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM", # has to match model type
}
config = LoraConfig(**lora_hyperparameters)

#wrap the model so that it is trained with LoRA
model = get_peft_model(model, config)

Load training dataset and prepare it for fine-tuning

In [None]:
#load training dataframe which was created in the beginning of this notebook
#the path has to be the one the dataset-creation cell saved the CSV to
data = load_dataset("csv", data_files="/content/drive/MyDrive/thesis data/finetuning/Finetuning_CSV.csv")

In [None]:
#shows what the loaded data looks like before it is formatted and tokenized
data

In [None]:
#can be used to inspect a single element of the training data (here: index 5 of the "train" split, so at least six rows are needed)
data["train"][5]

In [None]:
# define functions to prepare and tokenize the data for the trainer

# build the text of one training example in the prompt format used for the LLM
def generate_prompt(data_point):
  """Return the training text for one row of the training data.

  `data_point` is indexed with the keys "prompt" and "answer". The prompt is
  placed after a "<human>: " tag and the answer on a new line after an
  "<assistant>: " tag; surrounding whitespace of the combined text is removed.
  """
  human_turn = f'<human>: {data_point["prompt"]}'
  assistant_turn = f'<assistant>: {data_point["answer"]}'
  return "\n".join((human_turn, assistant_turn)).strip()

# format one row of the training data with generate_prompt and tokenize the result
def generate_and_tokenize_prompt(data_point):
  """Return the tokenizer output for the full prompt text of `data_point`.

  The text comes from generate_prompt(data_point) and is passed to the
  notebook-level `tokenizer` with padding and truncation switched on.
  """
  return tokenizer(generate_prompt(data_point), padding=True, truncation=True)

In [None]:
#the initial data is being put into correct prompt format and is tokenized for training
#it is also shuffled to break any inherent ordering
SHUFFLE_SEED = 42 #fixed seed so that the shuffled order is the same on every run
data = data["train"].shuffle(seed=SHUFFLE_SEED).map(generate_and_tokenize_prompt)

In [None]:
#shows what the data looks like after it has been shuffled, formatted and tokenized
data

Execute the fine-tuning

In [None]:
#finetune model

#settings of the training run that are handed to the trainer
training_args = transformers.TrainingArguments(
      output_dir="experiments",
      num_train_epochs=1,
      per_device_train_batch_size=1,
      gradient_accumulation_steps=4,
      learning_rate=2e-4,
      lr_scheduler_type="cosine",
      warmup_ratio=0.05,
      optim="paged_adamw_8bit",
      fp16=True,
      logging_steps=1,
      save_total_limit=3,
)

#collator that batches the tokenized examples (mlm=False: no masked language modeling)
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

#initialize trainer with the settings defined above
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=causal_lm_collator,
)
model.config.use_cache = False

#execute training
trainer.train()

In [None]:
#name of the training data, used to give the final model an identifiable name
DATA_NAME = 'ASC_Finetuning'
#target path of the finetuned model: the base model name followed by the data name
#(a "/" inside MODEL_NAME ends up as a directory separator in this path)
finetuned_model_dir = f"{MODEL_NAME}_finetuned_on_{DATA_NAME}"
#save finetuned model
model.save_pretrained(finetuned_model_dir)