## Mistral finetuning 

In [None]:
# Model to fine-tune: a Hugging Face hub id or a local path.
# NOTE: both assignments run, so the second one (the hub id) is the value that
# is actually used; the local checkpoint path on the first line is overwritten.
base_model = "D:/2024_projects/llm_recovery/results3/checkpoint-1200"
base_model = "mistralai/Mistral-7B-Instruct-v0.2" ## From kaggle inputs
# Name used when the trained model is saved locally and pushed to the hub.
new_model = "llm_recovery_finetuned"

# Define datasets
# NOTE(review): train and test point at the same CSV -- confirm this is intended.
test_path= "nbroad-v2.csv"
train_path ="nbroad-v2.csv"

### Install required libs

In [None]:
# !pip install -U bitsandbytes
# !pip install transformers==4.36.2
# !pip install -U peft
# !pip install -U accelerate
# !pip install -U trl
# !pip install datasets==2.16.0
# !pip install sentencepiece

### Import libs

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
import re


### Login to HF

In [None]:
# Read the Hugging Face access token from the Colab secret store and log in.
# NOTE(review): presumably `google.colab` is only available inside Colab, while
# other cells use local Windows paths -- confirm which environment is targeted.
from google.colab import userdata
secret_hf = userdata.get('HUGGINGFACE_TOKEN')
# IPython shell escape; the token value is interpolated into the command line.
!huggingface-cli login --token $secret_hf

### Some Utility Funcs for preprocessing

In [44]:
def build_prompt(instruction, original_text, rewritten_text):
    """Assemble the [INST] prompt for one original/rewritten text pair.

    The prompt consists of the instruction, the original text and the
    re-written text, each under its own heading, closed by ``[/INST]`` and
    followed by the ``Answer:`` cue that the model is expected to continue.

    Args:
        instruction: Task description placed under "Instruction:".
        original_text: Text placed under "Original Text:".
        rewritten_text: Text placed under "Re-written Text:".

    Returns:
        The complete prompt string.
    """
    sections = (
        "<s>[INST] Instruction:\n",
        f"{instruction} \n",
        "Original Text:\n",
        f"{original_text} \n",
        "Re-written Text: \n",
        f"{rewritten_text}[/INST]\n",
        "Answer:",
    )
    return "".join(sections)

In [None]:
def remove_numbered_list(text):
    """Strip leading "N. " numbering from each line and join the lines.

    A line counts as a numbered item when everything before its first
    ``'. '`` consists of digits only; such a line is replaced by what follows
    that separator. All other lines are kept unchanged. The resulting lines
    are joined with two spaces.

    Args:
        text: Multi-line string, lines separated by ``'\\n'``.

    Returns:
        A single string with the (possibly de-numbered) lines joined by
        a double space.
    """
    def _strip_number(line):
        # partition() splits at the first '. ' only, like split('. ', 1)
        head, separator, tail = line.partition('. ')
        return tail if separator and head.isdigit() else line

    return '  '.join(_strip_number(line) for line in text.split('\n'))


#trims LLM output to just the response
def trim_to_response(text):
    """Return the part of a decoded generation that follows the last [/INST].

    Before searching, every ``</s>`` tag and every single or double quote
    character is removed from the text. If no ``[/INST]`` marker is present,
    the cleaned text is returned in full.

    Args:
        text: Decoded model output, prompt included.

    Returns:
        The cleaned text after the final ``[/INST]``, or the whole cleaned
        text when the marker is missing.
    """
    cleaned = text
    # end-of-sequence tag first, then quotes the model may wrap its answer in
    for unwanted in ('</s>', '"', "'"):
        cleaned = cleaned.replace(unwanted, '')

    _, marker, tail = cleaned.rpartition("[/INST]")
    return tail if marker else cleaned

#looks for response_start / returns only text that occurs after
def extract_text_after_response_start(full_text):
    """Return the stripped text after the last ``response_start`` marker.

    ``response_start`` is read from module scope. When the marker does not
    occur in ``full_text``, the input is returned unchanged (not stripped).

    Args:
        full_text: Text that may contain the response marker.

    Returns:
        The whitespace-stripped text following the last marker, or
        ``full_text`` itself if the marker is absent.
    """
    _, marker, tail = full_text.rpartition(response_start)
    return tail.strip() if marker else full_text

    
#trims text to requested number of sentences (or first LF or double-space sequence)
def trim_to_first_x_sentences_or_lf(text, x=1):
    """Cut text down to at most ``x`` sentences of its first line.

    Double spaces are treated as line breaks, and only the part before the
    first line break is considered. That part is split on ``'.'`` into
    sentences (empty pieces dropped, each piece stripped).

    Args:
        text: Text to trim.
        x: Maximum number of sentences to keep; values <= 0 yield "".

    Returns:
        The first line unchanged when a line break exists and the line holds
        fewer than ``x`` sentences; otherwise the first ``x`` sentences
        joined with ". ". A trailing period is appended when at least one
        sentence was found and the result does not already end with one.
    """
    if x <= 0:
        return ""

    # a double space is handled exactly like a linefeed
    first_chunk, linefeed, _ = text.replace("  ", "\n").partition('\n')
    sentences = [piece.strip() for piece in first_chunk.split('.') if piece]
    sentence_count = len(sentences)

    if linefeed and sentence_count < x:
        # short first line: keep it verbatim
        trimmed_text = first_chunk
    elif not linefeed and sentence_count <= x:
        trimmed_text = '. '.join(sentences).strip()
    else:
        trimmed_text = '. '.join(sentences[:x]).strip()

    # splitting on '.' removed the final period; restore it
    if sentences and not trimmed_text.endswith('.'):
        trimmed_text += '.'

    return trimmed_text

In [None]:
def get_prompt(model, tokenizer, instruction, orig_text, transformed_text, response_prefix):
    """Generate and post-process the model's guess for one text pair.

    Builds the prompt with ``build_prompt``, generates up to 20 new tokens on
    the "cuda" device, and cleans the decoded output: text after the last
    ``[/INST]``, then after the last ``response_start``, numbered-list
    prefixes removed, trimmed to ``max_sentences_in_response`` sentences.
    ``max_sentences_in_response`` and ``base_line`` are read from module
    scope.

    Args:
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        instruction: Instruction text for the prompt.
        orig_text: Original text for the prompt.
        transformed_text: Re-written text for the prompt.
        response_prefix: Accepted but not used by this function.

    Returns:
        The cleaned answer, or ``base_line`` when the cleaned answer is
        shorter than 15 characters.
    """
    prompt = build_prompt(instruction, orig_text, transformed_text)

    encoded = tokenizer([prompt], return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, max_new_tokens=20, use_cache=True)
    raw_reply = tokenizer.batch_decode(generated)[0]

    candidate = extract_text_after_response_start(trim_to_response(raw_reply))

    # mistral has been replying with numbered lists - clean them up
    candidate = remove_numbered_list(candidate)

    # mistral v02 tends to echo the input after the answer - trim that off
    candidate = trim_to_first_x_sentences_or_lf(candidate, max_sentences_in_response)

    # fall back to the baseline when the answer is empty or unusually short
    return candidate if len(candidate) >= 15 else base_line

# Build Dataset

In [None]:
#original text prefix
orig_prefix = "Original Text:"

# heading that introduces the instruction inside the training text
instruction_prefix = "Instruction:"

# task instruction placed at the top of the prompt (used for the training text and passed to get_prompt)
instruction = "There are two sentences defined as Original Text and Re-written Text below. You will tell what new element was added or change in tone was made to improve it - with no references to the original.  You will avoid mentioning names of characters.  It is crucial no person, place or thing from the original text be mentioned.  For example - You will not say things like 'change the puppet show into a book report' - You would just say 'improve this text into a book report'.  If the original text mentions a specific idea, person, place, or thing - You will not mention it in your answer.  For example if there is a 'dog' or 'office' in the Original text - the word 'dog' or 'office' must not be in your response.  Your answer will be a single sentence."
#modified text prefix
rewrite_prefix = "Re-written Text:"

# marker that starts the model's answer: the training text puts the target
# prompt after it, and extract_text_after_response_start() keeps only what
# follows its last occurrence in the generated text
response_start = "Answer: "

#added after response_start to prime mistral
#"Improve this" or "Improve this text" resulted in non-answers.
#"Improve this text by" seems to produce good results
# NOTE(review): get_prompt() accepts this value but never inserts it into the prompt.
response_prefix = "Improve this text by"

#well-scoring baseline text; get_prompt() returns it when the generated answer is shorter than 15 characters
#thanks to: https://www.kaggle.com/code/rdxsun/lb-0-61
base_line = 'Refine the following passage by emulating the writing style of [insert desired style here], with a focus on enhancing its clarity, elegance, and overall impact. Preserve the essence and original meaning of the text, while meticulously adjusting its tone, vocabulary, and stylistic elements to resonate with the chosen style.Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.' 


In [None]:
# Build the supervised fine-tuning dataset: one formatted "text" column per row.
df = pd.read_csv(train_path)

# Each training example is the full prompt (instruction, original text,
# re-written text) closed by [/INST], followed by the answer marker, the
# target rewrite prompt and the end-of-sequence tag.
# NOTE(review): the text already carries '<s>' and '</s>' while the tokenizer
# cell sets add_eos_token = True -- confirm special tokens are not duplicated.
df['text'] = '<s>[INST] ' + instruction_prefix + '\n' + instruction + '\n' + orig_prefix + '\n' + df['original_text'] + '\n' + rewrite_prefix  +'\n'+  df['rewritten_text']+ '[/INST]'+ '\n'+ response_start  +'\n'+ df['rewrite_prompt'] + '</s>'

# Only the formatted text is needed for training; drop the source columns.
df = df.drop(columns=['id', 'original_text', 'rewrite_prompt', 'rewritten_text'])

# Convert to a Hugging Face Dataset through the public constructor. The former
# intermediate pyarrow dataset was built and immediately overwritten, so it is
# no longer created.
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset

# Load the model

In [None]:
# Load base model
# 4-bit NF4 quantization with bfloat16 as compute dtype, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
# Quantization is requested only through `quantization_config`. Passing
# `load_in_4bit=True` as an extra keyword is redundant and is rejected by
# newer transformers releases when a quantization_config is also given.
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)


model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Shown as the cell output for a quick sanity check of the special tokens.
tokenizer.bos_token, tokenizer.eos_token


### Count token size

In [None]:
# Count training tokens with the tokenizer that is actually used for training,
# over the "text" column only. The previous version tokenized a printed table
# rendering of the dataset (index, header and alignment padding included)
# with an unrelated Llama-2 tokenizer, so its figure did not reflect what the
# trainer sees.
tokens_per_example = [len(ids) for ids in tokenizer(dataset["text"])["input_ids"]]
sum(tokens_per_example)

# Fine-Tune

In [None]:
#Adding the adapters in the layers
# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)
# LoRA settings: rank 64, alpha 16, dropout 0.1, no bias terms, applied to the
# projection modules listed in target_modules.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
# NOTE(review): the same peft_config is also passed to SFTTrainer later on
# although the model is already wrapped here -- confirm it is not applied twice.
model = get_peft_model(model, peft_config)

In [None]:
#Hyperparameters
# Training configuration handed to SFTTrainer: 2 epochs, batch size 4 per
# device, a checkpoint every 50 steps, logging on every step.
training_arguments = TrainingArguments(
    output_dir="./results2",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    # NOTE(review): both mixed-precision flags are off although the model is
    # loaded with torch.bfloat16 -- confirm this is intended.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): presumably warmup has no effect with the plain "constant"
    # scheduler below -- confirm, or consider "constant_with_warmup".
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)


In [None]:
# Setting sft parameters
# Supervised fine-tuning trainer over the "text" column of `dataset`, without
# packing and without an explicit max_seq_length.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# Save and push the adapter to HF

In [None]:
# Save the fine-tuned model
# Writes the trainer's model to the local directory named by `new_model`.
trainer.model.save_pretrained(new_model)
# use_cache was switched off for training; turn it back on for inference
# and put the model in evaluation mode.
model.config.use_cache = True
model.eval()

In [None]:
# Upload the trained model to the Hugging Face hub under the `new_model` name.
trainer.model.push_to_hub(new_model)

# Test the model
  A kernel restart may be needed before this section to free GPU memory.

In [None]:
# Number of sentences get_prompt() keeps from the generated answer.
max_sentences_in_response = 1

### Load base model and peft adapters

In [None]:
# Local path of the base model used for inference.
model_id = "D:/2024_projects/llm_recovery/Mistral"
# Local path of the training checkpoint whose adapter is loaded on top of it.
# NOTE(review): training above writes to "./results2" while this points at
# "results3" -- confirm the intended checkpoint.
peft_model_id = "D:/2024_projects/llm_recovery/results3/checkpoint-2400"

In [None]:
# Load the base model for inference with the same 4-bit settings that were
# used for training (NF4, bfloat16 compute, no double quantization). A bare
# `load_in_4bit=True` falls back to the library defaults, which do not match
# the configuration the adapter was trained against.
inference_bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=inference_bnb_config,
    device_map="auto",
)  # load base model
tokenizer = AutoTokenizer.from_pretrained(model_id)  # load tokenizer

In [None]:
# Attach the fine-tuned adapter from the checkpoint directory to the base model.
model.load_adapter(peft_model_id) # Load peft adapters

In [None]:
# logging.set_verbosity(logging.CRITICAL)
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2000)

### Load test dataset

In [None]:
# Load the evaluation rows and preview the first five.
# NOTE(review): this reads an absolute local path instead of `test_path`
# defined at the top of the notebook -- confirm which file is intended.
df_Test = pd.read_csv("D:/2024_projects/llm_recovery/data/nbroad-v2.csv")
df_Test.head(5)

### Inferencing and get postprocessed outputs

In [None]:
# Run the full generate-and-clean pipeline on a single sample (row 12).
answer = get_prompt(model , tokenizer, instruction ,df_Test.iloc[12]['original_text'], df_Test.iloc[12]['rewritten_text'] , response_prefix)

In [None]:
# Show the post-processed answer for the sample above.
print(answer)

## CoT (Chain-of-thought approach)

In [None]:
# Multiple-choice evaluation with optional chain-of-thought prompting.
# For every row of the test CSV a question prompt is built and the model is
# asked (and re-asked, up to `fail_limit` times) until its reply contains
# "Answer:<option>"; running accuracy is printed after each answered question.
#
# Fixes compared with the previous version of this cell:
#   * build_prompt()/get_prompt() were called with a single argument although
#     they take three and six; the cell now builds and generates its own prompt.
#   * a reply without the "[/INST]" marker is no longer sliced at a bogus offset.
#   * a valid answer obtained on the last allowed attempt is no longer dropped.
#   * the DataFrame loop variable is no longer overwritten inside the loop.
#
# NOTE(review): the cell reads the columns 'Question', 'a', 'b', 'c', 'd' and
# 'Answer' from `test_path` -- confirm that file really has this schema.
df_test = pd.read_csv(test_path)

questionCounter = 0
correct = 0
promptEnding = "[/INST]"
answerTag = "Answer:"

# maximum number of generation attempts per question
fail_limit = 10

# chain of thought activator, model might run out of output tokens
USE_COT = True

# token budget per attempt; chain-of-thought replies need room to reach "Answer:"
cot_max_new_tokens = 512

# this comes before the question
testGuide = 'Answer the following question, at the end of your response write the answer like this: Answer:a or Answer:b or Answer:c or Answer:d \n'


def _format_question(row, suffix):
    """Return the guide, the question and its four options, followed by suffix."""
    return (
        f"{testGuide}{row['Question']}"
        f"\na){row['a']}\nb){row['b']}\nc){row['c']}\nd){row['d']}"
        f"{suffix}"
    )


def _ask_model(question_text):
    """Generate a reply for one question and return only the text after the prompt."""
    prompt = f"[INST] {question_text} {promptEnding}"
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=cot_max_new_tokens, use_cache=True)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    marker_pos = decoded.find(promptEnding)
    # Keep the whole text if the marker is unexpectedly missing.
    return decoded[marker_pos + len(promptEnding):] if marker_pos != -1 else decoded


def _extract_choice(reply):
    """Return the character right after 'Answer:' (spaces ignored), or None."""
    compact = reply.replace(' ', '')
    tag_pos = compact.find(answerTag)
    if tag_pos == -1:
        return None
    option_pos = tag_pos + len(answerTag)
    # edge case - the model stopped right after writing the tag
    if option_pos >= len(compact):
        return None
    return compact[option_pos]


for _, row in df_test.iterrows():
    print("#############################")
    questionCounter += 1

    # first attempt optionally asks for step-by-step reasoning
    question = _format_question(row, '\nfirst think step by step\n' if USE_COT else '\n')
    print(question)

    # true answer
    truth = row['Answer']

    # ask again if the model stopped before reaching the answer
    # NOTE(review): unless sampling is enabled in the model's generation
    # config, repeated attempts with the same question likely yield the same
    # reply -- confirm whether more than two attempts are useful.
    choice = None
    for _attempt in range(fail_limit):
        llmAnswer = _ask_model(question)

        print("LLM Answer:")
        print(llmAnswer)

        choice = _extract_choice(llmAnswer)
        if choice is not None:
            break

        # update question for the next try: remove chain of thought
        question = _format_question(row, '')

    # don't get stuck on a question; it still counts towards questionCounter
    if choice is None:
        continue

    # find and match the option
    if choice in truth:
        correct += 1
        print('correct')
    else:
        print('wrong')

    # update accuracy
    accuracy = correct / questionCounter
    print(f"Progress: {questionCounter/len(df_test)}")
    print(f"Accuracy: {accuracy}")




--------------------------