In [1]:
# List what is mounted under /kaggle/input/ (the directories referenced by the
# inference command further down live here).
!ls /kaggle/input/

learning-agency-lab-automated-essay-scoring-2  llama-3-8b-lora-fine-tuned-exp-1
llama-3-8b-instruct


In [2]:
%%writefile infer.py

import argparse
import gc
import re
from types import SimpleNamespace

import pandas as pd
import torch
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Switch off the flash and memory-efficient scaled-dot-product-attention CUDA
# backends at import time, i.e. before main() loads the model.
# NOTE(review): presumably needed because the target GPU cannot run these
# kernels — confirm before removing.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

def parse_score(generated_text, default=3, lowest=1, highest=6):
    """Extract an essay score from the text the model generated.

    Args:
        generated_text: Decoded continuation produced by the model.
        default: Value returned when no usable score is found.
        lowest: Smallest valid score (inclusive).
        highest: Largest valid score (inclusive).

    Returns:
        The first integer found in ``generated_text`` when it lies within
        ``[lowest, highest]``; otherwise ``default``.
    """
    match = re.search(r"[0-9]+", generated_text)
    if match is None:
        return default
    score = int(match.group())
    return score if lowest <= score <= highest else default


def main(args):
    """Score every essay in the competition test set and write a submission CSV.

    Loads the causal LM from ``args.model_pth``, attaches the LoRA adapter from
    ``args.lora_pth``, generates a short continuation (no sampling) for each test
    essay, parses a 1-6 score from it and saves the result to ``args.sub_pth``.

    Args:
        args: Namespace providing ``model_pth``, ``lora_pth``, ``sub_pth`` and
            ``max_length``.
    """
    config = SimpleNamespace(
        data_dir = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2',
    )
    # Score substituted when the model output cannot be parsed into 1..6.
    fallback_score = 3

    model     = AutoModelForCausalLM.from_pretrained(
        args.model_pth,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    model     = PeftModel.from_pretrained(model, args.lora_pth)
    tokenizer = AutoTokenizer.from_pretrained(args.model_pth, padding_side='right')
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    def preprocess(sample, text=False, infer_mode=False, max_seq=args.max_length, return_tensors=None):
        """Render one essay as a chat prompt and tokenize it.

        Args:
            sample: Row-like object with ``"full_text"`` (and ``"score"`` when
                ``infer_mode`` is False).
            text: When True, return the formatted prompt string instead of tokens.
            infer_mode: When True, leave the answer empty so the model completes it.
            max_seq: Maximum number of tokens kept after truncation.
            return_tensors: Forwarded to the tokenizer (``"pt"`` for torch tensors).

        Returns:
            The formatted string if ``text`` is True, otherwise the tokenizer
            output with an extra ``"labels"`` entry that copies ``"input_ids"``.
        """
        sys_prompt = "Please read the following essay and assign a score of 1,2,3,4,5,6 where 6 is the best. Output only a single number with no explanation.\n\n"
        prompt = sample["full_text"]
        answer = "" if infer_mode else str(sample["score"])

        messages = [
            {"role": "user", "content": sys_prompt + prompt},
            {"role": "assistant", "content": "\n\nThe score is: " + answer}
        ]
        formatted_sample = tokenizer.apply_chat_template(messages, tokenize=False)
        # NOTE(review): this removes EVERY "<|eot_id|>" marker, not only the one
        # closing the assistant turn, so the inference prompt is structured
        # differently from the non-inference format — confirm this is intended.
        if infer_mode: formatted_sample = formatted_sample.replace("<|eot_id|>","")

        # NOTE(review): truncation drops tokens beyond max_seq; with the default
        # (right-side) truncation an over-long essay would lose the trailing
        # "The score is:" prefix — TODO confirm and truncate the essay instead.
        tokenized_sample = tokenizer(formatted_sample, padding=True, return_tensors=return_tensors,
                                     truncation=True, add_special_tokens=False, max_length=max_seq)

        if return_tensors=="pt":
            tokenized_sample["labels"] = tokenized_sample["input_ids"].clone()
        else:
            tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()

        if text: return formatted_sample
        else: return tokenized_sample

    df_test = pd.read_csv(f'{config.data_dir}/test.csv')
    sub     = pd.read_csv(f'{config.data_dir}/sample_submission.csv')

    test_preds = []
    n_fallback = 0

    for _, row in tqdm(df_test.iterrows(), total=len(df_test)):

        tokenized_sample = preprocess(row, infer_mode=True, max_seq=args.max_length, return_tensors="pt")
        prompt_len = tokenized_sample["input_ids"].shape[1]
        generated_ids = model.generate(**tokenized_sample.to('cuda'), max_new_tokens=2,
                                       pad_token_id=tokenizer.eos_token_id, do_sample=False)

        # Decode only the newly generated tokens: parsing then does not depend on
        # how the prompt text round-trips through the tokenizer.
        new_text = tokenizer.decode(generated_ids[0, prompt_len:], skip_special_tokens=True)

        score = parse_score(new_text, default=None)
        if score is None:
            # Deliberate best-effort: keep going with a neutral score, but count
            # the failures so they are visible instead of silently swallowed.
            n_fallback += 1
            score = fallback_score
        test_preds.append(score)

    if n_fallback:
        print(f"Could not parse a score for {n_fallback} of {len(df_test)} essays; used {fallback_score} instead.")

    # Assumes sample_submission.csv has one row per test essay in the same order
    # as test.csv — TODO confirm.
    sub["score"] = test_preds
    sub["score"] = sub["score"].astype('int')
    sub.to_csv(args.sub_pth, index=False)

    # Release the model before the process exits.
    del model, tokenizer
    torch.cuda.empty_cache(); gc.collect()
    
if __name__ == "__main__":
    # Command-line interface: every option is mandatory.
    # Each entry is (flag, value type, help text).
    cli_options = (
        ("--model_pth",  str, "Path to the pretrained model" ),
        ("--lora_pth",   str, "Path to the PEFT LoRA adapter"),
        ("--sub_pth",    str, "Path to save submission file" ),
        ("--max_length", int, "Max length of input sequence" ),
    )

    parser = argparse.ArgumentParser()
    for flag, value_type, description in cli_options:
        parser.add_argument(flag, type=value_type, required=True, help=description)
    args = parser.parse_args()

    main(args)

Writing infer.py


In [3]:
# Run the infer.py script written by the previous cell: base model from
# --model_pth, LoRA adapter from --lora_pth, predictions saved to --sub_pth.
# NOTE(review): the adapter directory name contains "max-len-1024" while
# --max_length is 2048 here — confirm the longer limit is intentional.
!python infer.py \
    --max_length 2048 \
    --sub_pth submission.csv \
    --model_pth /kaggle/input/llama-3-8b-instruct/Meta-Llama-3-8B-Instruct \
    --lora_pth /kaggle/input/llama-3-8b-lora-fine-tuned-exp-1/Meta-Llama-3-8B-Instruct-max-len-1024-fold-1-exp-1-ckpt

Loading checkpoint shards: 100%|██████████████████| 4/4 [03:29<00:00, 52.31s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|                                                     | 0/3 [00:00<?, ?it/s]2024-04-20 18:25:03.951941: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 18:25:03.952053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 18:25:04.052936: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|█████████████████████████████████████████████| 3/3 [00:23<00:00,  7.86s/it]