In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
import huggingface_hub


In [3]:

# Hugging Face login (make sure to keep your token secure)
huggingface_hub.login("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")

# Model and dataset paths
base_model = "beomi/Llama-3-Open-Ko-8B"
new_model = "Llama3-Ko-3-8B-baemin"

# Load the dataset from CSV
dataset = load_dataset('csv', data_files='train.csv', split='train')



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\darkb\.cache\huggingface\token
Login successful


In [None]:
# Data preprocessing
def preprocess_qa(example):
    return {
        'text': f"Question: {example['Question']} Answer: {example['Answer']}"
    }

processed_dataset = dataset.map(preprocess_qa, remove_columns=['Question', 'Answer'])

# Check CUDA capabilities
if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

# QLoRA config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA config
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training arguments
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Set up trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=processed_dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

# Start training
trainer.train()

# Save the model
model_save_path = "./results/final_model"
trainer.save_model(model_save_path)

print("Training completed and model saved.")


In [5]:
from pandas import read_csv
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model_and_tokenizer(model_path):
    # 모델 로드
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.config.use_cache = False  # 추론 시 캐시 비활성화로 메모리 사용량 감소

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

def generate_answer(question, model, tokenizer):
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=100,  # 생성 길이 감소
            num_beams=3,     # 빔 검색 수 감소
            early_stopping=True
        )
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split('?')[1].strip() if '?' in answer else answer.strip()

def process_in_batches(questions, model, tokenizer, batch_size=8):
    answers = []
    for i in tqdm(range(0, len(questions), batch_size)):
        batch = questions[i:i+batch_size]
        batch_answers = [generate_answer(q, model, tokenizer) for q in batch]
        answers.extend(batch_answers)
        torch.cuda.empty_cache()
    return answers

# 메인 실행 코드
if __name__ == "__main__":
    model_path = "./results/final_model"
    model, tokenizer = load_model_and_tokenizer(model_path)
    
    test_df = read_csv('test.csv')
    submission_df = read_csv('sample_submission.csv')
    
    submission_df['Answer'] = process_in_batches(test_df['Question'].tolist(), model, tokenizer, batch_size=8)
    
    submission_df.to_csv("submission_results.csv", index=False)
    print("Inference completed and results saved.")

Loading checkpoint shards: 100%|██████████| 6/6 [00:12<00:00,  2.14s/it]
Some parameters are on the meta device device because they were offloaded to the cpu.
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying the parameter named "{key}", '
  error_msgs.append(f'While copying t

RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
from tqdm import tqdm

def load_model(model_path):
    # Set default dtype
    torch_dtype = torch.float16

    # QLoRA config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_use_double_quant=False,
    )

    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    )

    # Load LoRA adapter
    model = PeftModel.from_pretrained(base_model, model_path)
    
    return model

def load_tokenizer(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

def generate_answer(question, model, tokenizer, max_length=150):
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split('Question:')[1].split('Answer:')[1].strip() if 'Answer:' in answer else answer.strip()

def process_batch(questions, model, tokenizer):
    return [generate_answer(q, model, tokenizer) for q in questions]

def main():
    model_path = "./results/final_model"
    batch_size = 8  # Adjust this based on your GPU memory

    print("Loading model...")
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    print("Loading test data...")
    test_df = pd.read_csv('test.csv')
    submission_df = pd.read_csv('sample_submission.csv')

    print("Generating answers...")
    answers = []
    for i in tqdm(range(0, len(test_df), batch_size)):
        batch_questions = test_df['Question'][i:i+batch_size].tolist()
        batch_answers = process_batch(batch_questions, model, tokenizer)
        answers.extend(batch_answers)

    submission_df['Answer'] = answers
    submission_df.to_csv("submission_results.csv", index=False)
    print("Results saved to submission_results.csv")

if __name__ == "__main__":
    main()

Loading model...


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
from tqdm import tqdm
import logging
import json
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_config(config_path):
    with open(config_path, 'r') as f:
        return json.load(f)

def load_model(model_path):
    torch_dtype = torch.float16
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_use_double_quant=False,
    )
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True,
        )
        model = PeftModel.from_pretrained(base_model, model_path)
        return model
    except Exception as e:
        logging.error(f"Error loading model: {e}")
        raise

def load_tokenizer(model_path):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        return tokenizer
    except Exception as e:
        logging.error(f"Error loading tokenizer: {e}")
        raise

def clean_text(text):
    # Remove duplicates and clean up the text
    sentences = text.split('. ')
    unique_sentences = list(dict.fromkeys(sentences))
    cleaned_text = '. '.join(unique_sentences)
    # Remove any remaining repetitions
    cleaned_text = re.sub(r'(.+?)\1+', r'\1', cleaned_text)
    return cleaned_text

def generate_answer(question, model, tokenizer, max_length=150):  # Increased max_length
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=150).to(model.device)
    
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )
        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract the answer part more robustly
        answer_start = full_output.find('Answer:')
        if answer_start != -1:
            answer = full_output[answer_start + 7:].strip()  # +7 to skip 'Answer:'
        else:
            # If 'Answer:' is not found, return the whole output
            answer = full_output.strip()
        
        return clean_text(answer)
    except Exception as e:
        logging.error(f"Error generating answer: {e}")
        return ""

def process_batch(questions, model, tokenizer, max_length=150):
    answers = []
    for q in questions:
        try:
            answer = generate_answer(q, model, tokenizer, max_length=max_length)
            answers.append(answer)
        except Exception as e:
            logging.error(f"Error processing question '{q}': {e}")
            answers.append("")
    return answers

def main():
    config = load_config('config.json')
    model_path = config['model_path']
    batch_size = config['batch_size']
    test_file = config['test_file']
    submission_file = config['submission_file']
    output_file = config['output_file']
    max_answer_length = config.get('max_answer_length', 150)

    logging.info("Loading model...")
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    logging.info("Loading test data...")
    test_df = pd.read_csv(test_file)
    submission_df = pd.read_csv(submission_file)

    logging.info("Generating answers...")
    answers = []
    for i in tqdm(range(0, len(test_df), batch_size)):
        batch_questions = test_df['Question'][i:i+batch_size].tolist()
        batch_answers = process_batch(batch_questions, model, tokenizer, max_length=max_answer_length)
        answers.extend(batch_answers)
        
        # Clear CUDA cache periodically
        if i % (batch_size * 10) == 0:
            torch.cuda.empty_cache()

    submission_df['Answer'] = answers
    submission_df.to_csv(output_file, index=False)
    logging.info(f"Results saved to {output_file}")

if __name__ == "__main__":
    try:
        main()
        torch.cuda.empty_cache()
    except Exception as e:
        logging.error(f"An error occurred: {e}")

  from .autonotebook import tqdm as notebook_tqdm
2024-08-16 22:06:05,047 - INFO - Loading model...
2024-08-16 22:06:05,618 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 6/6 [00:19<00:00,  3.20s/it]
2024-08-16 22:06:25,272 - INFO - Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!
2024-08-16 22:06:25,679 - INFO - Loading test data...
2024-08-16 22:06:25,682 - INFO - Generating answers...
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 13/13 [16:19<00:00, 75.33s/it]
2024-08-16 22:22:45,019 - INFO - Results saved to submission_results.csv
