In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

# Define the paths
trained_model_path = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.1-8b-CC/base-final"
base_model_path = "meta-llama/Llama-3.1-8B"

# BitsAndBytes configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the tokenizer from the trained model path
tokenizer = AutoTokenizer.from_pretrained(trained_model_path, trust_remote_code=True)

# Do not add special tokens or resize embeddings here

# Load the base model
print("Loading the base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load PEFT configuration
print("Loading the PEFT configuration...")
peft_config = PeftConfig.from_pretrained(trained_model_path)

# Load the PEFT model
print("Loading the PEFT model...")
model = PeftModel.from_pretrained(
    base_model,
    trained_model_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

print("Model loaded successfully.")

# Verify that the tokenizer and model's embeddings are aligned
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Model's embedding size: {model.get_input_embeddings().weight.shape[0]}")


Loading the base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading the PEFT configuration...
Loading the PEFT model...
Model loaded successfully.
Tokenizer vocabulary size: 128256
Model's embedding size: 128256


In [2]:
# Prepare the prompt
messages = [
    {"role": "user", "content": "My age is 30, income is $40,000, and credit score is 580."}
]

def format_chat_messages(messages):
    formatted_prompt = ""
    for message in messages:
        if message["role"] == "user":
            formatted_prompt += "<start_input>\n"
            formatted_prompt += f"{message['content']}\n"
            formatted_prompt += "<end_input>\n"
    formatted_prompt += "<start_label>\n"  # Indicate that assistant's response should follow
    return formatted_prompt

prompt = format_chat_messages(messages)

In [3]:
# Generate response from the model
print("Generating response...")
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Ensure the input is moved to the correct device (e.g., GPU)

# Generate tokens with the model
outputs = model.generate(
    **inputs,
    max_new_tokens=150,  # Adjust based on how long you want the output
    do_sample=True,      # Enable sampling for more varied responses
    temperature=0.7,     # Adjust for response diversity (lower for more deterministic output)
    top_k=50,            # Consider the top-k tokens for sampling
)

# Decode the generated tokens to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print("Response:")
print(response)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating response...


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Response:
<start_input>
My age is 30, income is $40,000, and credit score is 580.
<end_input>
<start_label>
Age: 30 years old, Income: $40,000.0, Credit score: 580.0
<end_label>
<start_prediction>
Income: $40,500.0, Credit score: 585.0
<end_prediction>
<start_output>
Income: 40500.0, Credit score: 585.0
<end_output>
<start_code>
age = 30.0
income = 40000.0
credit_score = 580.0
income_prediction = 40500.0
credit_score_prediction = 585.0
<end_code>
<start_markdown>
# Income and Credit Score Prediction
This is a machine learning project to predict income and credit score
