In [None]:
import logging
import threading
import time

import requests
import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM

# Configure the root logger at DEBUG level for the whole notebook session.
logging.basicConfig(level=logging.DEBUG)
# Initialize FastAPI
# NOTE(review): run_fastapi() further down creates and serves its own local
# `app`; no visible cell registers routes on this module-level instance.
# Confirm whether it is still needed.
app = FastAPI()

In [None]:
import os

# Define paths
# NOTE(review): confirm this repo id exists on the Hugging Face Hub and matches
# `base_model_name_or_path` in the adapter's adapter_config.json.
BASE_MODEL = "meta-llama/Llama-3.2-8B-Instruct"
# Alternative adapter, kept for reference:
# ADAPTER_PATH = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.2-3b-CC/final-1-epoch/"
ADAPTER_PATH = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.2-8b-CC/base-1-epoch/"
MODELS_DIR = "/opt/notebooks/Chatbot-Credit-Card/backend/models/"

# Show which model directories are available; useful when the adapter path below is wrong.
print(os.listdir(MODELS_DIR))

# Validate the adapter directory BEFORE listing it: os.listdir() on a missing
# directory raises FileNotFoundError, which would pre-empt these messages.
assert os.path.exists(ADAPTER_PATH), f"Path does not exist: {ADAPTER_PATH}"
assert os.path.isfile(os.path.join(ADAPTER_PATH, "adapter_config.json")), "adapter_config.json is missing in the specified path."

# Reuse ADAPTER_PATH rather than repeating the literal so the two cannot drift apart.
print(os.listdir(ADAPTER_PATH))


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Tokenizer is taken from the base checkpoint (not from the adapter directory).
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# 4-bit quantization settings for the base model (if applicable).
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Adjust dtype as needed
    bnb_4bit_use_double_quant=True,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized base model; device placement is delegated to device_map="auto".
base_model_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_model_options)

# Wrap the base model with the fine-tuned adapter weights stored at ADAPTER_PATH.
# NOTE(review): ignore_mismatched_sizes=True can hide a base-model/adapter shape
# mismatch; confirm it is intentional.
adapter_options = dict(trust_remote_code=True, ignore_mismatched_sizes=True)
model = PeftModel.from_pretrained(model, ADAPTER_PATH, **adapter_options)

NameError: name 'BASE_MODEL' is not defined

In [None]:
# Size the model's embedding table to the tokenizer's vocabulary length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Sanity-check output for debugging.
print("Tokenizer vocab size:", vocab_size)
print("Model parameters:", model.num_parameters())

In [None]:
# Requires `tokenizer` (and, for the cells below, `model`) to be loaded by the earlier cells.
# Give the tokenizer a padding token if it does not define one, so that
# tokenizer calls using padding=True have a token to pad with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Reuse the end-of-sequence token as the pad token

# Test the prompt directly
def test_prompt():
    """Smoke-test the loaded model with one hard-coded applicant description.

    Uses the notebook-level `tokenizer` and `model`, generates a single
    continuation and prints it prefixed with "Generated Text:".

    Returns:
        None. The decoded text is printed, not returned.
    """
    # Sample input. Named `prompt_text` so it does not shadow this function's own name.
    prompt_text = (
        "I am a Male who owns a car (yes) and a house (yes). "
        "I earn 427500.0 per year and am in Civil marriage. "
        "My education level is Higher education, and I live in a Rented apartment. "
        "I am 32 years old and have 0 children. "
        "My employment duration is 12.44 years."
    )
    # NOTE(review): per the transformers generation docs, `max_length` bounds
    # prompt + generated tokens together; consider `max_new_tokens` if only the
    # continuation length should be capped.
    max_length = 100
    temperature = 0.8

    # Tokenize and move the tensors to the device the model lives on. The model
    # is loaded with device_map="auto", so a hard-coded "cuda" is not guaranteed
    # to match.
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate text using the model
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Pass attention mask explicitly
        max_length=max_length,
        do_sample=True,  # temperature only takes effect when sampling is enabled
        temperature=temperature,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the single returned sequence, dropping special tokens.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Generated Text:", generated_text)

# Run the smoke test once; it prints the generated text and returns nothing.
test_prompt()

In [None]:
# Start FastAPI in a separate thread
def run_fastapi():
    """Build a FastAPI app exposing POST /generate/ and serve it with uvicorn.

    Meant to be used as a thread target: `uvicorn.run` blocks until the server
    stops. The endpoint uses the notebook-level `tokenizer` and `model`.
    The server binds to 127.0.0.1:8000 (loopback only).
    """
    import uvicorn
    from fastapi import FastAPI
    from pydantic import BaseModel

    # Define FastAPI app (local to this function; independent of any module-level `app`).
    app = FastAPI()

    # Request body for /generate/.
    class PromptRequest(BaseModel):
        prompt: str
        # NOTE(review): forwarded as generate(max_length=...), which per the
        # transformers docs counts prompt tokens too; a prompt longer than this
        # leaves no room for new tokens.
        max_length: int = 50
        temperature: float = 0.7

    # Generation is blocking work. The handler is a plain `def` so FastAPI runs it
    # in its worker threadpool instead of stalling the event loop; the lock keeps
    # generation one-request-at-a-time, as it effectively was before.
    generate_lock = threading.Lock()

    @app.post("/generate/")
    def generate_text(request: PromptRequest):
        """Generate a continuation for `request.prompt` and return it as {"response": text}."""
        # Tokenize with the full tokenizer call so an attention mask is produced,
        # and move tensors to the model's device (loaded with device_map="auto").
        encoded = tokenizer(request.prompt, return_tensors="pt").to(model.device)
        with generate_lock:
            outputs = model.generate(
                encoded.input_ids,
                attention_mask=encoded.attention_mask,  # pad token == eos token, so the mask cannot be inferred
                max_length=request.max_length,
                do_sample=True,  # temperature only takes effect when sampling is enabled
                temperature=request.temperature,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"response": generated_text}

    # Run the FastAPI app (blocks the calling thread).
    uvicorn.run(app, host="127.0.0.1", port=8000)

# Start the server thread.
# daemon=True so the thread does not keep the interpreter alive on shutdown.
server_thread = threading.Thread(target=run_fastapi, daemon=True)
server_thread.start()

# Wait for the server to start
# NOTE(review): a fixed sleep is a best-effort wait; increase it if the first
# request fails with a connection error.
time.sleep(2)

# Define the FastAPI server URL
BASE_URL = "http://127.0.0.1:8000"

# Upper bound (seconds) for the HTTP call so the cell cannot hang forever;
# generous because generation on a large model can be slow.
REQUEST_TIMEOUT_S = 300

# Sample input aligned with the cc data.
# Named `request_payload` (not `test_prompt`) so it does not overwrite the
# `test_prompt()` function defined in an earlier cell.
request_payload = {
    "prompt": (
        "I am a Male who owns a car (yes) and a house (yes). "
        "I earn 427500.0 per year and am in Civil marriage. "
        "My education level is Higher education, and I live in a Rented apartment. "
        "I am 32 years old and have 0 children. "
        "My employment duration is 12.44 years."
    ),
    "max_length": 100,  # Adjust max_length to match complexity
    "temperature": 0.8  # Adjust temperature as needed
}

# Send a POST request to the FastAPI /generate/ endpoint
response = requests.post(
    f"{BASE_URL}/generate/",
    json=request_payload,
    timeout=REQUEST_TIMEOUT_S,
)

# Check the response
if response.status_code == 200:
    print("Generated Text:", response.json()["response"])
else:
    print(f"Error: {response.status_code} - {response.text}")
