In [1]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import requests
import yaml
from dotenv import load_dotenv
import os

# Read the project's .env file into the process environment
load_dotenv()

# Fetch the Hugging Face token from the environment so it is never hardcoded here
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

# Stop immediately when the token is unset or empty
if not HUGGING_FACE_TOKEN:
    raise ValueError("Hugging Face token not found. Please add it to your .env file.")

print("Libraries imported and environment variables loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported and environment variables loaded successfully.


In [2]:
# Load the training configuration from the YAML file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default text encoding.
with open('../configs/training_config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# NOTE(review): in the recorded output, training.learning_rate is loaded as the
# string '1e-5', not a float. Convert it with float(...) (or write 1.0e-5 in the
# YAML) before handing it to anything that expects a number.
print("Configuration loaded:")
print(config)

Configuration loaded:
{'model': {'base_model_name': 'llava-hf/llava-1.5-7b-hf', 'fine_tuned_path': './artifacts/fine_tuned_model', 'quantized_path': './artifacts/quantized_model', 'coreml_output_path': './artifacts/AuraModel.mlmodel'}, 'data': {'dataset_name': 'name-of-your-multimodal-dataset', 'local_path': './data'}, 'training': {'num_epochs': 3, 'learning_rate': '1e-5', 'batch_size_per_device': 4, 'output_dir': './artifacts/training_output'}, 'optimization': {'quantization_type': 'int8'}}


In [5]:
import torch
# --- THE FIX IS HERE ---
from transformers import AutoProcessor, AutoModelForVision2Seq # <- Use the correct class for Vision models
from PIL import Image
import requests
import yaml
from dotenv import load_dotenv
import os

# --- Load environment variables and validate the token ---
# Done explicitly in this cell so it does not depend on an earlier cell having
# been executed in the same kernel session.
load_dotenv()
HUGGING_FACE_TOKEN = os.getenv("HUGGING_FACE_TOKEN")
if not HUGGING_FACE_TOKEN:
    raise ValueError("Hugging Face token not found. Please add it to your .env file.")

# --- Load Configuration ---
# Encoding is pinned so the YAML file is decoded identically on every platform.
with open('../configs/training_config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

model_name = config['model']['base_model_name']

# --- Load the Processor ---
# `token=` is the current spelling of the deprecated `use_auth_token=` argument.
processor = AutoProcessor.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)

# --- Load the Model with the vision-to-sequence auto class ---
# Using float16 to save memory, essential for large models
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    token=HUGGING_FACE_TOKEN,
)

# Move the model to the GPU if available, otherwise keep it on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"Model '{model_name}' loaded successfully on device: {device}")

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

: 