## Models

In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv

In [None]:
# Read variables from a local .env file; override=True lets them replace
# values that are already set in the process environment.
load_dotenv(override=True)

# Authenticate against the Hugging Face Hub with the token from HF_TOKEN.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

In [None]:
# Hugging Face Hub ids of instruct (chat-tuned) models
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit in your GPU memory, try others from the hub

In [None]:
# Chat history in role/content form: a system instruction followed by the
# user's request.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant",
    },
    {
        "role": "user",
        "content": "Tell a light-hearted joke for a room of Data Scientists",
    },
]

In [None]:
# Quantization config: load the weights in 4-bit so the model takes less
# memory once loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer matching the Llama checkpoint; EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Render the conversation with the model's chat template and tokenize it.
# - add_generation_prompt=True ends the prompt with the tokens that open the
#   assistant's turn, so the model answers instead of continuing the user turn.
# - return_dict=False keeps `inputs` a plain tensor of token ids, which the
#   later cells rely on (they pass it positionally and build the mask with
#   torch.ones_like). No attention mask is returned in this mode.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    padding=True,
    return_tensors="pt",
    return_dict=False,
).to("cuda")

In [None]:
# Load the Llama weights onto the GPU, quantized according to quant_config.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="cuda",
)

In [None]:
# Report the loaded model's footprint, scaled by 1e6 for display in MB.
memory = model.get_memory_footprint() / 1e6
print(
    f"Memory footprint: {memory:,.1f} MB"
)

In [None]:
# Show the loaded model object as this cell's output
model

In [None]:
# Show the tokenized prompt as this cell's output
inputs

In [None]:
# Produce up to 80 new tokens for the tokenized prompt.
outputs = model.generate(
    inputs=inputs,
    max_new_tokens=80,
    pad_token_id=tokenizer.eos_token_id,  # pad_token was set to eos_token above
    attention_mask=torch.ones_like(inputs),  # attend to every prompt position
)

# Decode the first returned sequence, leaving special tokens out of the text.
print(
    tokenizer.decode(outputs[0], skip_special_tokens=True)
)

In [None]:
#Wrapping everything in a function - and adding Streaming and generation prompts
import gc
def generate(model_name, messages):
    """Stream a quantized model's reply to a chat conversation, then free the GPU.

    Loads the tokenizer and the model (quantized with the module-level
    ``quant_config``), renders ``messages`` with the model's chat template,
    and samples up to 80 new tokens while printing them as they are produced.

    Args:
        model_name: Hugging Face Hub id of the model to load.
        messages: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts, as accepted by ``tokenizer.apply_chat_template``.

    Returns:
        None. The reply is written to stdout by ``TextStreamer``.
    """
    # Bind every local up front so the cleanup in ``finally`` is valid even
    # when loading or generation fails part-way through.
    tokenizer = streamer = model = encoded = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
        tokenizer.pad_token = tokenizer.eos_token

        # return_dict=True yields input_ids together with the tokenizer's own
        # attention_mask. Deriving the mask from `ids != pad_token_id` is
        # wrong here: pad and EOS share one id, so any genuine EOS token that
        # the chat template places in the prompt would be masked as padding.
        encoded = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,  # end the prompt with the assistant-turn opener
            padding=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")

        # Print tokens as they are generated, without echoing the prompt.
        streamer = TextStreamer(tokenizer, skip_prompt=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cuda",
            quantization_config=quant_config,
            # NOTE(review): quant_config computes in bfloat16 while this
            # requests float16 -- confirm the mix is intended.
            torch_dtype=torch.float16,
        )

        # The streamer prints the text, so the returned ids are not kept.
        model.generate(
            **encoded,
            max_new_tokens=80,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            streamer=streamer,
            do_sample=True,   # sample instead of greedy decoding
            temperature=0.7,  # control randomness
            top_p=0.9,        # nucleus sampling
        )
    finally:
        # Runs on failure too, so a failed call does not leave the model on
        # the GPU. Drop the references and collect garbage first, then ask
        # the CUDA allocator to release the blocks that became free.
        del tokenizer, streamer, model, encoded
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Run the helper with Phi-3 on the current `messages`; the reply is streamed to the cell output
generate(PHI3, messages)

In [None]:
# Same request without the system message.
# NOTE(review): presumably dropped because Gemma's chat template does not
# accept a "system" role -- confirm against the model card.
messages = [
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]

# Run the helper with Gemma 2 on the user-only conversation
generate(GEMMA2, messages)