In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm


# Set up device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "microsoft/phi-4"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

def calculate_perplexity(text, model_name="microsoft/phi-4"):
    """
    Calculate the perplexity of an LLM given a text input.
    Perplexity = exp(average negative log-likelihood per token)
    
    Args:
        text (str): The text to calculate perplexity for
        model_name (str): Name of the HuggingFace model to use
        
    Returns:
        float: The perplexity score
    """
    
    # Encode the text
    encodings = tokenizer(text, return_tensors="pt").to(model.device)
    
    # Get the sequence length and model's maximum context length
    seq_len = encodings.input_ids.size(1)
    max_length = min(model.config.max_position_embeddings, 2048)  # Use model's max or cap at 2048
    
    # For perplexity calculation, we'll use a sliding window approach if the text is long
    stride = max(1, max_length // 2)  # Use half the max length as stride for efficiency
    
    # Initialize variables to track the cumulative negative log-likelihood and token count
    nlls = []
    total_tokens = 0
    
    # Process the text in chunks with sliding window
    for i in tqdm(range(0, seq_len, stride)):
        # Define the chunk boundaries
        begin_loc = i
        end_loc = min(begin_loc + max_length, seq_len)
        
        # Extract the chunk
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        
        # Create targets by shifting inputs to the right
        # This is how we set up the task for the model to predict the next token
        target_ids = input_ids.clone()
        
        # Calculate loss
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs.loss.item()
        
        # The model calculates average loss, so we need to multiply by the number of tokens
        # to get total loss, subtracting 1 to account for the label shifting
        num_tokens = end_loc - begin_loc - 1
        total_loss = neg_log_likelihood * num_tokens
        
        nlls.append(total_loss)
        total_tokens += num_tokens
        
        # If we've reached the end of the sequence, break
        if end_loc == seq_len:
            break
    
    # Calculate average negative log-likelihood
    avg_nll = sum(nlls) / total_tokens if total_tokens > 0 else 0
    
    # Calculate perplexity
    perplexity = torch.exp(torch.tensor(avg_nll))
    
    return perplexity.item()

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [10]:
calculate_perplexity("I strive for da gold...  And pic my own shit. Move bricks ill break and go pitch da whole load")

  0%|                                                                                                                                                                                          | 0/1 [00:03<?, ?it/s]


1066.717529296875

In [11]:
calculate_perplexity("I strive for the gold... And choose my own things. Move bricks Iâ€™ll break and then go sell the whole load.")

  0%|                                                                                                                                                                                          | 0/1 [00:03<?, ?it/s]


224.47000122070312