## Imports

In [None]:
import huggingface_hub
import os

import torch
from huggingface_hub import login
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import transformers
import matplotlib.pyplot as plt

from utils import bytes_to_giga_bytes

## Constants

In [None]:
# Report the CUDA memory currently allocated by tensors, scaled from bytes by 1e9 (GB).
torch.cuda.memory_allocated()/1e9

In [None]:
# Location of the file holding the Hugging Face access token.
TOKEN_PATH = ".secrets/hf_token.txt"
# Hub identifier of the checkpoint to load.
MODEL_ID = "meta-llama/Meta-Llama-3-8B"

# Quantization settings handed to `from_pretrained`: request 4-bit loading.
QUANTIZATION_CONFIG = dict(load_in_4bit=True)

## Download Model and Tokenizer

In [None]:
# Load the Hugging Face access token stored at TOKEN_PATH (surrounding
# whitespace/newlines removed) and authenticate with the Hub.
with open(TOKEN_PATH) as token_file:
    hf_token = token_file.read().strip()

login(token=hf_token)

In [None]:
# Load the causal language model with the quantization settings defined in
# QUANTIZATION_CONFIG; `device_map=0` places it on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=QUANTIZATION_CONFIG,
    device_map=0,
)

In [None]:
# Plot a subset of the weights in the model
def plot_weights(model, layer=0):
    """
    Show the weight tensor of one layer as an image with a colour bar.

    The tensor is read from `model.transformer.h[layer].weight`, detached,
    moved to the CPU and converted to a NumPy array before plotting.

    NOTE(review): this attribute path has to exist on the model passed in --
    confirm it matches the architecture that is actually loaded.
    """
    selected_layer = model.transformer.h[layer]
    weight_matrix = selected_layer.weight.detach().cpu().numpy()

    plt.imshow(weight_matrix, aspect='auto')
    plt.colorbar()
    plt.show()

In [None]:
# Load the tokenizer published under the same MODEL_ID as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Print out memory usage

In [None]:
# Report the current CUDA memory allocation through the `bytes_to_giga_bytes`
# helper imported from `utils` (presumably a bytes -> GB conversion; the
# helper is not shown here -- verify).
bytes_to_giga_bytes(torch.cuda.memory_allocated())

Write a function for making a forward pass

In [None]:
# Quick check of the tokenizer: encode a sample sentence, asking for a
# PyTorch tensor (`return_tensors="pt"`) instead of a plain list of ids.
tokenizer.encode("Hello, my dog is cute", return_tensors="pt")

In [None]:
def get_n_word_prob_dict(prompt, model, tokenizer, n=5):
    """
    Return the one-token continuation of `prompt` together with the `n` most
    likely next tokens and their probabilities.

    Args:
        prompt: Text to condition the prediction on.
        model: Causal language model. It is called on the encoded prompt and
            must expose `.device` and `.generate(...)`; the call result must
            carry a `.logits` tensor.
        tokenizer: Tokenizer providing `encode(..., return_tensors='pt')` and
            `decode(...)`.
        n: Number of candidate next tokens to report. Values larger than the
            vocabulary size are capped; values <= 0 give an empty dictionary.

    Returns:
        A tuple `(output_seq, prob_dict)`. `output_seq` is the decoded prompt
        extended by one generated token. `prob_dict` maps each decoded
        candidate token to its probability, ordered from most to least
        likely. If two token ids decode to the same text they share one
        dictionary key, and the later (less likely) value is the one kept.
    """
    # Tokenize the prompt and move it to the device the model lives on.
    encoded_input = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(encoded_input)

        # Logits for the position following the last prompt token. The softmax
        # is computed in float32 so half-precision logits keep their accuracy
        # and bfloat16 results can still be converted to NumPy afterwards.
        probs = outputs.logits[0, -1].float().softmax(dim=-1)

        # topk avoids sorting the whole vocabulary just to keep n entries.
        k = max(0, min(n, probs.shape[-1]))
        top_probs, top_tokens = torch.topk(probs, k)

        # Extend the prompt by exactly one token.
        # NOTE(review): generate() follows the model's generation settings; if
        # sampling is enabled there, this token need not be the most likely
        # one reported in the dictionary -- confirm that is intended.
        generated = model.generate(encoded_input, max_length=encoded_input.shape[-1] + 1)

    # Decode each candidate id on its own so every entry is a single token.
    top_n_words = [tokenizer.decode(token_id) for token_id in top_tokens.cpu().tolist()]
    output_seq = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Pair decoded tokens with their probabilities (NumPy scalars).
    return output_seq, dict(zip(top_n_words, top_probs.cpu().numpy()))

In [None]:
def plot(prob_dict, prompt):
    """
    Draw a bar chart with one bar per key of `prob_dict`, using the mapped
    value as bar height.

    The chart title is `prompt` followed by ' . . .', and the y-axis is
    labelled 'Probability'. Returns the Matplotlib axes holding the chart.
    """
    words = list(prob_dict.keys())
    probabilities = list(prob_dict.values())

    _, ax = plt.subplots()
    ax.bar(words, probabilities)
    ax.set_ylabel('Probability')
    ax.set_title(prompt + ' . . .')
    return ax

## Make Predictions

In [None]:
# Example prompt whose next token is to be predicted.
prompt = 'I went to the supermarket and bought a'
# Get the one-token continuation plus the 50 highest-probability next tokens.
output_seq, prob_dict = get_n_word_prob_dict(prompt, model, tokenizer, n=50)

In [None]:
# Display the candidate-token -> probability mapping.
prob_dict

In [None]:
# Report the peak CUDA memory allocated by tensors so far, scaled from bytes by 1e9 (GB).
torch.cuda.max_memory_allocated()/1e9