In [1]:
#!pip install --upgrade transformers
#pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install bitsandbytes
#!pip install accelerate 

# Load and Test LLM (llama-3.2-1B)

In [None]:
# Import required Libraries

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb

torch.cuda.is_available()

True

In [3]:
#assign llama 3.3 to variable name
model_id = "meta-llama/Llama-3.2-1B"

#"meta-llama/Llama-3.3-70B-Instruct"

# Load Model on CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

#model check points for memory optimization
#quantization_config = BitsAndBytesConfig(load_in_8bit=True)
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

quantized_model = AutoModelForCausalLM.from_pretrained(model_id, 
                                                       device_map="auto",
                                                       low_cpu_mem_usage=True,
                                                       torch_dtype=torch.bfloat16,
                                                       quantization_config=quantization_config)  #.to(device)

#print(f"Model loaded on {device}")

In [None]:
print(quantized_model.hf_device_map)  # Shows layer-device mapping
#if model output {'': 0} means the entire model is loaded on GPU 0 (CUDA device 0)#
#This confirms the model is running on CUDA and has been correctly assigned by device_map="auto".

{'': 0}


In [5]:
#check that model is running on GPU

# Check if gpu is available
print(torch.cuda.is_available())  

# Return the number of GPUs available
print(torch.cuda.device_count())  

# Shows the GPU model
print(torch.cuda.get_device_name(0))  

True
1
NVIDIA GeForce RTX 4060 Laptop GPU


In [6]:
# Load the tokenizer for your LLaMA model
#tokenizer = AutoTokenizer.from_pretrained(model_id)

#input_text = "What is your opinion on Manchester United?"
#inputs = tokenizer(input_text, return_tensors="pt").to(device)

#with torch.no_grad():
#    output = quantized_model.generate(**inputs,
#                                      max_length=150,
#                                      temperature=0.7,
#                                      top_k = 54,
#                                      top_p=0.9,
#                                      repetition_penalty = 1.3,
#                                      eos_token_id=tokenizer.eos_token_id
#                                      )

#response = tokenizer.decode(output[0], skip_special_tokens=True)
#print(response)

In [None]:
#Prompts
input_texts = ["How does Manchester United’s dominance in the 1990s compare to Manchester City’s recent success?",
               "What is your opinion on Manchester United?",
               "Who is the most underrated midfielder in the Premier League right now, and why?",
               "How has Arsenal’s playing style changed under Mikel Arteta compared to previous managers?",
]            

In [18]:
def agent_response(user_input, quantized_model,model_id):
    """
    Generates a response from the quantized model for a single input or a list of inputs.

    Args:
        user_input (str or list): The input text(s) for the model.
        quantized_model: The quantized model instance.
        model_id (str): The model ID for loading the tokenizer.

    Returns:
        list: A list of input-response pairs.
    """
    
        
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Set padding side to avoid incorrect input handling
    tokenizer.padding_side = "left"  

    tokenizer.pad_token = tokenizer.eos_token  # Ensure there's a pad token set

    results = []

    # Ensure user_input is a list
    if not isinstance(user_input, list):
        user_input = [user_input]  # Convert single string to a list

    # Process each sentence in the user input
    for sent in user_input:
        inputs = tokenizer(sent, return_tensors="pt").to('cuda')

        with torch.no_grad():
            output = quantized_model.generate(
               **inputs,
                max_length=250,
                temperature=0.6,
                top_k=27,
                top_p=0.6,
                repetition_penalty=1.9, #higher values discourage text generation with tokens in prompt
                eos_token_id= tokenizer.eos_token_id,
                do_sample = True
            )

        response = tokenizer.decode(output[0], skip_special_tokens=True)

        # Avoid redundancy: ensure the response does not repeat the input
        if response.startswith(sent):
            response = response[len(sent):].strip()

        results.append((sent, response))  # Store input-response pair

    return results





In [None]:
#Generate model response
agent_response(input_texts,quantized_model,model_id)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[('How does Manchester United’s dominance in the 1990s compare to Manchester City’s recent success?',
  'The answer is simple: it doesn’t. While there are many factors that have contributed towards Man Utd being a dominant force, one of them has been their ability and willingness (and sometimes lack thereof) when recruiting.\nThe following article will examine some key players who were recruited by both clubs during different periods – specifically looking at those from before they became world beaters; after becoming World Beatners for good! So let us begin!\nManchester united was founded on November\n1st,1919 as an amalgamation between several smaller teams including Everton F.C., Bolton Wanderer FC &apos ; s Rangers Football Club plus Preston North End A.F C. However despite this initial merger all four sides remained independent until January whereupon each club had its own identity once again with only two exceptions i.e Liverpool football team which joined up later than others wh

The model as expected generates responses. However, it tends to end its response mid sentence.
Further research to determine potential cause and solutions for iterative experimentation later.