In [11]:
# ==============================================================================
# 1. SETUP AND INSTALLATION
# ==============================================================================

# Install the essential libraries using pip. The '-q' flag ensures a quiet installation.
# - transformers: Provides the core architecture (like AutoModelForCausalLM) and tools to work with LLMs.
# - torch: The backend tensor library for model computations.
# - datasets: A convenient library for downloading and using standard evaluation benchmarks like HumanEval.
# - accelerate: Helps in efficiently loading and running large models on available hardware, like a GPU.
!pip install transformers torch datasets accelerate -q

# Import the necessary classes and modules for the script.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import re  # The regular expression module, used for cleaning up generated code.
from tqdm import tqdm  # A utility for creating smart, descriptive progress bars for loops.

print("Libraries installed and imported successfully.")

# ==============================================================================
# 2. MODEL AND TOKENIZER INITIALIZATION
# ==============================================================================

# Set the computational device. We check if a CUDA-enabled GPU is available
# (which is standard in Colab T4 runtimes) and fall back to the CPU if not.
# Running on a GPU is significantly faster for model inference.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Define the model checkpoint from the Hugging Face Hub.
# 'Salesforce/codegen-350M-mono' is a 350-million parameter model specifically
# trained on Python code ('mono' means single-language). It's "non-gated,"
# meaning we can use it without needing to log in or accept special terms.
checkpoint = "Salesforce/codegen-350M-mono"

# Load the tokenizer associated with the chosen model. The tokenizer is responsible
# for converting human-readable text into a sequence of numerical tokens that the model understands.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the pre-trained model itself.
# - torch_dtype=torch.bfloat16: This loads the model weights in a 16-bit format,
#   which uses half the memory of standard 32-bit floats, making it run faster
#   and fit more easily into the GPU's memory without a significant loss in accuracy.
# - .to(device): This crucial step moves the entire model onto the selected device (the GPU).
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
).to(device)

print(f"Model '{checkpoint}' loaded successfully.")

# ==============================================================================
# 3. CODE GENERATION FUNCTION
# ==============================================================================

def generate_code(prompt: str, max_new_tokens: int = 150) -> str:
    """
    Generates a code completion for a given prompt using the loaded LLM.
    """
    # First, the tokenizer converts the input string `prompt` into numerical tensors.
    # .to(device) moves these tensors to the GPU to be processed by the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Call the model's generate method to perform inference.
    # - **inputs: Unpacks the tokenized input tensors.
    # - max_new_tokens: Limits the length of the generated code to prevent overly long or rambling outputs.
    # - temperature=0.2: Controls randomness. A lower value makes the output more deterministic and focused.
    # - top_p=0.95: Nucleus sampling, which considers only the most probable tokens that make up 95% of the probability mass.
    # - do_sample=True: Enables sampling-based generation, which is necessary for temperature and top_p to have an effect.
    # - pad_token_id: Sets the padding token to the "end-of-sentence" token, a common practice.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.2,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # The model's output is a tensor of token IDs. The tokenizer's decode method
    # converts these IDs back into a human-readable string.
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The generated output includes our original prompt. We slice the string
    # to return only the newly generated code that comes after the prompt.
    return full_text[len(prompt):]

print("Code generation function is defined.")

# ==============================================================================
# 4. EVALUATION FRAMEWORK
# ==============================================================================

# Load the HumanEval dataset from the Hugging Face Hub. This dataset consists of
# 164 programming problems, each with a prompt, a canonical solution, and unit tests.
human_eval_dataset = load_dataset("openai_humaneval")

def create_full_script(prompt: str, generated_code: str) -> str:
    """
    Cleans the generated code and combines it with the original prompt
    to create a complete, executable Python script.
    """
    # Models sometimes generate more than just the function body (e.g., another function
    # definition). We define "stop words" to truncate the generation at the first
    # sign that it's moving on from the intended function.
    stop_words = ["\n\n", "\ndef", "\nclass", "\nif __name__"]
    stop_pattern = re.compile("|".join(map(re.escape, stop_words)))
    match = stop_pattern.search(generated_code)
    if match:
        generated_code = generated_code[:match.start()]

    # Concatenate the original prompt (which includes the function signature)
    # with the cleaned, generated function body.
    script = prompt + generated_code
    return script

def run_test(script: str, test_code: str) -> bool:
    """
    Executes the generated script along with its corresponding unit tests safely.
    Returns True if the tests pass, and False otherwise.
    """
    try:
        # Combine the generated function with the provided unit tests from HumanEval.
        full_test_script = script + "\n" + test_code

        # IMPORTANT: `exec()` runs Python code from a string. This is powerful but can be
        # risky if the code is malicious. Here, we execute it in a restricted, empty
        # namespace (`exec_globals`) to limit its scope and potential for harm.
        exec_globals = {}
        exec(full_test_script, exec_globals)

        # If `exec` completes without raising an error (like an AssertionError from a failing test),
        # it means all unit tests passed.
        return True
    except (AssertionError, Exception):
        # If any exception occurs (e.g., a test fails, syntax error in generated code),
        # we catch it and consider the test failed.
        return False

print("Evaluation helper functions are defined.")

# ==============================================================================
# 5. MAIN EVALUATION LOOP
# ==============================================================================
print("\nStarting evaluation on the HumanEval dataset...")

num_correct = 0
total_problems = len(human_eval_dataset["test"])

# The `tqdm` wrapper provides a real-time progress bar for the loop.
for problem in tqdm(human_eval_dataset["test"], desc="Evaluating"):
    prompt = problem["prompt"]
    test = problem["test"]

    # Step 1: Call our function to generate code based on the problem's prompt.
    generated_code = generate_code(prompt)

    # Step 2: Combine the prompt and the generated code into a clean, single script.
    full_script = create_full_script(prompt, generated_code)

    # Step 3: Execute the script against the problem's unit tests.
    is_correct = run_test(full_script, test)

    # Step 4: If the tests passed, increment our success counter.
    if is_correct:
        num_correct += 1

# ==============================================================================
# 6. DISPLAY RESULTS
# ==============================================================================

# Calculate the pass@1 score as a percentage. This metric represents the
# percentage of problems the model solved correctly on its very first attempt.
pass_at_1 = (num_correct / total_problems) * 100

# Print a formatted summary of the evaluation results.
print("\n" + "="*50)
print("EVALUATION COMPLETE")
print("="*50)
print(f"Total problems evaluated: {total_problems}")
print(f"Problems solved correctly: {num_correct}")
print(f"pass@1 Score: {pass_at_1:.2f}%")
print("="*50)

Libraries installed and imported successfully.
Using device: cuda


Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

Model 'Salesforce/codegen-350M-mono' loaded successfully.
Code generation function is defined.
Evaluation helper functions are defined.

Starting evaluation on the HumanEval dataset...


Evaluating: 100%|██████████| 164/164 [12:57<00:00,  4.74s/it]


EVALUATION COMPLETE
Total problems evaluated: 164
Problems solved correctly: 150
pass@1 Score: 91.46%





In [12]:
# ==============================================================================
# INFERENCE FUNCTION
# ==============================================================================
def get_code_completion(prompt: str, max_new_tokens: int = 120) -> str:
    """
    Performs inference and generates code completion for a given prompt.

    Args:
        prompt: The starting code snippet, including function signature and docstring.
        max_new_tokens: The maximum number of new tokens to generate.

    Returns:
        The generated code as a string.
    """
    # Tokenize the input prompt: The tokenizer converts the string `prompt` into
    # a sequence of numerical tokens that the model can understand.
    # The result is a tensor, which is moved to the GPU (`.to(device)`).
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate code using the model. This is the core inference step.
    # The parameters below control how the model generates the text:
    # - **inputs: The tokenized prompt.
    # - max_new_tokens: Sets a limit on the length of the generated code.
    # - temperature=0.2: Controls the randomness of the output. A lower value like 0.2
    #   makes the model's choices more deterministic and predictable, which is good for code.
    # - top_p=0.95: Nucleus sampling. The model considers only the most likely tokens
    #   that make up a cumulative probability of 95%, filtering out less likely options.
    # - do_sample=True: This must be enabled to use temperature and top_p for sampling.
    # - pad_token_id: A standard setting to handle variable-length sequences.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.2,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the full output: The `decode` function converts the numerical tokens
    # generated by the model back into a human-readable string.
    # `skip_special_tokens=True` removes any special tokens like [EOS] (end of sequence).
    full_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Slice the prompt out of the result: The model's output includes the original
    # prompt. We remove it by slicing the string to get only the newly generated part.
    completion = full_code[len(prompt):]
    return completion

print("Inference function is defined.")


# ==============================================================================
# SAMPLE PROBLEMS AND INFERENCE
# ==============================================================================

# We define a list of dictionaries to hold our sample problems.
# This structure makes it easy to add new problems or modify existing ones.
# Each dictionary contains a 'title' for display and a 'prompt' which is the
# actual code stub that will be fed to the model.
sample_problems = [
    {
        "title": "Problem 1: Check for Prime Number",
        "prompt": '''
def is_prime(n):
    """
    Check if a number is a prime number. A prime number is a natural
    number greater than 1 that has no positive divisors other than 1 and itself.
    Return True if the number is prime, otherwise return False.
    """
'''
    },
    {
        "title": "Problem 2: Reverse a String",
        "prompt": '''
def reverse_string(s):
    """
    Takes a string `s` as input and returns the string reversed.
    For example, reverse_string("hello") should return "olleh".
    """
'''
    },
    {
        "title": "Problem 3: Find the Nth Fibonacci Number",
        "prompt": '''
def fibonacci(n):
    """
    Return the nth Fibonacci number. The Fibonacci sequence starts
    with 0 and 1. The next number is the sum of the two preceding ones.
    fibonacci(0) = 0, fibonacci(1) = 1, fibonacci(2) = 1, fibonacci(3) = 2, ...
    """
'''
    },
    {
        "title": "Problem 4: Check for Palindrome",
        "prompt": '''
def is_palindrome(text):
    """
    Checks if a given string is a palindrome. A palindrome is a word,
    phrase, or sequence that reads the same backward as forward,
    e.g., "madam" or "racecar". The check should be case-insensitive.
    """
'''
    }
]

# --- Run Inference on All Sample Problems ---
print("\n" + "="*50)
print("Starting Inference on Sample Problems...")
print("="*50 + "\n")

# We loop through each problem defined in our `sample_problems` list.
for i, problem in enumerate(sample_problems):
    print(f"--- {problem['title']} ---\n")

    # We use .strip() to remove any leading/trailing whitespace from the prompt
    # which can sometimes affect the model's output.
    prompt = problem['prompt'].strip()

    # This is where we call our main inference function to get the model's completion.
    generated_code = get_code_completion(prompt)

    # Finally, we print the results in a clear, readable format to show exactly
    # what was given to the model (the prompt) and what it produced (the completion).
    print("PROMPT (Input to Model):")
    print(prompt)
    print("\nMODEL'S GENERATED CODE (Output):")
    print(generated_code)
    print("\n" + "="*50 + "\n")

Inference function is defined.

Starting Inference on Sample Problems...

--- Problem 1: Check for Prime Number ---

PROMPT (Input to Model):
def is_prime(n):
    """
    Check if a number is a prime number. A prime number is a natural
    number greater than 1 that has no positive divisors other than 1 and itself.
    Return True if the number is prime, otherwise return False.
    """

MODEL'S GENERATED CODE (Output):

    if n == 1:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    for i in range(3, int(math.sqrt(n)) + 1, 2):
        if n % i == 0:
            return False
    return True

def is_prime_number(n):
    """
    Check if a number is a prime number. A prime number is a natural
    number greater than 1 that has no positive divisors other


--- Problem 2: Reverse a String ---

PROMPT (Input to Model):
def reverse_string(s):
    """
    Takes a string `s` as input and returns the string reversed.
    For example, reverse_str