In [None]:
# ============================================================================
# Util code : Restart the kernel
# ============================================================================
import IPython
#IPython.Application.instance().kernel.do_shutdown(True)

In [2]:
# ============================================================================
# Install Required Libraries
# ============================================================================
!pip install -q --upgrade bitsandbytes accelerate transformers

In [3]:
# ============================================================================
# Import Libraries
# ============================================================================
import torch
import json
from IPython.display import Markdown, display, update_display
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig


# Mount Google Drive so that files under MyDrive can be read and written
# through the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

import os
# List everything in the working folder (notebooks as well as data files);
# as the last expression of the cell, the listing is shown as the cell output.
notebook_folder = '/content/drive/MyDrive/Colab_Notebooks'
os.listdir(notebook_folder)


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


['02 Text Preprocessing - Exercise 2.ipynb',
 'dataset.csv',
 'FeatureExtraction_BoW.ipynb',
 'Copy of Fine-tune Llama 2 in Google Colab.ipynb',
 'IR_pipeline.ipynb',
 'IR_pipeline_colab.ipynb',
 'denver_extract.mp3',
 'starcoder2-3b.ipynb',
 'Dataset.json',
 'starcoder2-7b 0.2 .ipynb',
 'Output.json']

In [4]:
# ============================================================================
# Util Function : Track the consumption of GPU memory
# ============================================================================
def display_gpu_memory():
    """Print a GPU memory report in GB, or a notice when CUDA is unavailable.

    The report lists the memory currently allocated and reserved through
    ``torch.cuda``, the total memory of device 0, the difference between
    total and allocated (labelled "Free"), and allocated as a percentage
    of total.
    """
    # Guard clause: nothing to measure without a CUDA device.
    if not torch.cuda.is_available():
        print("CUDA is not available")
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Free" is measured against allocated memory, not reserved memory.
    free = total - allocated

    rule = "=" * 50
    report_lines = [
        rule,
        "GPU Memory Usage:",
        rule,
        f"Allocated: {allocated:.2f} GB",
        f"Reserved:  {reserved:.2f} GB",
        f"Free:      {free:.2f} GB",
        f"Total:     {total:.2f} GB",
        f"Usage:     {(allocated/total)*100:.1f}%",
        rule,
    ]
    print("\n".join(report_lines))



In [5]:
display_gpu_memory()

GPU Memory Usage:
Allocated: 0.00 GB
Reserved:  0.00 GB
Free:      14.74 GB
Total:     14.74 GB
Usage:     0.0%


In [None]:
# ============================================================================
# Load StarCoder2 Model
# ============================================================================

# Quantization config - lets the model be loaded in 4-bit so it uses less GPU memory
quant_config = BitsAndBytesConfig(
    load_in_4bit=True, # load the weights in 4-bit for a large memory saving
    bnb_4bit_use_double_quant=True, # double quantization for a bit more memory saving
    bnb_4bit_compute_dtype=torch.bfloat16, # dtype used for computation on the quantized weights
    bnb_4bit_quant_type="nf4" # "nf4" quantization type; minor performance improvement
)

# Run settings: which checkpoint to load, whether to quantize it, and the
# generation budget (max_new_tokens is read later by generate()).
starCoder2 = "bigcode/starcoder2-7b"
model_name = starCoder2
quant=True
max_new_tokens=80

# Load tokenizer; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load model: quantized with automatic device placement when `quant` is set,
# otherwise unquantized and moved to the GPU explicitly.
if quant:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto"  # Better than .to("cuda") for quantized models
    )
else:
    model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

# Setup streamer
#streamer = TextStreamer(tokenizer, skip_special_tokens=True)



In [None]:
# ============================================================================
# Define the Prompt Template
# ============================================================================
# generate() assembles the final prompt as:
#   SYSTEM_PROMPT + newline + <code snippet> + blank line + ASSISTANT_PRIMER
# so SYSTEM_PROMPT ends at the "Input" header and ASSISTANT_PRIMER opens the
# "Response" section that the model is expected to continue.
SYSTEM_PROMPT = '''
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

## Instruction:
Summarize the purpose of this C++ code in one or two sentences.

## Input:'''

ASSISTANT_PRIMER = """
## Response:
"""


In [None]:
# ============================================================================
# Generate Function
# ============================================================================
def generate(code_snippet):
  """Generate a model response for ``code_snippet``.

  The snippet is placed between SYSTEM_PROMPT and ASSISTANT_PRIMER, tokenized
  directly (StarCoder2 is used without a chat template) and sampled with
  temperature 0.2 / top-p 0.95 for at most ``max_new_tokens`` new tokens.

  Args:
    code_snippet: Text inserted as the "Input" section of the prompt.

  Returns:
    Only the newly generated text (the prompt is not echoed back), reduced to
    what follows a trailing "Explanation:" marker if the model emitted one,
    cut at the first "---", and stripped of surrounding whitespace.
  """
  # Format prompt directly for code completion
  full_prompt = f"{SYSTEM_PROMPT}\n{code_snippet}\n\n{ASSISTANT_PRIMER}"

  # Tokenize once and move the tensors to wherever the model was placed,
  # instead of assuming a fixed "cuda" device. The tokenizer's own attention
  # mask is used rather than a hand-built one.
  encoded = tokenizer(full_prompt, return_tensors="pt").to(model.device)
  input_ids = encoded.input_ids
  attention_mask = encoded.attention_mask

  outputs = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=max_new_tokens,
      temperature=0.2,
      top_p=0.95,
      do_sample=True,  # Required when using temperature
      pad_token_id=tokenizer.eos_token_id
  )

  # The returned sequence starts with the prompt tokens. Slice them off so the
  # prediction holds the continuation only; splitting the decoded text on a
  # marker that never occurs in the prompt used to return the whole prompt.
  prompt_length = input_ids.shape[-1]
  generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

  # Keep the original post-processing: drop an "Explanation:" lead-in and stop
  # at a "---" separator if the model produced either.
  prediction = generated_text.split("Explanation:")[-1].split("---")[0].strip()

  return prediction




In [None]:
# ============================================================================
# Example Usage
# ============================================================================
# Banner marking that the model is loaded and the demo is about to run.
print("\n" + "="*80)
print("StarCoder2 Code Analyzer Ready!")
print("="*80)

# Example 1: a small C++ program using sqrt() and pow() from <cmath>.
example_code_1 = """
#include <iostream>
#include <cmath>
using namespace std;

int main() {
    double num = 16.0;
    cout << "Square root of " << num << " is " << sqrt(num) << endl;
    cout << "2 raised to power 3 is " << pow(2, 3) << endl;
    return 0;
}
"""
#generate(example_code_1)
# Render the model's answer as Markdown in the cell output.
display(Markdown(generate(example_code_1)))

# Report GPU memory after the generation call.
print("\n" + "="*80)
display_gpu_memory()

In [None]:
def create_benchmark_prompt(obj):
    """Build a three-shot prompt that ends with an open response slot for ``obj``.

    The prompt starts with a short task preamble, followed by three worked
    demonstrations (instruction / input / response) and finally the test case
    taken from ``obj``, whose response section is left empty for the model to
    complete.

    Args:
        obj: Mapping with the keys ``'instruction'`` and ``'input'``.

    Returns:
        The complete prompt as a single string.
    """
    # Demonstrations teach the BASE model the pattern, so all three use the
    # same layout: plain-text responses and a blank line between blocks.
    # The backslash in the first C++ example is doubled so the prompt shows the
    # C++ escape sequence itself instead of a raw line break inside the literal.
    few_shot_examples = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
Summarize the purpose of this C++ code in one or two sentences.

### Input:
#include <iostream>
using namespace std;

int main(){
    cout << "Hello world!\\n";
    return 0;
}

### Response:
This program includes the standard input-output library and prints the message "Hello world!" to the console, then terminates successfully.

### Instruction:
Summarize the purpose of this C++ code in one or two sentences.

### Input:
#include <iostream>
using namespace std;

int main(){
    int a = 10, b = 20;
    int sum = a + b;
    cout << "Sum: " << sum << endl;
    return 0;
}

### Response:
This program declares two integer variables, adds them together, and displays the result to the console. It demonstrates basic arithmetic operations and output formatting in C++.

### Instruction:
Summarize the purpose of this C++ code in one or two sentences.

### Input:
#include <iostream>
using namespace std;

int main(){
    for(int i = 1; i <= 5; i++){
        cout << i << " ";
    }
    cout << endl;
    return 0;
}

### Response:
This program uses a for loop to iterate from 1 to 5 and prints each number separated by spaces. It demonstrates basic loop control structure in C++.

"""

    # The test case follows the same layout; the response is left open.
    test_case = f"""### Instruction:
{obj['instruction']}

### Input:
{obj['input']}

### Response:
"""
    return few_shot_examples + test_case

In [None]:
# Read the benchmark items from Drive
dataset_path = '/content/drive/MyDrive/Colab_Notebooks/Dataset.json'
with open(dataset_path, 'r') as source_file:
    dataset = json.load(source_file)

results = []

# Inference loop: one prediction per remaining benchmark item.
# NOTE(review): generate() wraps its argument in SYSTEM_PROMPT and
# ASSISTANT_PRIMER, so the few-shot prompt built here ends up nested inside a
# second instruction block - confirm that this is intended.
for record in dataset:
    # Items with id 1-3 are skipped: the first 3 data are used in the prompt.
    if not (record["id"] > 3):
        continue

    prediction = generate(create_benchmark_prompt(record))

    outcome = {
        "id": record["id"],
        "input": record["input"],
        "expected": record["output"],
        "actual": prediction
    }
    results.append(outcome)
    print(f"Processed ID: {record['id']}")

# Persist all collected results to Drive in one go
output_path = '/content/drive/MyDrive/Colab_Notebooks/Output.json'
with open(output_path, 'w') as target_file:
    json.dump(results, target_file, indent=4)

print("Success! Output.json has been saved to your Drive.")

In [None]:
import json
import pandas as pd
from IPython.display import display, HTML

# 1. Load the results written by the inference cell (a JSON list of records
#    with the keys id / input / expected / actual)
file_path = '/content/drive/MyDrive/Colab_Notebooks/Output.json'
with open(file_path, 'r') as f:
    results = json.load(f)

# One row per benchmark record, one column per key.
df = pd.DataFrame(results)

# 2. CLEANING LOGIC: Isolate only the actual response
def extract_clean_response(text):
    """Return only the model's answer from a raw generation string.

    Everything up to and including the last '### Response:' marker is dropped
    (the text is kept whole when the marker is absent), the remainder is cut
    at the next '###', and surrounding whitespace is stripped.
    """
    # rpartition yields the full text as its last element when the marker is
    # missing, so no explicit membership check is needed.
    after_marker = text.rpartition("### Response:")[2]

    # Anything from the next "###" onwards is the start of a new block.
    answer, _, _ = after_marker.partition("###")

    return answer.strip()

# Apply the cleaning to the 'actual' column so it holds only the model's answer
df['actual'] = df['actual'].apply(extract_clean_response)

# 3. Add a word count (whitespace-separated tokens) for monitoring conciseness
df['word_count'] = df['actual'].apply(lambda x: len(x.split()))

# 4. Display the table
print("--- MODEL BENCHMARK RESULTS (CLEANED) ---")
# Using display(HTML(...)) to make it readable in Colab
display(HTML(df[['id', 'expected', 'actual', 'word_count']].to_html()))

# 5. Save as CSV (all columns, without the DataFrame index)
csv_path = '/content/drive/MyDrive/Colab_Notebooks/Benchmark_Report_Clean.csv'
df.to_csv(csv_path, index=False)
# The status message previously contained a mis-decoded check mark.
print(f"\n✅ Clean report saved to: {csv_path}")