In [26]:
import os
import requests
from dotenv import load_dotenv
from huggingface_hub import login
from transformers.utils.quantization_config import BitsAndBytesConfig
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
from transformers.models.auto.tokenization_auto import AutoTokenizer
import gc
import torch


In [27]:
# Load variables from a .env file; override=True lets .env values replace ones already in the environment.
load_dotenv(override=True)
hf_token = os.getenv('HF_TOKEN')  # None when HF_TOKEN is not defined
# Authenticate this session with the Hugging Face Hub using that token.
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [28]:
# Checkpoint used for generation. Previously tried alternative (disabled):
# model_name = 'cosmo3769/starcoderbase-1b-GGUF'
model_name = 'bigcode/starcoder2-3b'

# Instruction preamble placed at the start of every prompt; adjacent literals
# are concatenated by the parser into a single string.
system_message = (
    "You are an assistant that reimplements Python code in high performance C++ for a windows machine. "
    "Respond only with C++ code; use comments sparingly and do not provide any explanations other than occassional comments. "
    "The C++ response needs to production an identical output in the fastest possible time."
)

In [29]:
def user_prompt_for(python):
    """Build the user-turn prompt that asks for a C++ port of some Python code.

    Args:
        python: Python source code as a string.

    Returns:
        The fixed instruction text, a blank line, then ``python`` verbatim.
    """
    instructions = (
        "Rewrite this python code in C++ with the fastest possible implementation that produces identical output in the least time. "
        "Respond on;y with C++ code; do not explain your work other than a few comments. "
        "Pay attention to number types to ensure no int overflows. "
        "Remember to #include all necessary C++ packages such as iomanip.\n\n"
    )
    return instructions + python

In [30]:
def message_for(python):
    """Assemble the two-turn chat message list for a code-conversion request.

    Args:
        python: Python source code as a string.

    Returns:
        A list of two dicts with "role" and "content" keys: the system turn
        carrying ``system_message``, then the user turn built by
        ``user_prompt_for``.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_prompt_for(python)}
    return [system_turn, user_turn]

In [31]:
# write to a file called optimized.cpp
def write_output(cpp):
    """Strip Markdown code-fence markers from ``cpp`` and save it as optimized.cpp.

    Args:
        cpp: Reply text from the model; may contain "```cpp" / "```" markers.

    The file is created (or overwritten) in the current working directory.
    """
    # Remove fence markers so the file holds only the source text.
    code = cpp.replace("```cpp", "").replace("```", "")
    # UTF-8 is set explicitly so non-ASCII characters in the reply do not
    # depend on the platform's default encoding.
    with open("optimized.cpp", "w", encoding="utf-8") as f:
        # Write the cleaned text; writing the raw `cpp` would leave the fences in.
        f.write(code)

In [32]:
# Quantization settings passed to from_pretrained so the model is loaded in
# 4-bit form and occupies less memory.
quant_config = BitsAndBytesConfig(
    # Storage format: 4-bit weights, "nf4" quantization type, double quantization on.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # dtype used for computation.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [40]:
from transformers.generation.streamers import TextIteratorStreamer
import threading
import torch
import gc

def optimize_starcoder(python):
    """Generate a C++ port of ``python`` with the model in ``model_name`` and save it.

    Loads the tokenizer and the 4-bit quantized model, runs sampled generation
    on a worker thread, echoes the streamed text to stdout as it arrives, and
    hands the complete reply to ``write_output``. The tokenizer and model are
    loaded on every call and released before returning. Needs a CUDA device.

    Args:
        python: Python source code (string) to embed in the prompt.

    Returns:
        None. The reply is printed and written via ``write_output``.
    """
    print(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    # Use a plain string prompt for StarCoder2
    prompt = f"{system_message}\n\nRewrite this Python code in high-performance C++, also only provide C++ code noting more, nothing less :\n\n{python}\n\n// C++ implementation:\n"

    # Bound up front so the cleanup in `finally` is valid no matter where a failure occurs.
    inputs = streamer = model = generation_kwargs = thread = None
    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            return_attention_mask=True
        ).to("cuda")

        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cuda",
            quantization_config=quant_config,
            torch_dtype=torch.float16,
        )

        generation_kwargs = {
            "input_ids": inputs.input_ids,
            # Use the mask produced by the tokenizer. Rebuilding it as
            # `input_ids != pad_token_id` would also mask any genuine EOS
            # token in the prompt, because pad and EOS share one id here.
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": 512,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }

        # generate() blocks until it finishes, so it runs on a worker thread
        # while this thread consumes the streamer.
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        reply = ""
        for text in streamer:
            print(text, end="", flush=True)
            reply += text

        # Make sure generation has fully finished before persisting the reply.
        thread.join()
        write_output(reply)
    finally:
        # Clean up on every exit path (including errors and interrupts): drop
        # this frame's references, collect cyclic garbage, then release the
        # cached CUDA memory that the collection made unused.
        tokenizer = streamer = model = inputs = generation_kwargs = thread = None
        gc.collect()
        torch.cuda.empty_cache()

In [41]:
# Benchmark program kept as source text so it can be both exec'd here and
# embedded in the model prompt. It sums an alternating series over
# 100,000,000 iterations, multiplies by 4 (an approximation of pi), and prints
# the result and the elapsed wall-clock time.
pi = """
import time

def calculate(iterations, param1, param2):
    result = 1.0
    for i in range(1, iterations+1):
        j = i * param1 - param2
        result -= (1/j)
        j = i * param1 + param2
        result += (1/j)
    return result

start_time = time.time()
result = calculate(100_000_000, 4, 1) * 4
end_time = time.time()

print(f"Result: {result:.12f}")
print(f"Execution Time: {(end_time - start_time):.6f} seconds")
"""

In [42]:
# Run the Python benchmark in this kernel to get the reference result and timing.
exec(pi)

Result: 3.141592658589
Execution Time: 9.848153 seconds


In [43]:
# Ask the model for a C++ version of the benchmark; the reply is streamed below and saved to optimized.cpp.
optimize_starcoder(pi)

bigcode/starcoder2-3b

#include <iostream>

double calculate(const int iterations, const int param1, const int param2) {
    double result = 1.0;
    for (int i = 1; i <= iterations; i++) {
        const int j = i * param1 - param2;
        result -= 1.0 / j;
        j = i * param1 + param2;
        result += 1.0 / j;
    }
    return result;
}

int main() {
    double result = calculate(100000000, 4, 1) * 4;
    std::cout << "Result: " << result << std::endl;
}


// Expected output:

// Result: 1.2463054572948753
// Execution Time: 0.000010 seconds


// You can use any language that you like to write your code. You can use a language that is not covered in the course.

// You can use any compiler, tool, library, etc. to compile your code.

// You can use any language that you like to test your code.

// You can use any language that you like to test your code.

// You can use any library, tool, etc. to test your code.

// You can use any test framework, tool, etc. to test your code.



In [4]:
# Compile the generated C++ with optimizations for the local CPU, then run the resulting executable.
# NOTE(review): the second command runs even if g++ fails, so a stale optimized.exe
# from an earlier run may be what executes — confirm the output matches the current optimized.cpp.
!g++ -O3 -std=c++17 -march=native -o optimized.exe optimized.cpp
!optimized.exe

Result: 3.14159


Note: you may need to restart the kernel to use updated packages.


d:\Codes\Hackathon\llm_engineering\.venv\Scripts\python.exe: No module named uv
