# Using the default pre-trained model

Install requirements

In [None]:
# Install or upgrade the third-party packages needed by this section.
# NOTE(review): no versions are pinned here, so a fresh runtime may resolve
# different releases on each run — consider pinning once a working set is known.
%pip install -U transformers accelerate bitsandbytes sentencepiece

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 95, in resolve
    result = self._result = resolver.resolve(
                            ^^^^^^^^^^^^^^^^^
  File "/u

Import dependencies

In [None]:
from google.colab import userdata
from huggingface_hub import login as huggingface_hub_login
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import re
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import os


KeyboardInterrupt: 

Log in to Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub using the 'hugging_face_token' entry
# read through google.colab's userdata helper, so no token is stored in the notebook.
huggingface_hub_login(userdata.get('hugging_face_token'))


Configure 4-bit quantization for balancing accuracy and speed

In [None]:
# 4-bit loading options; this object is handed to from_pretrained() when the
# model is loaded further down.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit form
    bnb_4bit_quant_type="nf4",             # 4-bit quantization type
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

Load CodeLlama from Hugging Face

In [None]:
model_name = "meta-llama/CodeLlama-13B-Instruct-hf"

# Fetch the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# When the tokenizer defines no padding token, reuse the EOS token for padding
# to suppress warnings.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with the 4-bit quantization settings and automatic device
# placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# Show the resulting device mapping.
print(model.hf_device_map)

Model evaluation function

In [None]:
def assess_perf_regression(commit_data, perf_test, max_length=4096):
    """Ask the model how likely a pull request is to cause a performance regression.

    The pull request text and the performance-test description are inserted
    into an [INST] ... [/INST] prompt, the prompt is tokenized, and a short
    completion is generated with the module-level ``tokenizer`` and ``model``.

    Args:
        commit_data: Pull request description and diff, as text.
        perf_test: Description/source of the performance test, as text.
        max_length: Maximum number of prompt tokens passed to the model.

    Returns:
        The decoded output sequence: the prompt text followed by the generated
        tokens. Callers take the text after "[/INST]" as the answer.

    Notes:
        Tokenizer truncation removes tokens from the end of the prompt, which
        is where the closing [/INST] marker sits. To keep the marker, an
        over-long prompt is shortened by trimming ``commit_data`` before
        tokenizing. If the prompt is still too long after ``commit_data`` has
        been trimmed to nothing, tokenizer truncation applies as a last resort
        and a warning is printed.
        Generation samples (do_sample=True), so repeated calls may differ.
    """
    prompt_template = """
    [INST]
    I have given you a pull request with its description and changeset.
    I have also given you information about a performance test that Mozilla runs to measure performance for its firefox browser.
    Analyze these and tell me whether this pull request will cause a performance regression or not.
    Provide a likelihood on a scale of 0 to 100% that this pull request will cause a performance regression. Only give a number and don't give explanation.

    Pull Request:
    {commit_data}

    performance test:
    {perf_test}

    ### Response:
    [/INST]
    """

    prompt = prompt_template.format(commit_data=commit_data, perf_test=perf_test)

    # Count the prompt tokens once (no truncation) so overflow can be detected.
    token_count = len(tokenizer(prompt)["input_ids"])
    print(f"Token count: {token_count}")

    if token_count > max_length:
        # Re-tokenizing a decoded prefix inside the prompt may not reproduce the
        # exact same token count, so leave a small safety margin.
        trim_margin = 8
        overflow = token_count - max_length
        commit_ids = tokenizer(str(commit_data), add_special_tokens=False)["input_ids"]
        keep = max(len(commit_ids) - overflow - trim_margin, 0)
        trimmed_commit = tokenizer.decode(commit_ids[:keep], skip_special_tokens=True)
        prompt = prompt_template.format(commit_data=trimmed_commit, perf_test=perf_test)
        print(f"Prompt exceeded {max_length} tokens; pull request text trimmed to {keep} tokens")
        if keep == 0:
            print("Warning: prompt may still be truncated; the closing [/INST] marker can be lost")

    # Tokenize the prompt and move it to the device the model expects its inputs on.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_length
    ).to(model.device)

    # Generate model output
    outputs = model.generate(
        **inputs,
        max_new_tokens=4,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode the whole output sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


Sample input data

In [None]:
# Example pull request (description plus code diff) and the performance test it is assessed against
commit_data = """
pr description: Save self-hosted.js bytecode in StartupCache r=nbp,denispal

This spidermonkey-internal code is already shared betwen processes but we can
extend this also preserve accross restarts. This is particularly helpful on
Android where the parent process cycles more frequently.

The startupcache is available before the JS engine starts so we use that instead
of the script-preloader which requires the JS engine to be initialized already.

code diff:
--- a/js/public/Initialization.h
+++ b/js/public/Initialization.h
@@ -113,17 +113,16 @@ inline bool JS_FrontendOnlyInit(void) {
  */
 inline bool JS_IsInitialized(void) {
   return JS::detail::libraryInitState >= JS::detail::InitState::Running;
 }

 namespace JS {

 // Reference to a sequence of bytes.
-// TODO: This type should be Span<cont uint8_t> (Bug 1709135)
 using SelfHostedCache = mozilla::Span<const uint8_t>;

 // Callback function used to copy the SelfHosted content to memory or to disk.
 using SelfHostedWriter = bool (*)(JSContext*, SelfHostedCache);

 /*
  * Initialize the runtime's self-hosted code. Embeddings should call this
  * exactly once per runtime/context, before the first JS_NewGlobalObject
--- a/js/xpconnect/src/XPCJSContext.cpp
+++ b/js/xpconnect/src/XPCJSContext.cpp
@@ -31,16 +31,17 @@
 #ifdef FUZZING
 #  include "mozilla/StaticPrefs_fuzzing.h"
 #endif
 #include "mozilla/StaticPrefs_dom.h"
 #include "mozilla/StaticPrefs_browser.h"
 #include "mozilla/StaticPrefs_javascript.h"
 #include "mozilla/dom/ScriptSettings.h"
 #include "mozilla/glean/JsXpconnectMetrics.h"
+#include "mozilla/scache/StartupCache.h"

 #include "nsContentUtils.h"
 #include "nsCCUncollectableMarker.h"
 #include "nsCycleCollectionNoteRootCallback.h"
 #include "nsCycleCollector.h"
 #include "nsJSEnvironment.h"
 #include "jsapi.h"
 #include "js/ArrayBuffer.h"
@@ -1139,18 +1140,32 @@ class HelperThreadTaskHandler : public T
  private:
   ~HelperThreadTaskHandler() = default;
 };

 static void DispatchOffThreadTask(JS::HelperThreadTask* aTask) {
   TaskController::Get()->AddTask(MakeAndAddRef<HelperThreadTaskHandler>(aTask));
 }

+// Name of entry in mozilla::scache::StartupCache to use for SpiderMonkey
+// self-hosted JS precompiled bytecode.
+static constexpr char kSelfHostCacheKey[] = "js.self-hosted";
+
 static bool CreateSelfHostedSharedMemory(JSContext* aCx,
                                          JS::SelfHostedCache aBuf) {
+  // Record the data to the "StartupCache" for future restarts to use to
+  // initialize the shmem with.
+  if (auto* sc = scache::StartupCache::GetSingleton()) {
+    UniqueFreePtr<char[]> copy(static_cast<char*>(malloc(aBuf.LengthBytes())));
+    if (copy) {
+      memcpy(copy.get(), aBuf.Elements(), aBuf.LengthBytes());
+      sc->PutBuffer(kSelfHostCacheKey, std::move(copy), aBuf.LengthBytes());
+    }
+  }
+
   auto& shm = xpc::SelfHostedShmem::GetSingleton();
   MOZ_RELEASE_ASSERT(shm.Content().IsEmpty());
   // Failures within InitFromParent output warnings but do not cause
   // unrecoverable failures.
   shm.InitFromParent(aBuf);
   return true;
 }

@@ -1347,28 +1362,39 @@ nsresult XPCJSContext::Initialize() {
 #endif

   // Initialize the MIME type used for the bytecode cache, after calling
   // SetProcessBuildIdOp and loading JS prefs.
   if (!nsContentUtils::InitJSBytecodeMimeType()) {
     NS_ABORT_OOM(0);  // Size is unknown.
   }

-  // When available, set the self-hosted shared memory to be read, so that we
-  // can decode the self-hosted content instead of parsing it.
+  // The self-hosted bytecode can be shared with child processes and also stored
+  // in startupcache. Only the parent process may initialize the data.
   auto& shm = xpc::SelfHostedShmem::GetSingleton();
-  JS::SelfHostedCache selfHostedContent = shm.Content();
   JS::SelfHostedWriter writer = nullptr;
   if (XRE_IsParentProcess() && sSelfHostedUseSharedMemory) {
-    // Only the Parent process has permissions to write to the self-hosted
-    // shared memory.
-    writer = CreateSelfHostedSharedMemory;
+    // Check the startup cache for a copy of the bytecode.
+    if (auto* sc = scache::StartupCache::GetSingleton()) {
+      const char* buf = nullptr;
+      uint32_t len = 0;
+      if (NS_SUCCEEDED(sc->GetBuffer(kSelfHostCacheKey, &buf, &len))) {
+        shm.InitFromParent(AsBytes(mozilla::Span(buf, len)));
+      }
+    }
+
+    // If we have no data then the InitSelfHostedCode call below will parse from
+    // scratch and invoke this callback with the results. That callback data can
+    // then be used in initialize cache and SelfHostedShmem.
+    if (shm.Content().IsEmpty()) {
+      writer = CreateSelfHostedSharedMemory;
+    }
   }

-  if (!JS::InitSelfHostedCode(cx, selfHostedContent, writer)) {
+  if (!JS::InitSelfHostedCode(cx, shm.Content(), writer)) {
     // Note: If no exception is pending, failure is due to OOM.
     if (!JS_IsExceptionPending(cx) || JS_IsThrowingOutOfMemory(cx)) {
       NS_ABORT_OOM(0);  // Size is unknown.
     }

     // Failed to execute self-hosted JavaScript! Uh oh.
     MOZ_CRASH("InitSelfHostedCode failed");
   }
--- a/startupcache/StartupCache.cpp
+++ b/startupcache/StartupCache.cpp
@@ -446,17 +446,16 @@ nsresult StartupCache::GetBuffer(const c
   // Track that something holds a reference into mTable, so we know to hold
   // onto it in case the cache is invalidated.
   mCurTableReferenced = true;
   *outbuf = value.mData.get();
   *length = value.mUncompressedSize;
   return NS_OK;
 }

-// Makes a copy of the buffer, client retains ownership of inbuf.
 nsresult StartupCache::PutBuffer(const char* id, UniqueFreePtr<char[]>&& inbuf,
                                  uint32_t len) MOZ_NO_THREAD_SAFETY_ANALYSIS {
   NS_ASSERTION(NS_IsMainThread(),
                "Startup cache only available on main thread");
   if (StartupCache::gShutdownInitiated) {
     return NS_ERROR_NOT_AVAILABLE;
   }

"""


perf_test = """
source: displaylist_mutate.html
<html>
<head>
<style>
  div {
    width:10px;
    height:10px;
    background-color:green;
    display: inline-block;
  }
</style>
</head>
<body id="body">
</body>
<script>

var start = null;
var divCount = 10000;
var maxIterations = 600;

// ensure contentful paint occurs
document.body.innerHTML = "DisplayList mutate";

for (var i = 0; i < divCount; i++) {
  var div = document.createElement("div");
  div.id = i;
  document.getElementById("body").appendChild(div);
}

var iteration = 0;
function runFrame() {
  if (document.getElementById(iteration).style.backgroundColor == "red") {
    document.getElementById(iteration).style.backgroundColor = "green";
  } else {
    document.getElementById(iteration).style.backgroundColor = "red";
  }
  iteration++;
  iteration = iteration % divCount;
  if (--maxIterations == 0) {
    var end = performance.now();
    if (window.tpRecordTime) {
      window.tpRecordTime(end - start, start);
    }
    return;
  }

  window.requestAnimationFrame(runFrame);
}

function startTest() {
  start = performance.now();
  window.requestAnimationFrame(runFrame);
}

addEventListener("load", function() {
  if (window.TalosContentProfiler) {
    TalosContentProfiler.subtestStart("displaylist_mutate.html loaded", true).then(startTest);
  } else {
    startTest();
  }
});
</script>
<script type="text/javascript" src="resource://talos-powers/TalosContentProfiler.js"></script>
</html>



type: Page load

data: we load the displaylist_mutate.html page, measuring pageload.

description:
This measures the amount of time it takes to render a page after changing its display list.
The page has a large number of display list items (10,000), and mutates one every frame.
The goal of the test is to make displaylist construction a bottleneck,
rather than painting or other factors, and thus improvements or regressions to displaylist construction will be visible.
The result is how quickly the test was able to mutate and re-paint 600 items, one during each frame.
"""


Example usage

In [None]:
# Get perf regression risk assessment
prediction = assess_perf_regression(commit_data, perf_test)

# The returned text is split on the closing [/INST] marker and the part after
# it is shown as the answer. Indexing with [-1] falls back to the full text
# when the marker is missing instead of raising IndexError.
print("Model Prediction:", prediction.split("[/INST]", 1)[-1].strip())


# Supervised Fine-Tuning

Install requirements

In [None]:
# Install the packages used by the fine-tuning section (-q keeps pip's output quiet).
# NOTE(review): versions are unpinned — consider pinning for reproducible runs.
%pip install -q transformers accelerate bitsandbytes peft torch datasets

Import dependencies

In [None]:
import os

import torch
from datasets import load_dataset
from google.colab import drive, userdata
from huggingface_hub import login as huggingface_hub_login
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

Log into Hugging Face

In [None]:
# Read the access token through google.colab's userdata helper rather than
# keeping a token literal in the notebook, then authenticate with the Hub.
hugging_face_token = userdata.get('hugging_face_token')

huggingface_hub_login(hugging_face_token)

Define model and tokenizer

In [None]:
model_name = "codellama/CodeLlama-7b-Instruct-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Only fall back to the EOS token when the tokenizer has no padding token of its
# own, matching how the tokenizer is prepared in the first section.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Load and format the training dataset

In [None]:
# JSON file holding the training records (instruction / input / output fields).
TRAIN_DATA_PATH = "/content/drive/MyDrive/risk_data/perf_reg_training.json"

# The training data and the output directories used later live on Google Drive.
# Mount it when this runtime has not done so yet, so the notebook also works
# after a restart.
if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

dataset = load_dataset("json", data_files=TRAIN_DATA_PATH)

def preprocess_data(example, max_length=512):
    """Turn one instruction record into a fixed-length causal-LM training example.

    The prompt and the expected answer are placed in ONE token sequence
    (prompt, then answer, then EOS) so that ``labels`` line up position by
    position with ``input_ids``. Prompt and padding positions are set to -100
    in ``labels`` so only the answer tokens (including EOS) contribute to the
    loss.

    Args:
        example: Mapping with "instruction", "input" and "output" text fields.
        max_length: Length every returned list is padded or cut to.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length`` integers.
    """
    instruction = example["instruction"]
    input_text = example["input"]
    output_text = example["output"]

    prompt = f"[INST] {instruction}\n{input_text} [/INST]"
    response = f"{output_text}"

    # Answer tokens without extra special tokens, closed by EOS.
    response_ids = tokenizer(response, add_special_tokens=False)["input_ids"]
    response_ids = (response_ids + [tokenizer.eos_token_id])[:max_length]

    # Shorten the prompt (not the answer) when the pair does not fit, so there
    # is always something left to compute the loss on.
    prompt_ids = tokenizer(prompt)["input_ids"][: max_length - len(response_ids)]

    input_ids = prompt_ids + response_ids
    labels = [-100] * len(prompt_ids) + response_ids
    attention_mask = [1] * len(input_ids)

    # Pad on the right. Padding is masked through the attention mask and -100
    # labels, so it does not matter that the pad token may equal EOS.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    pad_count = max_length - len(input_ids)
    input_ids = input_ids + [pad_id] * pad_count
    attention_mask = attention_mask + [0] * pad_count
    labels = labels + [-100] * pad_count

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Run the preprocessing over every record and drop the raw text columns so only
# the fields returned by preprocess_data remain.
formatted_dataset = dataset.map(
    preprocess_data,
    remove_columns=["instruction", "input", "output"],
)

# Show the dataset summary, then the first processed training example.
print(formatted_dataset)
print(formatted_dataset["train"][0])


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
{'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

Load the pre-trained CodeLlama model and attach LoRA adapters

In [None]:
# Load the base model in 4-bit with automatic device placement. The setting is
# passed as a BitsAndBytesConfig because the bare `load_in_4bit` argument to
# from_pretrained() is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# LoRA settings: rank-8 adapters on the q_proj and v_proj modules, with
# alpha 16, dropout 0.1 and no bias terms, for a causal-LM task.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the loaded model according to the LoRA configuration.
model = get_peft_model(model, lora_config)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Define training arguments

In [None]:
training_args = TrainingArguments(
    # Checkpointing: save once per epoch under the Drive output directory and
    # do not push anything to the Hub.
    output_dir="/content/drive/MyDrive/trained_models/fine_tuned_codellama/training_output",
    save_strategy="epoch",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=True,
    # Logging interval and dataset column handling.
    logging_steps=10,
    remove_unused_columns=False,
)

Train the model

In [None]:
# Build the trainer from the LoRA-wrapped model, the training arguments and the
# formatted training split. The tokenizer is passed as `processing_class`
# because the `tokenizer` argument of Trainer is deprecated.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset["train"],
    processing_class=tokenizer
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malal[0m ([33malal-concordia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


TrainOutput(global_step=3, training_loss=14.887655893961588, metrics={'train_runtime': 13.0016, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.231, 'total_flos': 121865074900992.0, 'train_loss': 14.887655893961588, 'epoch': 3.0})

Save and load the fine-tuned model

In [None]:
# Directory the fine-tuned model is written to and read back from.
trained_model_dir = "/content/drive/MyDrive/trained_models/fine_tuned_codellama/trained_model"

trainer.save_model(trained_model_dir)

# Reload with 4-bit quantization and automatic device placement, the same way
# the model was loaded for training. Without these arguments the reload uses
# neither quantization nor automatic placement.
model = AutoModelForCausalLM.from_pretrained(
    trained_model_dir,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Test the fine-tuned model

In [None]:
test_prompt = "[INST] Fix the bug in this function\n\ndef add(a, b): return a - b [/INST]"

# Send the inputs to the device the model reports instead of assuming "cuda",
# so the inputs and the model are on the same device.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

# Pass the pad token id explicitly, as the first section's generate() call does.
output = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.pad_token_id)

print(tokenizer.decode(output[0], skip_special_tokens=True))