In [1]:
pip install accelerate peft bitsandbytes transformers trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
from datasets import Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

def preprocess_example(example):
    example_dict = {
        "srno": example.get("srno", None),
        "nl_command": example.get("nl_command", None),
        "bash_code": example.get("bash_code", None),
    }
    text = f"[INST] Docstring: {example_dict['nl_command']} [/INST] Code: {example_dict['bash_code']}"
    return {"text": text}

def load_json_as_dataset(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return Dataset.from_list(data)

def finetune_llama_v2():
    # Load your dataset
    train_dataset = load_json_as_dataset("/content/drive/MyDrive/majorproj/data_train.json")

    # Preprocess the dataset
    train_data = train_dataset.map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])

    tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        "codellama/CodeLlama-7b-hf", quantization_config=bnb_config, device_map="auto"
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

    training_arguments = TrainingArguments(
        output_dir="codellama2-finetuned-nl2bash",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=1,
        max_steps=100,
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=512
    )

    trainer.train()
    return model, tokenizer

if __name__ == "__main__":
    ft_mode, og_tokenizer = finetune_llama_v2()

Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,2.3549
20,1.5668
30,1.3731
40,1.3085
50,1.2561
60,1.2766
70,1.2645
80,1.2106
90,1.2156
100,1.2708


In [5]:
ft_model = ft_mode
def run_inference(input_text):

    # Tokenize the user input
    input_ids = og_tokenizer(input_text, return_tensors="pt").input_ids

    # Feed the tokenized input into the model for inference
    output_ids = ft_model.generate(input_ids, max_length=200, num_return_sequences=1, temperature=0.7)

    # Decode the output tokens to generate the predicted bash command
    output_text = og_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text

if __name__ == "__main__":
    # Example usage
    nl_command = 'List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

    # Prepare the input for the model
    input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"
    predicted_bash_command = run_inference(input_text)
    print("Predicted Bash Command:", predicted_bash_command)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Predicted Bash Command: [INST] Docstring: List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order. [/INST] Code: ps -eo pid,user,cmd,rss | awk '$4 > 100000 {print $0}' | sort -k 4 -nr | head -n 10 | column -t | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's


In [6]:
# Define the natural language command
nl_command = 'Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

# Prepare the input for the model
input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"

# Tokenize the input
inputs = og_tokenizer(nl_command, return_tensors="pt").to(ft_model.device)

# Generate the output
outputs = ft_model.generate(**inputs, max_length=150)

# Decode the output
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Bash Command:")
print(generated_code)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Bash Command:
Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.

\begin{code}
ps -eo pid,user,pri,vsz,args | awk '$4>100000 {print $0}' | sort -k 4 -r
\end{code}

Comment: I'm not sure what you mean by "sort them by memory usage in descending order".  Do you mean sort by memory usage in descending order?  Or do you mean sort by memory usage in ascending order?  Or do you mean sort by memory usage in descending order, but


In [9]:
test_dataset = load_json_as_dataset("/content/drive/MyDrive/majorproj/data_test.json")
test_data = test_dataset.map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])


Map:   0%|          | 0/2458 [00:00<?, ? examples/s]

In [10]:
sample_example = test_data[16]

import re

# Input string
input_string = sample_example['text']

# Regular expression pattern to extract instruction and code
pattern = r'\[INST\] Docstring: (.+?) \[/INST\] Code: (.+)'

# Match the pattern
match = re.match(pattern, input_string)

# Extract instruction and code
if match:
    nl_command = match.group(1)
    actual_code = match.group(2)
    print("nl_command =", nl_command)
    print("code =", actual_code)
else:
    print("No match found.")

nl_command = find the oldest normal file in the current directory
code = find -type f -printf '%T+ %p\n' | sort | head -n 1


In [11]:
input_text = f"Bash code for {nl_command}"
inputs = og_tokenizer(input_text, return_tensors="pt").to(ft_model.device)
outputs = ft_model.generate(**inputs, max_length=150)
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Input NL Command:", nl_command)
print("Predicted Bash Command:", generated_code)
print(f"actual_code: {actual_code}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input NL Command: find the oldest normal file in the current directory
Predicted Bash Command: Bash code for find the oldest normal file in the current directory and its subdirectories and delete it.

\begin{code}
find . -type f -print0 | xargs -0 stat -c "%Y %n" | sort -n | tail -1 | cut -d' ' -f2- | xargs rm -f
\end{code}

Comment: I'm not sure if this is the best way to do it, but it works.

Comment: @user1078749: I'm not sure if this is the best way to do it, but it works.

Comment: @user1078749
actual_code: find -type f -printf '%T+ %p\n' | sort | head -n 1


In [12]:
trainer.save_model("/content/drive/MyDrive/majorproj/codellama-fine-tuned-nl2bash")


NameError: name 'trainer' is not defined

In [None]:
ft_model.eval()

# Preprocess function for testing data
def preprocess_example(example):
    nl_command = example.get("nl_command", "").strip()
    bash_code = example.get("bash_code", "").strip()

    if not nl_command or not bash_code:
        return None  # Skip examples with missing fields

    text = f"[INST] Docstring: {nl_command} [/INST] Code: {bash_code}"
    return {"text": text, "nl_command": nl_command, "bash_code": bash_code}

# Function to load JSON test data
def load_json_as_dataset(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    dataset = Dataset.from_list(data)
    return dataset.filter(lambda x: x is not None)  # Filter out invalid examples

# Function to normalize Bash code for comparison
def normalize_bash_code(code):
    code = code.strip()
    code = re.sub(r"\s+", " ", code)  # Normalize whitespace
    return code

# Function to run inference
def run_inference(nl_command):
    input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(ft_model.device)
    output_ids = ft_model.generate(input_ids, max_length=150, temperature=0.7)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

# Function to evaluate functional correctness
def is_functionally_correct(predicted, actual):
    try:
        result_predicted = subprocess.run(predicted, shell=True, capture_output=True, text=True, timeout=5)
        result_actual = subprocess.run(actual, shell=True, capture_output=True, text=True, timeout=5)
        return result_predicted.stdout.strip() == result_actual.stdout.strip()
    except Exception as e:
        print(f"Execution error: {e}")
        return False

# Testing the model
def test_model(test_data):
    correct_predictions = 0
    functional_correct = 0
    total_examples = len(test_data)

    mismatches = []

    for example in test_data:
        nl_command = example["nl_command"]
        actual_bash_code = example["bash_code"]

        # Run inference
        try:
            predicted_bash_code = run_inference(nl_command)
        except Exception as e:
            print(f"Inference error: {e}")
            continue

        # Normalize for string match
        normalized_predicted = normalize_bash_code(predicted_bash_code)
        normalized_actual = normalize_bash_code(actual_bash_code)

        # Exact match evaluation
        if normalized_predicted == normalized_actual:
            correct_predictions += 1

        # Functional correctness evaluation
        if is_functionally_correct(normalized_predicted, normalized_actual):
            functional_correct += 1
        else:
            mismatches.append({
                "nl_command": nl_command,
                "predicted_bash_code": predicted_bash_code,
                "actual_bash_code": actual_bash_code,
            })

        # Print results for debugging
        print(f"NL Command: {nl_command}")
        print(f"Predicted Bash Command: {predicted_bash_code}")
        print(f"Actual Bash Command: {actual_bash_code}")
        print("-" * 80)

    accuracy = correct_predictions / total_examples
    functional_accuracy = functional_correct / total_examples

    # Log mismatches
    with open("mismatches.log", "w") as log_file:
        for mismatch in mismatches:
            log_file.write(json.dumps(mismatch, indent=4) + "\n")

    print(f"Exact Match Accuracy: {accuracy:.2%}")
    print(f"Functional Accuracy: {functional_accuracy:.2%}")
    return accuracy, functional_accuracy

# Load and preprocess the test dataset
if __name__ == "__main__":
    test_dataset = load_json_as_dataset("./test_data.json")  # Replace with your test file path
    test_data = test_dataset.map(preprocess_example).filter(lambda x: x is not None)

    # Run testing
    exact_match_acc, func_acc = test_model(test_data)
    print(f"Model Test Exact Match Accuracy: {exact_match_acc:.2%}")
    print(f"Model Test Functional Accuracy: {func_acc:.2%}")