In [1]:
! pip install accelerate peft bitsandbytes transformers trl

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.23.1-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.3.1 (from accelerate)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K  

In [2]:
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

def preprocess_example(example):
    example_dict = {
        "srno": example.get("srno", None),
        "nl_command": example.get("nl_command", None),
        "bash_code": example.get("bash_code", None),
    }
    text = f"[INST] Docstring: {example_dict['nl_command']} [/INST] Code: {example_dict['bash_code']}"
    return {"text": text}

def finetune_llama_v2():
    # Load your dataset
    dataset = load_dataset("AnishJoshi/nl2bash-custom")

    # Preprocess the dataset
    data = dataset["train"].map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])

    tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        "codellama/CodeLlama-7b-hf", quantization_config=bnb_config, device_map="auto"
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

    training_arguments = TrainingArguments(
        output_dir="codellama2-finetuned-nl2bash",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=1,
        max_steps=100,
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=512
    )

    trainer.train()
    return model, tokenizer

if __name__ == "__main__":
    ft_mode, og_tokenizer = finetune_llama_v2()

Downloading readme:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/4.32M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/540k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/544k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,2.5022
20,1.5955
30,1.4024
40,1.3364
50,1.2934
60,1.3076
70,1.2962
80,1.2558
90,1.2474
100,1.2834




In [3]:
ft_model = ft_mode
def run_inference(input_text):

    # Tokenize the user input
    input_ids = og_tokenizer(input_text, return_tensors="pt").input_ids

    # Feed the tokenized input into the model for inference
    output_ids = ft_model.generate(input_ids, max_length=200, num_return_sequences=1, temperature=0.7)

    # Decode the output tokens to generate the predicted bash command
    output_text = og_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text

if __name__ == "__main__":
    # Example usage
    nl_command = 'List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

    # Prepare the input for the model
    input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"
    predicted_bash_command = run_inference(input_text)
    print("Predicted Bash Command:", predicted_bash_command)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Predicted Bash Command: [INST] Docstring: List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order. [/INST] Code: ps -eo pid,user,cmd,rss | awk '$4 > 100000 {print $0}' | sort -k 4 -nr | head -n 10 | column -t | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's/[ \t]*$//' | sed 's/^[ \t]*//' | sed 's


In [4]:
# Define the natural language command
nl_command = 'Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

# Prepare the input for the model
input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"

# Tokenize the input
inputs = og_tokenizer(nl_command, return_tensors="pt").to(ft_model.device)

# Generate the output
outputs = ft_model.generate(**inputs, max_length=150)

# Decode the output
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Bash Command:")
print(generated_code)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Bash Command:
Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.

\begin{code}
ps -eo pid,user,pri,vsz,args | awk '$4>100000 {print $0}' | sort -k 4 -r
\end{code}

Comment: I'm not sure what you mean by "sort them by memory usage in descending order".  Do you mean "sort them by memory usage in descending order of memory usage"?  Or do you mean "sort them by memory usage in descending order of memory usage, but only if memory usage


In [5]:
test_dataset = load_dataset("AnishJoshi/nl2bash-custom", split="test")
test_data = test_dataset.map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/2458 [00:00<?, ? examples/s]

In [6]:
sample_example = test_data[1976]

import re

# Input string
input_string = sample_example['text']

# Regular expression pattern to extract instruction and code
pattern = r'\[INST\] Docstring: (.+?) \[/INST\] Code: (.+)'

# Match the pattern
match = re.match(pattern, input_string)

# Extract instruction and code
if match:
    nl_command = match.group(1)
    actual_code = match.group(2)
    print("nl_command =", nl_command)
    print("code =", actual_code)
else:
    print("No match found.")



nl_command = Find all .mp3 files with more then 10MB and delete them from root directory .
code = find /  -type f -name *.mp3 -size +10M -exec rm  {} \;


In [7]:
# Run inference
input_text = f"Bash code for {nl_command}"
inputs = og_tokenizer(input_text, return_tensors="pt").to(ft_model.device)
outputs = ft_model.generate(**inputs, max_length=150)
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Input NL Command:", nl_command)
print("Predicted Bash Command:", generated_code)
print(f"actual_code: {actual_code}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Input NL Command: Find all .mp3 files with more then 10MB and delete them from root directory .
Predicted Bash Command: Bash code for Find all .mp3 files with more then 10MB and delete them from root directory .

\begin{code}
find / -name "*.mp3" -size +10M -exec rm -f {} \;
\end{code}

Comment: I'm not sure if this is a good idea. I'm not sure if it will delete all the files in the directory.

Comment: @user1126332, it will delete all the files in the directory.

Comment: @user1126332, it will delete all the files in the directory.

Comment: @user1126
actual_code: find /  -type f -name *.mp3 -size +10M -exec rm  {} \;


In [10]:
import torch
torch.save(ft_model.state_dict(), 'models/codellama-fine-tuned-nl2bash.pth')