In [1]:
pip install accelerate peft bitsandbytes transformers trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import json
from datasets import Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

def preprocess_example(example):
    example_dict = {
        "srno": example.get("srno", None),
        "nl_command": example.get("nl_command", None),
        "bash_code": example.get("bash_code", None),
    }
    text = f"[INST] Docstring: {example_dict['nl_command']} [/INST] Code: {example_dict['bash_code']}"
    return {"text": text}

def load_json_as_dataset(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return Dataset.from_list(data)

def finetune_llama_v2():
    # Load your dataset
    train_dataset = load_json_as_dataset("/content/drive/MyDrive/majorproj/data_train.json")

    # Preprocess the dataset
    train_data = train_dataset.map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])

    tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        "codellama/CodeLlama-7b-hf", quantization_config=bnb_config, device_map="auto"
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

    training_arguments = TrainingArguments(
        output_dir="codellama2-finetuned-nl2bash",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=1,
        max_steps=500,
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=512
    )

    trainer.train()
    return model, tokenizer

if __name__ == "__main__":
    ft_mode, og_tokenizer = finetune_llama_v2()

Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/19658 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,2.3479
20,1.5552
30,1.3655
40,1.2988
50,1.2434
60,1.2499
70,1.218
80,1.1352
90,1.1267
100,1.173


Step,Training Loss
10,2.3479
20,1.5552
30,1.3655
40,1.2988
50,1.2434
60,1.2499
70,1.218
80,1.1352
90,1.1267
100,1.173


In [10]:
ft_model = ft_mode
def run_inference(input_text):

    # Tokenize the user input
    input_ids = og_tokenizer(input_text, return_tensors="pt").input_ids

    # Feed the tokenized input into the model for inference
    output_ids = ft_model.generate(input_ids, max_length=200, num_return_sequences=1, temperature=0.7)

    # Decode the output tokens to generate the predicted bash command
    output_text = og_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text

if __name__ == "__main__":
    # Example usage
    nl_command = 'List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

    # Prepare the input for the model
    input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"
    predicted_bash_command = run_inference(input_text)
    print("Predicted Bash Command:", predicted_bash_command)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Predicted Bash Command: [INST] Docstring: List all the processes that are using more than 100MB of memory and sort them by memory usage in descending order. [/INST] Code: ps aux | awk '{if ($3 > 100) print $0}' | sort -k 4 -r | head -n 10 | nl -nln -s ' ' | sed 's/^[ \t]*//' | column -t -s ' ' | less -S -N -X -K -s -m -p '^[0-9]*' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -e '/\s*$/d' -


In [11]:
# Define the natural language command
nl_command = 'Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.'

# Prepare the input for the model
input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"

# Tokenize the input
inputs = og_tokenizer(nl_command, return_tensors="pt").to(ft_model.device)

# Generate the output
outputs = ft_model.generate(**inputs, max_length=150)

# Decode the output
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Bash Command:")
print(generated_code)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Bash Command:
Bash code to list all the processes that are using more than 100MB of memory and sort them by memory usage in descending order.

\begin{code}
ps -eo pid,user,group,rss,vsz,args | awk '$5>100000 {print $0}' | sort -k5nr
\end{code}

Comment: I think you need to add a `,` after `vsz` in the `ps` command.

Comment: @Kusalananda, yes, you are right. Thanks for pointing it out.

Answer: \begin{code}
ps -eo pid,user


In [15]:
test_dataset = load_json_as_dataset("/content/drive/MyDrive/majorproj/data_test.json")
test_data = test_dataset.map(preprocess_example, remove_columns=["srno", "nl_command", "bash_code"])


Map:   0%|          | 0/2458 [00:00<?, ? examples/s]

In [16]:
sample_example = test_data[1976]

import re

# Input string
input_string = sample_example['text']

# Regular expression pattern to extract instruction and code
pattern = r'\[INST\] Docstring: (.+?) \[/INST\] Code: (.+)'

# Match the pattern
match = re.match(pattern, input_string)

# Extract instruction and code
if match:
    nl_command = match.group(1)
    actual_code = match.group(2)
    print("nl_command =", nl_command)
    print("code =", actual_code)
else:
    print("No match found.")

nl_command = Find all .mp3 files with more then 10MB and delete them from root directory .
code = find /  -type f -name *.mp3 -size +10M -exec rm  {} \;


In [17]:
input_text = f"Bash code for {nl_command}"
inputs = og_tokenizer(input_text, return_tensors="pt").to(ft_model.device)
outputs = ft_model.generate(**inputs, max_length=150)
generated_code = og_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Input NL Command:", nl_command)
print("Predicted Bash Command:", generated_code)
print(f"actual_code: {actual_code}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input NL Command: Find all .mp3 files with more then 10MB and delete them from root directory .
Predicted Bash Command: Bash code for Find all .mp3 files with more then 10MB and delete them from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .mp3 files are deleted from root directory .
actual_code: find /  -type f -name *.mp3 -size +10M -exec rm  {} \;


In [None]:
import torch
torch.save(ft_model.state_dict(), '/content/drive/MyDrive/majorproj/codellama-fine-tuned-nl2bash')