In [1]:
pip install accelerate peft bitsandbytes transformers trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4

In [2]:
# Mount Google Drive at /content/drive; later cells read the model checkpoint
# and the test JSON from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Load the fine-tuned model and its tokenizer from the checkpoint directory on Drive
output_dir = "/content/drive/MyDrive/majorproj/llm"
ft_model = AutoModelForCausalLM.from_pretrained(output_dir)
og_tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Fall back to the EOS token for padding only when the checkpoint defines no
# pad token, so a pad token saved with the tokenizer is not silently replaced.
if og_tokenizer.pad_token is None:
    og_tokenizer.pad_token = og_tokenizer.eos_token

# Load the test dataset
def load_json_as_dataset(json_path):
    """Read a JSON file and wrap its records in a ``datasets.Dataset``.

    Args:
        json_path: Path to a JSON file. The parsed content is handed to
            ``Dataset.from_list``, so it is expected to be a list of
            record dicts.

    Returns:
        The ``Dataset`` built from the parsed records.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default text encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return Dataset.from_list(data)

# Preprocessing function for test data
def preprocess_example(example):
    """Build the prompt-formatted ``text`` field for one test record.

    Args:
        example: Mapping that may contain ``nl_command`` and ``bash_code``;
            a missing key is treated as an empty string.

    Returns:
        A dict with the combined ``text`` prompt plus the ``nl_command`` and
        ``bash_code`` values it was built from.
    """
    # Look each field up once, defaulting to "" when the key is absent
    command = example.get("nl_command", "")
    code = example.get("bash_code", "")

    record = {"text": f"[INST] Docstring: {command} [/INST] Code: {code}"}
    record["nl_command"] = command
    record["bash_code"] = code
    return record

# Read the test split from Drive, then map preprocess_example over it to add
# the prompt-formatted "text" column alongside nl_command / bash_code.
test_dataset = load_json_as_dataset("/content/drive/MyDrive/majorproj/data_test.json")
test_data = test_dataset.map(preprocess_example)

# Sanity check: print one record to confirm the expected column names exist
print(test_data[0])  # first mapped example

# Module-level ROUGE scorer configured for ROUGE-1 only
rouge = rouge_scorer.RougeScorer(["rouge1"])

# Function to run inference
def run_inference(input_text, max_new_tokens=150):
    """Generate the model's completion for a single prompt.

    Args:
        input_text: Prompt string, e.g. ``"[INST] Docstring: ... [/INST] Code:"``.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation only (prompt tokens removed), decoded
        without special tokens and stripped of surrounding whitespace.
    """
    # Tokenize the prompt and place the tensors on the same device as the model
    encoding = og_tokenizer(
        input_text, return_tensors="pt", padding=True, truncation=True, max_length=150
    ).to(ft_model.device)
    prompt_length = encoding.input_ids.shape[-1]

    # Greedy decoding: a temperature setting has no effect unless sampling is
    # enabled, so request deterministic output explicitly. max_new_tokens
    # bounds only the continuation, whereas max_length would also count the
    # prompt and could leave no room to generate.
    output_ids = ft_model.generate(
        encoding.input_ids,
        attention_mask=encoding.attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=False,
        pad_token_id=og_tokenizer.pad_token_id,
    )

    # A causal LM returns prompt + continuation; keep only the continuation so
    # the prompt text does not leak into the prediction being scored.
    completion_ids = output_ids[0][prompt_length:]
    output_text = og_tokenizer.decode(completion_ids, skip_special_tokens=True)
    return output_text.strip()

# Evaluate the model on the first `num_samples` examples from the test dataset
def evaluate_model(test_data, num_samples=10):
    """Score the model on the first ``num_samples`` examples of ``test_data``.

    For each example the NL command is turned into a prompt, passed to
    ``run_inference``, and the prediction is compared with the reference
    bash command using sentence-level BLEU, ROUGE-1 F-measure and exact match.

    Args:
        test_data: Indexable collection of records with ``nl_command`` and
            ``bash_code`` keys.
        num_samples: Maximum number of leading examples to evaluate.

    Returns:
        Tuple ``(avg_bleu_score, avg_rouge_score, accuracy)``. All three are
        ``0.0`` when there is nothing to evaluate.
    """
    total_samples = max(0, min(num_samples, len(test_data)))
    if total_samples == 0:
        # Avoid dividing by zero on an empty dataset or num_samples <= 0
        print("No samples to evaluate.")
        return 0.0, 0.0, 0.0

    total_bleu_score = 0.0
    total_rouge_score = 0.0
    total_correct_predictions = 0

    for i in range(total_samples):
        example = test_data[i]
        nl_command = example["nl_command"]
        actual_bash_code = example["bash_code"]

        # Run inference on the NL command
        input_text = f"[INST] Docstring: {nl_command} [/INST] Code:"
        predicted_bash_code = run_inference(input_text)

        # If the decoded output still begins with the prompt, drop it so only
        # the generated command is scored.
        if predicted_bash_code.startswith(input_text):
            predicted_bash_code = predicted_bash_code[len(input_text):].strip()

        # Print results for debugging
        print(f"NL Command: {nl_command}")
        print(f"Predicted Bash Command: {predicted_bash_code}")
        print(f"Actual Bash Command: {actual_bash_code}")
        print("-" * 80)

        # Sentence-level BLEU on whitespace tokens.
        # NOTE(review): the default BLEU uses up to 4-grams without smoothing,
        # so very short commands may score near zero - consider a smoothing
        # function if that matters for the report.
        bleu_score = sentence_bleu([actual_bash_code.split()], predicted_bash_code.split())
        total_bleu_score += bleu_score

        # ROUGE-1 F-measure; RougeScorer.score takes (target, prediction) and
        # returns a dict keyed by the configured ROUGE type.
        rouge_scores = rouge.score(actual_bash_code, predicted_bash_code)
        total_rouge_score += rouge_scores["rouge1"].fmeasure

        # Exact-match accuracy, ignoring surrounding whitespace
        if predicted_bash_code.strip() == actual_bash_code.strip():
            total_correct_predictions += 1

    # Average each metric over the evaluated samples
    avg_bleu_score = total_bleu_score / total_samples
    avg_rouge_score = total_rouge_score / total_samples
    accuracy = total_correct_predictions / total_samples

    # Print the evaluation results
    print(f"Average BLEU Score on first {total_samples} samples: {avg_bleu_score:.4f}")
    print(f"Average ROUGE-1 Score on first {total_samples} samples: {avg_rouge_score:.4f}")
    print(f"Accuracy on first {total_samples} samples: {accuracy:.2%}")

    return avg_bleu_score, avg_rouge_score, accuracy

# Run the evaluation on the first 10 test examples; evaluate_model prints the
# per-example predictions and the averaged metrics, and returns the averages.
avg_bleu, avg_rouge, accuracy = evaluate_model(test_data, num_samples=10)


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [8]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d932e91ca5e97f8ac23788f40753cd7889c9f5c6c9f56efc7364d8ab894ddc10
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
