In [None]:
! pip install nltk rouge

In [None]:
import numpy as np
from datasets import load_dataset
import re
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import requests
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
def preprocess_example(example):
    """Collapse one dataset row into a single instruction-formatted string.

    Args:
        example: Mapping that may carry ``nl_command`` and ``bash_code``.
            Both are read with ``.get``, so an absent key is rendered as the
            literal text ``None`` in the output.

    Returns:
        A one-key dict ``{"text": "[INST] Docstring: ... [/INST] Code: ..."}``.
    """
    description = example.get("nl_command", None)
    command = example.get("bash_code", None)
    return {"text": f"[INST] Docstring: {description} [/INST] Code: {command}"}

def extract_code_type1(input_string):
    r"""Return the text enclosed by the first ``\begin{code}`` ... ``\end{code}`` pair.

    The search is non-greedy and lets ``.`` span newlines, so a multi-line
    body is returned intact (including any leading/trailing whitespace).

    Args:
        input_string: Text to scan.

    Returns:
        The captured body as a string, or ``None`` when no such pair exists.
    """
    code_block = re.compile(r'\\begin\{code\}(.*?)\\end\{code\}', re.DOTALL)
    found = code_block.search(input_string)
    # A match object is always truthy, even when the captured body is "".
    return found.group(1) if found else None

def run_llama_inference(nl_command, timeout=120):
    """Ask the hosted model for bash code that performs ``nl_command``.

    The endpoint URL is read from the ``llama_api_url`` environment variable
    and the value of ``hf_token`` is sent verbatim as the ``Authorization``
    header.

    Args:
        nl_command: Natural-language description; it is embedded in the
            prompt as ``"Bash code to {nl_command}"``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled endpoint would block the whole
            evaluation loop indefinitely.

    Returns:
        The ``generated_text`` field of the first element of the JSON response.

    Raises:
        RuntimeError: If ``llama_api_url`` is not set.
        requests.exceptions.RequestException: On timeout, connection failure
            or a non-2xx HTTP status.
        ValueError: If the response body is not a non-empty list whose first
            element contains ``generated_text`` (e.g. an error payload).
    """
    api_url = os.getenv("llama_api_url")
    if not api_url:
        raise RuntimeError("Environment variable 'llama_api_url' is not set.")

    headers = {
        "Accept": "application/json",
        "Authorization": os.getenv("hf_token"),
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": f"Bash code to {nl_command}",
        "parameters": {
            "temperature": 0.01,
            "max_new_tokens": 500,
        },
    }

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures as such instead of tripping over the error body below.
    response.raise_for_status()
    output = response.json()

    # Anything other than [{"generated_text": ...}, ...] would previously have
    # failed with an opaque KeyError/TypeError; report what was received.
    if (
        not isinstance(output, list)
        or not output
        or not isinstance(output[0], dict)
        or "generated_text" not in output[0]
    ):
        raise ValueError(f"Unexpected response from inference endpoint: {output!r}")
    return output[0]["generated_text"]

def extract_pairs(input_string):
    """Split a ``[INST] Docstring: ... [/INST] Code: ...`` string into its parts.

    Args:
        input_string: Text that starts with the
            ``[INST] Docstring: <nl> [/INST] Code: <code>`` layout.

    Returns:
        A tuple ``(actual_code, nl_command)`` -- note the code comes first.

    Raises:
        ValueError: If ``input_string`` does not follow the expected layout.
    """
    pattern = r'\[INST\] Docstring: (.+?) \[/INST\] Code: (.+)'

    # DOTALL lets "." cross newlines; without it a multi-line command is
    # silently cut at its first line break (or fails to match at all when the
    # description itself spans several lines).
    match = re.match(pattern, input_string, re.DOTALL)
    if match is None:
        # Previously this path printed a message and then crashed with
        # UnboundLocalError on the return statement.
        raise ValueError("No match found.")

    nl_command = match.group(1)
    actual_code = match.group(2)
    return actual_code, nl_command


In [None]:
# Fetch the test split and replace the three raw columns with a single "text"
# column produced by preprocess_example.
test_dataset = load_dataset("AnishJoshi/nl2bash-custom", split="test")
test_data = test_dataset.map(
    preprocess_example,
    remove_columns=["srno", "nl_command", "bash_code"],
)

# ROUGE scorer used by the evaluation loop.
rouge = Rouge()

# Denominator for the averages; starts at the full test-set size.
num_examples = len(test_data)

# Running totals accumulated by the evaluation loop.
total_correct_predictions = 0
total_rouge_score = 0.0
total_bleu_score = 0.0

In [None]:
# Reset the accumulators here so that re-running this cell starts from a clean
# slate instead of adding to totals (and further shrinking num_examples) left
# behind by a previous run.
total_bleu_score = 0.0
total_rouge_score = 0.0
total_correct_predictions = 0
num_examples = len(test_data)

for example in test_data:
    try:
        # Recover the reference command and the prompt from the formatted text.
        actual_code, nl_command = extract_pairs(example['text'])

        # Query the model, then keep only the \begin{code}...\end{code} body.
        raw_output = run_llama_inference(nl_command)
        predicted_code = extract_code_type1(raw_output)
        if predicted_code is None:
            # Same outcome as before (the example is skipped), but with a
            # message that says why instead of an AttributeError on None.
            raise ValueError("model output has no \\begin{code}...\\end{code} block")

        # Unigram-only weights: the summary reports this metric as BLEU-1,
        # whereas sentence_bleu's default weights average 1- to 4-gram precision.
        bleu_score = sentence_bleu(
            [actual_code.split()],
            predicted_code.split(),
            weights=(1, 0, 0, 0),
        )

        # ROUGE-1 F-measure of the prediction against the reference.
        rouge_scores = rouge.get_scores(predicted_code, actual_code)
        rouge_f1 = rouge_scores[0]['rouge-1']['f']

        # Exact match after trimming surrounding whitespace.
        is_exact_match = predicted_code.strip() == actual_code.strip()
    except Exception as e:
        print(f"Error processing example: {str(e)}")
        # Skip example if any step fails
        num_examples -= 1
    else:
        # Accumulate only once every metric has been computed, so a failure in
        # a later metric cannot leave a partial contribution from an example
        # that is then removed from the denominator.
        total_bleu_score += bleu_score
        total_rouge_score += rouge_f1
        if is_exact_match:
            total_correct_predictions += 1



In [None]:
# Calculate average scores over the examples that were scored successfully.
# num_examples is decremented for every skipped example, so it can reach zero
# (e.g. the endpoint is unreachable); guard the division in that case.
if num_examples > 0:
    average_bleu_score = total_bleu_score / num_examples
    average_rouge_score = total_rouge_score / num_examples
    binary_accuracy = total_correct_predictions / num_examples
else:
    print("No examples were scored successfully; metrics are undefined.")
    average_bleu_score = average_rouge_score = binary_accuracy = float("nan")

# Print the average scores
print(f"Average BLEU-1 score: {average_bleu_score:.2f}")
print(f"Average ROUGE-1 score: {average_rouge_score:.2f}")
print(f"Binary accuracy: {binary_accuracy:.2%}")