In [None]:
!pip install openai

# Dataset Generation

In [None]:
from openai import OpenAI
import os, pandas as pd
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be written into (and committed with) the notebook. The literal placeholder is
# kept as a fallback so editing it in place still works as before.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "YOUR KEY")
client = OpenAI(api_key=OPENAI_KEY)
# System prompt used by the dataset-generation loop below: it asks the model to
# turn a code snippet into a natural-language task description (no
# implementation details, no variable names) that ends with the fixed
# "Only return the code..." suffix. The text is sent verbatim, so any edit to
# it changes what the model is asked to do.
prompt="""
Given a code snippet (X), GENERATE A NATURAL LANGUAGE TASK (T) THAT Summarize the overall FUNCTIONALITY OF THE CODE (X) .
T should not have any implementation details 
T should Start the task with "Write a l language (the language the code belongs to) function ,  and returns an output z , if X doesn't return anything then skip this line returns an output z "
T should only contain the overall summary of the code functionality.
T should not have any code related details.
T should not contain any variable names.
T should end with  these lines Only return the code, don't include any other information,such as a preamble or suffix.
Writing Format "Write a C function that allocates memory for and copies a given array of strings, and returns a pointer to the new array.

    Only return the code, don't include any other information,
    such as a preamble or suffix.
"
"""

# Initialize an empty dictionary to store DataFrames
# NOTE(review): `dfs` is not referenced anywhere else in the visible code.
dfs = {}
import time
def count_words(text):
    """Return how many whitespace-separated words `text` contains."""
    return len(text.split())

# Model used to write the natural-language task for each code sample.
ref_model = 'gpt-4'
# Directory holding the input CSVs. NOTE: the loop below reads the column
# named `Original_Code` (with an underscore) from every file.
path = 'Path to Datasets in csv format. MUST have two columns CWE, Original Code '
# Results are written next to the inputs as "Generated_<input name>".
savepath = path
generated_prefix = "Generated_"

files = os.listdir(path)

# Keep only source CSVs. Because outputs land in the same directory as the
# inputs, files that already carry the "Generated_" prefix must be excluded;
# otherwise a second run would treat earlier outputs as new datasets and
# produce "Generated_Generated_*.csv" (paying for every API call again).
csv_files = [
    file for file in files
    if file.endswith('.csv') and not file.startswith(generated_prefix)
]

for csv_file in csv_files:
    output_file_path = os.path.join(savepath, f"Generated_{csv_file}")
    print(output_file_path)

    # An existing output means this dataset was already processed: skip it.
    if os.path.exists(output_file_path):
        print("File Exists")
        continue

    file_path = os.path.join(path, csv_file)
    df = pd.read_csv(file_path)
    df['NLP_Task'] = None
    df['time_taken'] = None
    df['no_of_tokens'] = None

    for index, row in df.iterrows():
        org_code = row['Original_Code']

        # Best effort per row: a failed request is reported and leaves the
        # row's result columns as None instead of aborting the whole file.
        try:
            start_time = time.time()
            completion = client.chat.completions.create(
                model=ref_model,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": org_code},
                ],
                # temperature 0 plus a fixed seed to keep runs as repeatable
                # as the API allows.
                temperature=0,
                seed=42,
            )
            generated_task = completion.choices[0].message.content
            df.at[index, 'NLP_Task'] = generated_task
            end_time = time.time()
            execution_time = end_time - start_time
            df.at[index, 'time_taken'] = execution_time
            # Despite the column name this is a whitespace word count.
            df.at[index, 'no_of_tokens'] = count_words(generated_task)
        except Exception as e:
            print(f"An exception occurred: {e}")

    df.to_csv(output_file_path, index=False)






# NL Prompt Evaluation

In [None]:
!pip install pycocoevalcap

In [None]:

def generate_code(task, message, temp=0, model='gpt-4', max_attempts=None):
    """Ask the chat model to generate code for `task` and return its reply.

    The request is repeated until the reply contains more than five
    space-separated chunks, i.e. until it is not an (almost) empty answer.

    Args:
        task: User message describing the code to generate.
        message: System message with the generation instructions.
        temp: Sampling temperature passed to the API.
        model: Chat model name; defaults to the previously hard-coded 'gpt-4'.
        max_attempts: Optional cap on the number of requests. ``None`` (the
            default) keeps the original behaviour of retrying indefinitely.
            When the cap is reached the last reply is returned even if it is
            still short.

    Returns:
        The reply text (possibly an empty string when `max_attempts` is
        exhausted).
    """
    attempts = 0
    while True:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": message},
                {"role": "user", "content": task},
            ],
            temperature=temp,  # Adjust based on desired creativity
            seed=42,  # Fixed seed for consistency between runs
        )
        # `content` can be None when the model returns no text; treat that as
        # an empty reply so it is retried instead of crashing on `.split`.
        context = completion.choices[0].message.content or ''
        attempts += 1

        if len(context.split(' ')) > 5:
            return context
        if max_attempts is not None and attempts >= max_attempts:
            return context
    
import pandas as pd
import re
# Root directory of the datasets; it is concatenated directly with
# 'baseline/...', so it has to end with a path separator.
datapath='Your Datasets Path'

# System prompt for the baseline generator. It does not depend on the row, so
# it is defined once instead of on every iteration. The text (including its
# embedded indentation) is sent verbatim to the model.
message="""You are a code generator. Given the task, generate the code.
                    (a) The Code MUST be runnable without errors. 
                    (b) ONLY RETURN CODE NO Details or comments. 
                    (c) If Code to be generated is in C or C++ language It MUST have a main function 
                    (d) It should have all the dependencies, libraries required to compile the code without error"""

datasets=['LLMSecEval-prompts','SecureEvalDataset']
for dataset in datasets:
    df = pd.read_csv(datapath+'baseline/'+dataset+'.csv')
    baseline_models=['gpt-4']

    for mod in baseline_models:
        OriginalFile=dataset+'_'+mod+'_Generator_Baseline'+'.csv'

        for index, row in df.iterrows():
            task=row['NL Prompt']
            # SecureEvalDataset rows are always treated as Python; other
            # datasets carry the target language in their own column.
            if dataset=='SecureEvalDataset':
                lang='python'
            else:
                lang=row['Language']
            task=task.replace("<language>",lang)

            onlycode=''
            code=''
            tries=0
            # Re-query until the reply contains a Markdown code fence, giving
            # up (and keeping the last reply) once more than 10 attempts have
            # been made. The body is now consistently indented; the original
            # mixed indentation levels and could not be parsed.
            while onlycode=='' and '```' not in code:
                code=generate_code(task,message)
                tries+=1
                if tries>10:
                    onlycode=code
                    break
                print("onlycode ",onlycode)

            df.at[index,'org_code']=row['Original_Code']
            df.at[index,'generated_code']=code
            df.at[index,'generator']=mod

        df.to_csv(datapath+'baseline/'+OriginalFile, index=False)

In [None]:
import nltk
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from lexical_diversity import lex_div as ld
from textblob import TextBlob
import textstat
import torch

# Download required NLTK data
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence model used by `word_tokenize`
# from the separate 'punkt_tab' resource; fetch it as well so tokenization
# does not fail with a LookupError. On older releases that do not know this
# resource, the call just reports that and returns False.
nltk.download('punkt_tab')

# Load pre-trained GPT-2 model and tokenizer for perplexity
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define evaluation functions
def calculate_perplexity(text):
    """Calculate perplexity using GPT-2 model

    The text is scored by the module-level GPT-2 `model` against itself
    (labels equal the input ids); perplexity is ``exp(loss)`` of that pass.

    Args:
        text: The string to score. Text longer than the model's context
            window is truncated to it before scoring.

    Returns:
        The perplexity as a Python float.
    """
    # GPT-2 has a fixed number of positions; without truncation a longer
    # input fails inside the model instead of producing a score.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.n_positions,
    )
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    perplexity = torch.exp(loss).item()
    return perplexity

def lexical_diversity(text):
    """Calculate Lexical Diversity (Type-Token Ratio) over the words of `text`.

    Args:
        text: Raw text to analyse.

    Returns:
        The ratio of distinct tokens to total tokens.
    """
    # `ld.ttr` counts distinct vs. total items of the sequence it receives.
    # Handing it the raw string would compare characters, not words, so the
    # text is tokenized with the library's own tokenizer first.
    tokens = ld.tokenize(text)
    return ld.ttr(tokens)

def sentiment_analysis(text):
    """Return the (polarity, subjectivity) pair TextBlob reports for `text`."""
    sentiment = TextBlob(text).sentiment
    # polarity: -1 (negative) to 1 (positive)
    # subjectivity: 0 (objective) to 1 (subjective)
    return sentiment.polarity, sentiment.subjectivity

def readability_scores(text):
    """Compute textstat readability measures for `text`.

    Returns a 4-tuple, in this order:
      1. Flesch Reading Ease (note: the reading-ease score, not the
         Flesch-Kincaid grade level, even though callers name it `fk_score`)
      2. Gunning Fog index
      3. SMOG index
      4. textstat's consensus "text standard", as a string
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.text_standard(text, float_output=False),
    )

def calculate_conciseness(text):
    """Return the number of NLTK word tokens in `text` (a length measure)."""
    return len(nltk.word_tokenize(text))



In [None]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
import pandas as pd
import os

def _strip(s):
    return s.strip()

def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    """Score hypotheses against references with BLEU-1..4 and ROUGE-L.

    `hypothesis` and `references` may each be a list or a single item; a
    single item is wrapped in a one-element list. Items are paired by
    position. When `no_overlap` is true no scorer runs and an empty dict is
    returned. `no_skipthoughts` and `no_glove` are accepted but not used.

    Returns a dict mapping metric name to its corpus-level score.
    """
    hyp_items = hypothesis if isinstance(hypothesis, list) else [hypothesis]
    ref_items = references if isinstance(references, list) else [references]

    # The scorers take {id: [sentence]} mappings keyed by position.
    refs = dict(enumerate([ref] for ref in ref_items))
    hyps = dict(enumerate([hyp] for hyp in hyp_items))

    assert len(refs) == len(hyps), "The number of hypotheses must match the number of references."

    results = {}
    if no_overlap:
        return results

    scorers = (
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), "ROUGE_L"),
    )
    for scorer, label in scorers:
        corpus_score, _per_item = scorer.compute_score(refs, hyps)
        if isinstance(label, list):
            # BLEU yields one corpus score per n-gram order.
            results.update(zip(label, corpus_score))
        else:
            results[label] = corpus_score

    return results



def _strip(s):
    return s.strip()

def compute_metrics(hypothesis, references):
    """Score hypotheses against references with BLEU-1..4 and ROUGE-L.

    Args:
        hypothesis: A sequence of hypothesis strings, or a single string.
        references: A sequence of reference strings (one per hypothesis, paired
            by position), or a single string.

    Returns:
        A dict mapping metric name ("Bleu_1".."Bleu_4", "ROUGE_L") to its
        corpus-level score. A scorer that raises is reported on stdout and its
        metric(s) are simply absent from the dict.

    Raises:
        AssertionError: If the number of hypotheses and references differ.
    """
    # A bare string would otherwise be enumerated character by character and
    # scored as many one-character "sentences"; treat it as a single item.
    if isinstance(hypothesis, str):
        hypothesis = [hypothesis]
    if isinstance(references, str):
        references = [references]

    # The scorers take {id: [sentence]} mappings keyed by position.
    refs = {idx: [ref] for idx, ref in enumerate(references)}
    hyps = {idx: [hyp] for idx, hyp in enumerate(hypothesis)}
    assert len(refs) == len(hyps), "The number of hypotheses must match the number of references."

    ret_scores = {}
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), "ROUGE_L"),
    ]

    for scorer, method in scorers:
        # Best effort: one failing scorer (e.g. on a non-string cell) must not
        # prevent the other metric from being computed.
        try:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                # BLEU returns one corpus score per n-gram order.
                for sc, m in zip(score, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
        except Exception as e:
            print(f"Error computing {method}: {e}")
            continue

    return ret_scores

# Paths
datasets = ['SecureEvalDataset.csv','LLMSecEval-prompts_.csv','SecLLMHolmes.csv']

for dataset in datasets:
    output_file = f'./dataseteva/gpt-4_output_metrics_{dataset}'
    datapath = f'./dataset/{dataset}'

    # Make sure the output directory exists before the slow per-row scoring
    # starts, so the results are not lost to a missing-directory error when
    # they are finally written.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    df = pd.read_csv(datapath)
    # Initialize result list
    results = []
    for index, row in df.iterrows():
        origin_code = row['Original_Code']
        task = row['NL Prompt']
        # NOTE(review): this reads 'generated code' (with a space), while the
        # baseline script in this file writes 'generated_code' — confirm the
        # column name of the CSVs used here.
        generated_code = row['generated code']

        # Code-similarity metrics: generated code vs. the original code.
        metrics = compute_metrics([generated_code], [origin_code])
        # Text-quality metrics of the natural-language prompt itself.
        perplexity = calculate_perplexity(task)
        lexical_div = lexical_diversity(task)
        polarity, subjectivity = sentiment_analysis(task)
        fk_score, gf_score, smog_score,std = readability_scores(task)
        word_count = calculate_conciseness(task)

        print(metrics)
        # Store the result
        result = {
            'origin_code': origin_code,
            'task': task,
            'generated_code': generated_code,
            'perplexity': perplexity,
            'lexical_diversity': lexical_div,
            'sentiment_polarity': polarity,
            'sentiment_subjectivity': subjectivity,
            'fk_score': fk_score,
            'gf_score': gf_score,
            'smog_score': smog_score,
            'std_score': std,
            'word_count': word_count
        }
        result.update(metrics)  # Add all metrics to the dictionary
        results.append(result)

    # Convert results to a DataFrame and save to CSV
    df_results = pd.DataFrame(results)
    df_results.to_csv(output_file, index=False)

    print(f"Metrics have been saved to {output_file}")

        
