In [5]:
import json
import math
import ast

# Function to count lines of code
def count_loc(jsonl_file):
    """Sum the line counts of all completions in a JSONL file, per task.

    Every non-blank line of ``jsonl_file`` is parsed as one JSON object.
    A record's ``completion`` string contributes one more than its number
    of newline characters; records sharing a ``task_id`` are added together.

    Args:
        jsonl_file: Path to a JSON Lines file (one JSON object per line).

    Returns:
        dict mapping each ``task_id`` seen (``None`` if a record has no such
        key) to its accumulated line count. A task whose completions are all
        missing or empty maps to 0.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    loc_count = {}

    # Explicit encoding so the result does not depend on the platform default.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. an empty last line) are not records;
                # feeding them to json.loads would raise.
                continue

            data = json.loads(record)
            task_id = data.get('task_id')
            completion = data.get('completion')

            # Register the task even when it has no completion, so it is
            # reported with a count of 0.
            loc_count.setdefault(task_id, 0)

            # Count lines of code if completion exists
            if completion:
                loc_count[task_id] += completion.count('\n') + 1

    return loc_count

# Function to extract variable operands
def get_variable_operands(code):
    """Collect the identifier names that appear in a piece of Python source.

    The source is parsed with ``ast`` and every ``ast.Name`` node is taken
    as one operand occurrence.

    Args:
        code: Python source text.

    Returns:
        A ``(distinct_names, occurrences)`` tuple: the set of distinct names
        and a list with one entry per occurrence, in ``ast.walk`` order.
        If ``code`` cannot be parsed, a notice is printed and both are empty.
    """
    try:
        syntax_tree = ast.parse(code)
    except SyntaxError:
        print("Syntax error in code, skipping operands extraction.")
        return set(), []

    occurrences = [
        node.id
        for node in ast.walk(syntax_tree)
        if isinstance(node, ast.Name)
    ]
    return set(occurrences), occurrences

# Function to calculate Halstead metrics
def calculate_halstead_metrics(completion):
    """Compute Halstead length, vocabulary, volume, difficulty and effort.

    Operators are counted by splitting ``completion`` on whitespace and
    keeping the tokens found in the operator table below, so an operator is
    only seen when it is surrounded by whitespace (``a + b`` counts,
    ``a+b`` does not). Operands are the names reported by
    ``get_variable_operands`` (``ast.Name`` nodes).

    Args:
        completion: Python source text.

    Returns:
        On success, a dict with the keys ``"Halstead Length (N)"``,
        ``"Halstead Vocabulary (n)"``, ``"Halstead Volume (V)"``,
        ``"Halstead Difficulty (D)"`` and ``"Halstead Effort (E)"``.
        When the metrics are undefined (no operators and no operands at all,
        or operators but no operands), a dict with the single key
        ``"Error"`` describing why.
    """
    operators = [
        '+', '-', '*', '/', '//', '%', '**',
        '==', '!=', '>', '<', '>=', '<=',
        'and', 'or', 'not',
        '=', '+=', '-=', '*=', '/=', '//=', '%=', '**=',
        '&', '|', '^', '~', '<<', '>>',
        # NOTE: the two-word entries can never equal a single token produced
        # by str.split(); 'not in' / 'is not' in the source are counted as
        # their separate one-word operators instead.
        'in', 'not in',
        'is', 'is not'
    ]

    # Split once and derive both the total (N1) and distinct (n1) operators.
    operator_tokens = [token for token in completion.split() if token in operators]
    N1 = len(operator_tokens)
    n1 = set(operator_tokens)

    n2, operands_list = get_variable_operands(completion)
    N2 = len(operands_list)

    N = N1 + N2
    n = len(n1) + len(n2)

    if n == 0:  # Handling math domain error
        return {"Error": "Halstead Vocabulary (n) is zero. Cannot calculate log2."}

    if not n2:
        # Operators were found but no operands (e.g. only literals, or the
        # code failed to parse): the difficulty term divides by len(n2).
        return {"Error": "Number of distinct operands (n2) is zero. Cannot calculate Halstead Difficulty."}

    V = N * math.log2(n)
    D = (len(n1) / 2) * (N2 / len(n2))
    E = D * V

    return {
        "Halstead Length (N)": N,
        "Halstead Vocabulary (n)": n,
        "Halstead Volume (V)": V,
        "Halstead Difficulty (D)": D,
        "Halstead Effort (E)": E,
    }

# Function to calculate average Halstead metrics
def calculate_avg_metrics(halstead_metrics_list):
    """Average each metric over a list of per-sample metric dicts.

    Args:
        halstead_metrics_list: List of dicts mapping metric name to a
            number. Dicts may also carry an ``"Error"`` key, which is
            ignored, and need not all contain the same metrics.

    Returns:
        dict mapping every metric name found in any input dict to the mean
        of that metric over the dicts that contain it, in first-seen key
        order. Returns an empty dict for an empty list.
    """
    # A dict is used as an insertion-ordered set so the output keeps the
    # order in which metric names first appear. Keys are gathered from every
    # entry, not just the first, so a leading error-only dict does not hide
    # the metrics of the remaining entries.
    metric_keys = {}
    for metrics in halstead_metrics_list:
        for key in metrics:
            if key != "Error":
                metric_keys.setdefault(key, None)

    avg_metrics = {}
    for key in metric_keys:
        # Non-empty by construction: each key came from at least one dict.
        valid_metrics = [metrics[key] for metrics in halstead_metrics_list if key in metrics]
        avg_metrics[key] = sum(valid_metrics) / len(valid_metrics)

    return avg_metrics

# Main execution block
if __name__ == "__main__":
    jsonl_file = "/content/data_gpt-3.5-turbo-1106.jsonl"

    loc_counts = count_loc(jsonl_file)

    # Read the file once and remember, for every task, the completion of its
    # FIRST record. Halstead metrics are computed from that record only
    # (LOC, by contrast, is summed over all records of a task by count_loc).
    first_completions = {}
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                continue  # blank lines are not records
            data = json.loads(record)
            # setdefault keeps the first value even when that value is None.
            first_completions.setdefault(data.get('task_id'), data.get('completion'))

    halstead_metrics_list = []

    for task_id, loc_count in loc_counts.items():
        print(f"Task ID: {task_id}, LOC: {loc_count}")

        completion = first_completions.get(task_id)
        if completion:
            # Deliberately best-effort: one failing completion is reported
            # and skipped so the remaining tasks are still processed.
            try:
                halstead_metrics = calculate_halstead_metrics(completion)

                if "Error" in halstead_metrics:
                    print(f"Error calculating Halstead metrics for Task ID {task_id}: {halstead_metrics['Error']}")
                else:
                    halstead_metrics_list.append(halstead_metrics)

                    print("Halstead Metrics:")
                    for metric, value in halstead_metrics.items():
                        print(f"{metric}: {value}")
            except Exception as e:
                print(f"An error occurred while calculating Halstead metrics for Task ID {task_id}: {e}")
        print()

    # Calculate the average Halstead metrics
    if halstead_metrics_list:
        avg_metrics = calculate_avg_metrics(halstead_metrics_list)

        print("Average Halstead Metrics:")
        for metric, value in avg_metrics.items():
            print(f"{metric}: {value}")
    else:
        print("No valid Halstead metrics were calculated.")


Task ID: HumanEval/0, LOC: 6
Halstead Metrics:
Halstead Length (N): 23
Halstead Vocabulary (n): 13
Halstead Volume (V): 85.11011351724513
Halstead Difficulty (D): 2.8499999999999996
Halstead Effort (E): 242.56382352414857

Task ID: HumanEval/1, LOC: 15
Halstead Metrics:
Halstead Length (N): 33
Halstead Vocabulary (n): 12
Halstead Volume (V): 118.30376252379816
Halstead Difficulty (D): 7.5
Halstead Effort (E): 887.2782189284861

Task ID: HumanEval/2, LOC: 10
Halstead Metrics:
Halstead Length (N): 8
Halstead Vocabulary (n): 5
Halstead Volume (V): 18.575424759098897
Halstead Difficulty (D): 1.6666666666666667
Halstead Effort (E): 30.95904126516483

Task ID: HumanEval/3, LOC: 7
Halstead Metrics:
Halstead Length (N): 13
Halstead Vocabulary (n): 10
Halstead Volume (V): 43.18506523353571
Halstead Difficulty (D): 3.0
Halstead Effort (E): 129.55519570060713

Task ID: HumanEval/4, LOC: 6
Halstead Metrics:
Halstead Length (N): 24
Halstead Vocabulary (n): 13
Halstead Volume (V): 88.81055323538621


In [1]:
import json
import random

def count_loc(jsonl_file):
    """Sum the line counts of all completions in a JSONL file, per task.

    Every non-blank line of ``jsonl_file`` is parsed as one JSON object.
    A record's ``completion`` string contributes one more than its number
    of newline characters; records sharing a ``task_id`` are added together.

    Args:
        jsonl_file: Path to a JSON Lines file (one JSON object per line).

    Returns:
        dict mapping each ``task_id`` seen (``None`` if a record has no such
        key) to its accumulated line count. A task whose completions are all
        missing or empty maps to 0.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    loc_count = {}

    # Explicit encoding so the result does not depend on the platform default.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. an empty last line) are not records;
                # feeding them to json.loads would raise.
                continue

            data = json.loads(record)
            task_id = data.get('task_id')
            completion = data.get('completion')

            # Register the task even when it has no completion, so it is
            # reported with a count of 0.
            loc_count.setdefault(task_id, 0)

            # Count lines of code if completion exists
            if completion:
                loc_count[task_id] += completion.count('\n') + 1

    return loc_count

# Report the per-task LOC of one results file, then the mean over all tasks.
jsonl_file = "/content/cleanedClaude_2.jsonl"
loc_counts = count_loc(jsonl_file)
for task_id, loc_count in loc_counts.items():
    print(f"Task ID: {task_id}, LOC: {loc_count}")

# Calculate the total LOC and the number of tasks
total_loc = sum(loc_counts.values())
num_tasks = len(loc_counts)

if num_tasks:
    # Calculate the average LOC
    average_loc = total_loc / num_tasks

    # Print the average LOC
    print(f"\nAverage LOC across all tasks: {average_loc:.2f}")
else:
    # A file with no records has no tasks; the mean is undefined rather
    # than a division-by-zero crash.
    average_loc = None
    print("\nNo tasks found; average LOC is undefined.")

Task ID: HumanEval/0, LOC: 16
Task ID: HumanEval/1, LOC: 35
Task ID: HumanEval/2, LOC: 12
Task ID: HumanEval/3, LOC: 10
Task ID: HumanEval/4, LOC: 20
Task ID: HumanEval/5, LOC: 16
Task ID: HumanEval/6, LOC: 26
Task ID: HumanEval/7, LOC: 11
Task ID: HumanEval/8, LOC: 19
Task ID: HumanEval/9, LOC: 15
Task ID: HumanEval/10, LOC: 20
Task ID: HumanEval/11, LOC: 16
Task ID: HumanEval/12, LOC: 25
Task ID: HumanEval/13, LOC: 10
Task ID: HumanEval/14, LOC: 12
Task ID: HumanEval/15, LOC: 8
Task ID: HumanEval/16, LOC: 19
Task ID: HumanEval/17, LOC: 28
Task ID: HumanEval/18, LOC: 18
Task ID: HumanEval/19, LOC: 34
Task ID: HumanEval/20, LOC: 22
Task ID: HumanEval/21, LOC: 20
Task ID: HumanEval/22, LOC: 11
Task ID: HumanEval/23, LOC: 8
Task ID: HumanEval/24, LOC: 9
Task ID: HumanEval/25, LOC: 25
Task ID: HumanEval/26, LOC: 16
Task ID: HumanEval/27, LOC: 14
Task ID: HumanEval/28, LOC: 11
Task ID: HumanEval/29, LOC: 11
Task ID: HumanEval/30, LOC: 8
Task ID: HumanEval/31, LOC: 23
Task ID: HumanEval/32,