# 2. Performance Evaluation

Evaluating the LLM performance on the generated results against human expert understanding.

This code compares the LLM-generated results (JSON file format) against the human expert labelled ground truth (Also JSON format).

- Author:
    - [Chen Lequn](https://www.linkedin.com/in/lequn-chen-433040159)
    - [Muhammad Tayyab Khan](https://www.linkedin.com/in/mtayyabkhan/)


#### Run this code to calculate the performance evaluations for different models, and different experiments!

### Metrics Definitions

- **Accuracy score**: Calculate accuracy as the proportion of correctly predicted existing features (exists=True) in the ground truth.
- **Hallucination score**
- **Feature name matching score**
- **Mean Absolute Error (MAE)**: Compute the average absolute difference in the quantities of predicted and ground truth features.
- **Feature-Level Evaluation**: Assess metrics for each feature independently to understand the performance at the feature level.

In [1]:
import os
import json
import re
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

### Define paths

In [2]:
# Define paths
model_name = "Llava_v1_6_mistral_7b" #MiniCPM, GPT-4o, Claude-3.5, Claude-3, Llava_v1_6_mistral_7b
## Exp_1_Basic_Prompts
## Exp_2_Improved_Prompts
## Exp_3_Multi_View
## Exp_4a_Multi_View_Few_Shot
## Exp_4b_Multi_View_Chain_of_Thougths
## Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
Experiment = "Exp_4b_Multi_View_Chain_of_Thougths"
Results_dir = os.path.join('..', 'Results/LLM_AFR')
ground_truth_dir = os.path.join(Results_dir, 'Test_Cases_STEP_Ground_Truth')
predicted_dir = os.path.join(Results_dir, f'{Experiment}', model_name)
# Define file path for the combined CSV file
combined_csv_file = os.path.join(Results_dir, f'evaluation_results_{model_name}_{Experiment}.csv')

# Specify the path where you want to save the CSV file
csv_file_path = os.path.join(Results_dir, f'feature_analysis_{model_name}_{Experiment}.csv')

### Import Feature Name List

In [3]:
# Define the relative path to the generated_images folder
JSON_FILES_PATH = os.path.join('..', 'JSON_Files')
# File names
feature_list_file = 'Manufacturing_Feature_List_Selected.json'

# Full paths
feature_list_path = os.path.join(JSON_FILES_PATH, feature_list_file)

# Function to read JSON file
def read_json_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Read the JSON files
manufacturing_features = read_json_file(feature_list_path)

# Print to verify
# print(json.dumps(manufacturing_features, indent=2))

In [4]:
# Function to extract hierarchy
def extract_hierarchy(dictionary, current_path=""):
    result = []
    for key, value in dictionary.items():
        new_path = f"{current_path} > {key}" if current_path else key
        if isinstance(value, dict):
            result.extend(extract_hierarchy(value, new_path))
        elif isinstance(value, list) and not value:
            result.append({
                # "index": len(result) + 1,
                "feature_name": key,
                "hierarchy": new_path
            })
        else:
            for item in value:
                result.append({
                    # "index": len(result) + 1,
                    "feature_name": item,
                    "hierarchy": new_path
                })
    return result

# Function to create name hierarchy dictionaries
def create_name_hierarchy_dicts(manufacturing_features):
    feature_list = extract_hierarchy(manufacturing_features["Manufacturing Features"])
    return feature_list

# Generate name hierarchy dictionaries
manufacturing_features_names = create_name_hierarchy_dicts(manufacturing_features)

# # Print to verify
print ("manufacturing_features_names \n\n")
print(json.dumps(manufacturing_features_names, indent=2))

manufacturing_features_names 


[
  {
    "feature_name": "Hole (Through / Blind Hole)",
    "hierarchy": "Machining Features > Hole (Through / Blind Hole)"
  },
  {
    "feature_name": "Slot (Through / Blind / T-Slot / Dovetail)",
    "hierarchy": "Machining Features > Slot (Through / Blind / T-Slot / Dovetail)"
  },
  {
    "feature_name": "Step (Through / Blind Step)",
    "hierarchy": "Machining Features > Step (Through / Blind Step)"
  },
  {
    "feature_name": "Pocket (Blind / Through / Circular End Pocket)",
    "hierarchy": "Machining Features > Pocket (Blind / Through / Circular End Pocket)"
  },
  {
    "feature_name": "Chamfer / Bevel (Sharp Edge)",
    "hierarchy": "Machining Features > Edges & Contours > Chamfer / Bevel (Sharp Edge)"
  },
  {
    "feature_name": "Fillet / Round (Concave / Convex)",
    "hierarchy": "Machining Features > Edges & Contours > Fillet / Round (Concave / Convex)"
  },
  {
    "feature_name": "Threaded Feature",
    "hierarchy": "Machining Featur

### Function to convert simplifed results into full feature name list

In [5]:
# Function to load JSON file
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)
        
def complete_features_list(manufacturing_features_names, simplified_features):
    identified_features = simplified_features.get("identified_features", [])
    
    output_features = []
    
    for full_feature in manufacturing_features_names:
        feature_name = full_feature["feature_name"]
        
        found = False
        for identified_feature in identified_features:
            if identified_feature["feature_name"] == feature_name:
                found = True
                exists = identified_feature["exists"]
                quantity = identified_feature["quantity"]
                break
        
        if found:
            output_features.append({
                "feature_name": feature_name,
                "exists": exists,
                "quantity": quantity
            })
        else:
            output_features.append({
                "feature_name": feature_name,
                "exists": False,
                "quantity": 0
            })
    
    return {"identified_features": output_features}


# Function to complete the features list for ground truth files
def complete_ground_truth_features_list(manufacturing_features_names, ground_truth):
    identified_features = ground_truth.get("identified_features", [])
    
    output_features = []
    
    for full_feature in manufacturing_features_names:
        feature_name = full_feature["feature_name"]
        
        found = False
        quantity = 0
        
        for identified_feature in identified_features:
            if identified_feature["feature_name"] == feature_name:
                found = True
                quantity = identified_feature["quantity"]
                break
        
        if found:
            output_features.append({
                "feature_name": feature_name,
                "exists": True,
                "quantity": quantity
            })
        else:
            output_features.append({
                "feature_name": feature_name,
                "exists": False,
                "quantity": 0
            })
    
    return {"identified_features": output_features}

In [6]:
# Example usage, to test its effectiveness
simplified_features = {
    "identified_features": [
        {
            "feature_name": "Slot (Through / Blind / T-Slot / Dovetail)",
            "exists": True,
            "quantity": 2
        },
        {
            "feature_name": "Fillet / Round (Concave / Convex)",
            "exists": True,
            "quantity": 1
        }
    ]
}

simplified_ground_truth = {
  "identified_features": [
    {
      "feature_name": "Sheet Metal Features",
      "quantity": 1
    },
    {
      "feature_name": "Hole (Through / Blind Hole)",
      "quantity": 10
    }
  ]
}

# Call the function and retrieve the result
# result = complete_features_list(manufacturing_features_names, simplified_features)
# print ("simplified example list")
# print(json.dumps(result, indent=2))
print("-----------------")
print ("simplified_ground_truth list")
result_ground_truth = complete_ground_truth_features_list(manufacturing_features_names, simplified_ground_truth)
print(json.dumps(result_ground_truth, indent=2))

-----------------
simplified_ground_truth list
{
  "identified_features": [
    {
      "feature_name": "Hole (Through / Blind Hole)",
      "exists": true,
      "quantity": 10
    },
    {
      "feature_name": "Slot (Through / Blind / T-Slot / Dovetail)",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Step (Through / Blind Step)",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Pocket (Blind / Through / Circular End Pocket)",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Chamfer / Bevel (Sharp Edge)",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Fillet / Round (Concave / Convex)",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Threaded Feature",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Gear Teeth",
      "exists": false,
      "quantity": 0
    },
    {
      "feature_name": "Neck",
   

## Part Level Evaluation

### Step 1: Load Ground Truth and Prediction Files

In [7]:
# Function to load JSON file
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

In [10]:
# Function to iterate through CAD files and evaluate performance
def load_results_and_compare(ground_truth_dir, predicted_dir, manufacturing_features_names):
    # List to store evaluation results for all CAD files
    evaluation_results = []

    # Iterate over all ground truth files
    for gt_file in os.listdir(ground_truth_dir):
        # Extract part name from ground truth file
        part_name = gt_file.replace('.json', '')
        gt_file_path = os.path.join(ground_truth_dir, gt_file)

        # Construct predicted file name
        predicted_file = f"{model_name}_{part_name}_features.json"
        predicted_file_path = os.path.join(predicted_dir, predicted_file)

        # Check if predicted file exists
        # if os.path.exists(predicted_file_path):
        # Load ground truth and predicted JSON files
        ground_truth = load_json(gt_file_path)
        predicted = load_json(predicted_file_path)

        # Apply complete_features_list function
        ground_truth_full = complete_ground_truth_features_list(manufacturing_features_names, ground_truth)
        predicted_full = complete_features_list(manufacturing_features_names, predicted)
        
        # Store the results for individual CAD file comparison
        evaluation_results.append({
            'part_name': part_name,  # Using ground truth file name for identification
            'ground_truth': ground_truth_full,
            'predicted': predicted_full
        })

    return evaluation_results


CAD_parts_results = load_results_and_compare(ground_truth_dir, predicted_dir, manufacturing_features_names)
print(json.dumps(CAD_parts_results, indent=2))

[
  {
    "part_name": "easy_24",
    "ground_truth": {
      "identified_features": [
        {
          "feature_name": "Hole (Through / Blind Hole)",
          "exists": true,
          "quantity": 6
        },
        {
          "feature_name": "Slot (Through / Blind / T-Slot / Dovetail)",
          "exists": false,
          "quantity": 0
        },
        {
          "feature_name": "Step (Through / Blind Step)",
          "exists": false,
          "quantity": 0
        },
        {
          "feature_name": "Pocket (Blind / Through / Circular End Pocket)",
          "exists": false,
          "quantity": 0
        },
        {
          "feature_name": "Chamfer / Bevel (Sharp Edge)",
          "exists": false,
          "quantity": 0
        },
        {
          "feature_name": "Fillet / Round (Concave / Convex)",
          "exists": false,
          "quantity": 0
        },
        {
          "feature_name": "Threaded Feature",
          "exists": false,
          "quant

#### Convert into pandas dataframe for easy analysis

In [11]:
# Function to flatten the data
def flatten_CAD_parts_results(CAD_parts_results):
    flattened_data = []

    for part in CAD_parts_results:
        part_name = part['part_name']
        for gt_feature, pred_feature in zip(part['ground_truth']['identified_features'], part['predicted']['identified_features']):
            flattened_data.append({
                'part_name': part_name,
                'feature_name': gt_feature['feature_name'],
                'gt_exists': gt_feature['exists'],
                'gt_quantity': gt_feature['quantity'],
                'pred_exists': pred_feature['exists'],
                'pred_quantity': pred_feature['quantity']
            })

    return flattened_data

In [12]:
# Flatten the data
flattened_data = flatten_CAD_parts_results(CAD_parts_results)

# Create DataFrame
df_results = pd.DataFrame(flattened_data)

# Display the DataFrame
df_results

Unnamed: 0,part_name,feature_name,gt_exists,gt_quantity,pred_exists,pred_quantity
0,easy_24,Hole (Through / Blind Hole),True,6,True,2
1,easy_24,Slot (Through / Blind / T-Slot / Dovetail),False,0,False,0
2,easy_24,Step (Through / Blind Step),False,0,False,0
3,easy_24,Pocket (Blind / Through / Circular End Pocket),False,0,False,0
4,easy_24,Chamfer / Bevel (Sharp Edge),False,0,True,1
...,...,...,...,...,...,...
1595,easy_3,"Freeform Features (Depression, Protrusion)",False,0,False,0
1596,easy_3,Rib,False,0,False,0
1597,easy_3,Gusset,False,0,False,0
1598,easy_3,Draft,False,0,False,0


In [13]:
df_results.to_csv(os.path.join(Results_dir, "prediction_vs_ground_truth_results_all.csv"))

### Step 2: Define Evaluation Metrics

define functions to calculate evaluation metrics such as accuracy, precision, recall, and F1-score based on the comparisons between ground truth and predictions.

In [16]:
def calculate_accuracy_and_hallucination(evaluation_results):
    detailed_results = []

    total_true_positive_quantity = 0
    total_ground_truth_quantity = 0
    total_hallucinated_quantity = 0
    total_predicted_quantity = 0
    total_correctly_identified_names = 0
    total_ground_truth_names = 0
    total_absolute_error = 0
    num_files = len(evaluation_results)
    feature_count = 1

    for result in evaluation_results:
        part_name = result['part_name']
        ground_truth_features = result['ground_truth']['identified_features']
        predicted_features = result['predicted']['identified_features']

        true_positive_quantity = 0
        hallucinated_quantity = 0
        ground_truth_quantity = 0
        predicted_quantity = 0
        correctly_identified_names = 0
        absolute_error = 0
        ground_truth_names_set = set()

        for gt_feature, pred_feature in zip(ground_truth_features, predicted_features):
            # Calculate absolute error for all features
            absolute_error += abs(gt_feature['quantity'] - pred_feature['quantity'])
            # Track ground truth feature names
            if gt_feature['exists']:
                ground_truth_quantity += gt_feature['quantity']
                ground_truth_names_set.add(gt_feature['feature_name'])
                if pred_feature['exists']:
                    true_positive_quantity += min(gt_feature['quantity'], pred_feature['quantity'])
            # Track predicted feature names and true positive quantity
            if pred_feature['exists']:
                predicted_quantity += pred_feature['quantity']
                if gt_feature['exists']:
                    if gt_feature['feature_name'] == pred_feature['feature_name']:
                        correctly_identified_names += 1
                else:
                    hallucinated_quantity += pred_feature['quantity']

        accuracy = true_positive_quantity / ground_truth_quantity if ground_truth_quantity > 0 else 0
        hallucination = hallucinated_quantity / predicted_quantity if predicted_quantity > 0 else 0
        feature_name_accuracy = correctly_identified_names / len(ground_truth_names_set) if len(ground_truth_names_set) > 0 else 0

        detailed_results.append({
            'part_name': part_name,
            'accuracy': accuracy * 100,
            'hallucination_rate': hallucination * 100,
            'feature_name_matching_rate': feature_name_accuracy * 100,
            'mae': absolute_error/17,
            'quantity_absolute_error': absolute_error,
            'true_positive_quantity': true_positive_quantity,
            'ground_truth_quantity': ground_truth_quantity,
            'hallucinated_quantity': hallucinated_quantity,
            'predicted_quantity': predicted_quantity,
            'correctly_identified_names': correctly_identified_names,
            'ground_truth_names_count': len(ground_truth_names_set)
        })

        total_true_positive_quantity += true_positive_quantity
        total_ground_truth_quantity += ground_truth_quantity
        total_hallucinated_quantity += hallucinated_quantity
        total_predicted_quantity += predicted_quantity
        total_correctly_identified_names += correctly_identified_names
        total_ground_truth_names += len(ground_truth_names_set)
        total_absolute_error += absolute_error

    total_features = num_files * feature_count
    overall_accuracy = total_true_positive_quantity / total_ground_truth_quantity if total_ground_truth_quantity > 0 else 0
    overall_hallucination = total_hallucinated_quantity / total_predicted_quantity if total_predicted_quantity > 0 else 0
    overall_feature_name_accuracy = total_correctly_identified_names / total_ground_truth_names if total_ground_truth_names > 0 else 0
    overall_mae = total_absolute_error / total_features

    # Display overall results
    overall_results = {
        'overall_accuracy': overall_accuracy * 100,
        'overall_feature_name_matching_rate': overall_feature_name_accuracy * 100,
        'overall_hallucination_rate': overall_hallucination * 100,
        'overall_mae': overall_mae,
        "total_true_positive_quantity": total_true_positive_quantity,
        "total_ground_truth_quantity": total_ground_truth_quantity,
        "total_hallucinated_quantity": total_hallucinated_quantity,
        "total_predicted_quantity": total_predicted_quantity,
        "total_absolute_error": total_absolute_error,
        "total_correctly_identified_names": total_correctly_identified_names,
        "total_ground_truth_names": total_ground_truth_names,
        "total_features": total_features
    }

    detailed_results_df = pd.DataFrame(detailed_results)

    # Define a function to split the part_name into a tuple of (category, index)
    def sort_key(part_name):
        match = re.match(r'(\D+)(\d+)', part_name)
        if match:
            category, index = match.groups()
            return (category, int(index))
        return part_name

    # Sort the DataFrame using the defined sort key
    detailed_results_df['sort_key'] = detailed_results_df['part_name'].apply(sort_key)
    detailed_results_df = detailed_results_df.sort_values(by='sort_key').drop(columns='sort_key')

    return detailed_results_df, overall_results

In [17]:
# Example usage:
# Assume evaluation_results has been defined as per previous code
detailed_results_df, overall_results = calculate_accuracy_and_hallucination(CAD_parts_results)
overall_results

{'overall_accuracy': 27.114427860696516,
 'overall_feature_name_matching_rate': 60.273972602739725,
 'overall_hallucination_rate': 55.919854280510016,
 'overall_mae': 9.17,
 'total_true_positive_quantity': 218,
 'total_ground_truth_quantity': 804,
 'total_hallucinated_quantity': 307,
 'total_predicted_quantity': 549,
 'total_absolute_error': 917,
 'total_correctly_identified_names': 176,
 'total_ground_truth_names': 292,
 'total_features': 100}

In [18]:
detailed_results_df

Unnamed: 0,part_name,accuracy,hallucination_rate,feature_name_matching_rate,mae,quantity_absolute_error,true_positive_quantity,ground_truth_quantity,hallucinated_quantity,predicted_quantity,correctly_identified_names,ground_truth_names_count
98,easy_1,40.000000,25.000000,100.000000,0.294118,5,2,5,1,4,2,2
49,easy_2,60.000000,0.000000,75.000000,0.176471,3,3,5,0,4,3,4
99,easy_3,100.000000,50.000000,100.000000,0.117647,2,2,2,2,4,2,2
66,easy_4,50.000000,50.000000,100.000000,0.352941,6,3,6,3,6,2,2
5,easy_5,50.000000,75.000000,50.000000,0.235294,4,1,2,3,4,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
52,medium_29,80.000000,75.000000,100.000000,0.764706,13,4,5,12,16,3,3
61,medium_30,0.000000,100.000000,0.000000,0.352941,6,0,3,3,3,0,2
58,medium_31,0.000000,100.000000,0.000000,0.294118,5,0,3,2,2,0,2
63,medium_32,28.571429,50.000000,33.333333,0.411765,7,2,7,2,4,1,3


### Subset Performance Evaluation (easy, medium, hard)
To evaluate performance across subsets (easy, medium, hard):

In [19]:
def combine_and_calculate_category_stats(detailed_results_df, overall_results):
    # Extract the category from the part_name and calculate overall accuracy and hallucination for each category
    detailed_results_df['category'] = detailed_results_df['part_name'].apply(lambda x: x.split('_')[0])

    # Calculate the number of files in each category
    category_counts = detailed_results_df['category'].value_counts().to_dict()

    category_stats = detailed_results_df.groupby('category').agg({
        'true_positive_quantity': 'sum',
        'ground_truth_quantity': 'sum',
        'hallucinated_quantity': 'sum',
        'predicted_quantity': 'sum',
        'quantity_absolute_error': 'sum',
        'correctly_identified_names': 'sum',
        'ground_truth_names_count': 'sum'
    }).reset_index()


    category_stats['accuracy'] = category_stats['true_positive_quantity'] / category_stats['ground_truth_quantity'] * 100
    category_stats['feature_name_matching_rate'] = category_stats['correctly_identified_names'] / category_stats['ground_truth_names_count'] * 100
    category_stats['hallucination_rate'] = category_stats['hallucinated_quantity'] / category_stats['predicted_quantity'] * 100
    category_stats['mae'] = category_stats['quantity_absolute_error'] / (category_stats['ground_truth_names_count'] )

    # Rename the columns to make them consistent
    category_stats = category_stats.rename(columns={
        'quantity_absolute_error': 'total_absolute_error',
        'correctly_identified_names': 'total_correctly_identified_names',
        'ground_truth_names_count': 'total_ground_truth_names'
    })

    # Add a row for overall evaluation results
    overall_stats = pd.DataFrame({
        'category': ['overall'],
        'true_positive_quantity': [overall_results['total_true_positive_quantity']],
        'ground_truth_quantity': [overall_results['total_ground_truth_quantity']],
        'hallucinated_quantity': [overall_results['total_hallucinated_quantity']],
        'predicted_quantity': [overall_results['total_predicted_quantity']],
        'accuracy': [overall_results['overall_accuracy']],
        'hallucination_rate': [overall_results['overall_hallucination_rate']],
        'mae': [overall_results['overall_mae']],
        'feature_name_matching_rate': [overall_results['overall_feature_name_matching_rate']],
        "total_ground_truth_names": [overall_results['total_ground_truth_names']],
        "total_absolute_error": [overall_results['total_absolute_error']],
        "total_correctly_identified_names": [overall_results['total_correctly_identified_names']]
    })

    # Append overall stats to category stats
    category_stats_with_overall = pd.concat([category_stats, overall_stats], ignore_index=True)

    return category_stats_with_overall

In [20]:
category_stats_with_overall = combine_and_calculate_category_stats(detailed_results_df, overall_results)

category_stats_with_overall

Unnamed: 0,category,true_positive_quantity,ground_truth_quantity,hallucinated_quantity,predicted_quantity,total_absolute_error,total_correctly_identified_names,total_ground_truth_names,accuracy,feature_name_matching_rate,hallucination_rate,mae
0,easy,43,115,73,129,158,36,67,37.391304,53.731343,56.589147,2.358209
1,hard,89,448,148,241,511,71,121,19.866071,58.677686,61.410788,4.22314
2,medium,86,241,86,179,248,69,104,35.684647,66.346154,48.044693,2.384615
3,overall,218,804,307,549,917,176,292,27.114428,60.273973,55.919854,9.17


## Feature-level metrics calculation

In [21]:
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_feature_metrics(evaluation_results):
    # Initialize a dictionary to store feature-level metrics
    feature_metrics = defaultdict(lambda: {
        'correctly_identified': 0,
        'total_ground_truth': 0,
        'hallucinated': 0,
        'total_predicted': 0
    })
    
    for result in evaluation_results:
        ground_truth = {feature['feature_name']: feature for feature in result['ground_truth']['identified_features']}
        predicted = {feature['feature_name']: feature for feature in result['predicted']['identified_features']}
        
        # Iterate over each feature in the predefined list of features
        for feature_name in ground_truth.keys():
            truth_feature = ground_truth[feature_name]
            predicted_feature = predicted[feature_name]

            if truth_feature['exists']:
                feature_metrics[feature_name]['total_ground_truth'] += truth_feature['quantity']
                feature_metrics[feature_name]['correctly_identified'] += min(truth_feature['quantity'], predicted_feature['quantity'])

            if predicted_feature['exists']:
                feature_metrics[feature_name]['total_predicted'] += predicted_feature['quantity']
                if not truth_feature['exists']:
                    feature_metrics[feature_name]['hallucinated'] += predicted_feature['quantity']

    return feature_metrics


def analyze_feature_metrics(feature_metrics):
    analysis_results = {}
    
    for feature_name, metrics in feature_metrics.items():
        total_ground_truth = metrics['total_ground_truth']
        correctly_identified = metrics['correctly_identified']
        total_predicted = metrics['total_predicted']
        hallucinated = metrics['hallucinated']
        
        accuracy = correctly_identified / total_ground_truth if total_ground_truth > 0 else 0.0
        hallucination_rate = hallucinated / total_predicted if total_predicted > 0 else 0.0
        
        analysis_results[feature_name] = {
            'accuracy': accuracy,
            'hallucination_rate': hallucination_rate
        }
    
    return analysis_results

In [22]:
CAD_parts_results

[{'part_name': 'easy_24',
  'ground_truth': {'identified_features': [{'feature_name': 'Hole (Through / Blind Hole)',
     'exists': True,
     'quantity': 6},
    {'feature_name': 'Slot (Through / Blind / T-Slot / Dovetail)',
     'exists': False,
     'quantity': 0},
    {'feature_name': 'Step (Through / Blind Step)',
     'exists': False,
     'quantity': 0},
    {'feature_name': 'Pocket (Blind / Through / Circular End Pocket)',
     'exists': False,
     'quantity': 0},
    {'feature_name': 'Chamfer / Bevel (Sharp Edge)',
     'exists': False,
     'quantity': 0},
    {'feature_name': 'Fillet / Round (Concave / Convex)',
     'exists': False,
     'quantity': 0},
    {'feature_name': 'Threaded Feature', 'exists': False, 'quantity': 0},
    {'feature_name': 'Gear Teeth', 'exists': False, 'quantity': 0},
    {'feature_name': 'Neck', 'exists': False, 'quantity': 0},
    {'feature_name': 'Pipe / Tube', 'exists': False, 'quantity': 0},
    {'feature_name': 'Boss (Circular / Obround / Irr

In [23]:
# Step 1: Calculate feature-level metrics
feature_metrics = calculate_feature_metrics(CAD_parts_results)

# Step 2: Analyze feature-level metrics
feature_analysis = analyze_feature_metrics(feature_metrics)

In [24]:
feature_analysis

{'Hole (Through / Blind Hole)': {'accuracy': 0.25355450236966826,
  'hallucination_rate': 0.11347517730496454},
 'Chamfer / Bevel (Sharp Edge)': {'accuracy': 0.5,
  'hallucination_rate': 0.9090909090909091},
 'Slot (Through / Blind / T-Slot / Dovetail)': {'accuracy': 0.2823529411764706,
  'hallucination_rate': 0.52},
 'Step (Through / Blind Step)': {'accuracy': 0.20833333333333334,
  'hallucination_rate': 0.375},
 'Pocket (Blind / Through / Circular End Pocket)': {'accuracy': 0.38,
  'hallucination_rate': 0.6545454545454545},
 'Fillet / Round (Concave / Convex)': {'accuracy': 0.3644859813084112,
  'hallucination_rate': 0.5411764705882353},
 'Pipe / Tube': {'accuracy': 0.23076923076923078,
  'hallucination_rate': 0.7272727272727273},
 'Boss (Circular / Obround / Irregular / Rectangular, etc)': {'accuracy': 0.22,
  'hallucination_rate': 0.5853658536585366},
 'Threaded Feature': {'accuracy': 0.2222222222222222,
  'hallucination_rate': 0.875},
 'Gear Teeth': {'accuracy': 0.0, 'hallucinatio

In [25]:
# Print or further process the analysis results
for feature_name, metrics in feature_analysis.items():
    print(f"Feature: {feature_name}")
    print(f"Accuracy: {metrics['accuracy'] * 100:.2f}%")
    print(f"Hallucination Rate: {metrics['hallucination_rate'] * 100:.2f}%")
    print("-" * 30)

Feature: Hole (Through / Blind Hole)
Accuracy: 25.36%
Hallucination Rate: 11.35%
------------------------------
Feature: Chamfer / Bevel (Sharp Edge)
Accuracy: 50.00%
Hallucination Rate: 90.91%
------------------------------
Feature: Slot (Through / Blind / T-Slot / Dovetail)
Accuracy: 28.24%
Hallucination Rate: 52.00%
------------------------------
Feature: Step (Through / Blind Step)
Accuracy: 20.83%
Hallucination Rate: 37.50%
------------------------------
Feature: Pocket (Blind / Through / Circular End Pocket)
Accuracy: 38.00%
Hallucination Rate: 65.45%
------------------------------
Feature: Fillet / Round (Concave / Convex)
Accuracy: 36.45%
Hallucination Rate: 54.12%
------------------------------
Feature: Pipe / Tube
Accuracy: 23.08%
Hallucination Rate: 72.73%
------------------------------
Feature: Boss (Circular / Obround / Irregular / Rectangular, etc)
Accuracy: 22.00%
Hallucination Rate: 58.54%
------------------------------
Feature: Threaded Feature
Accuracy: 22.22%
Halluci

In [26]:
# Convert feature_analysis to a list of dictionaries suitable for DataFrame
data_rows = [{'Feature': feature_name,
              'Accuracy': metrics['accuracy'],
              'Hallucination Rate': metrics['hallucination_rate']} for feature_name, metrics in feature_analysis.items()]

# Create a DataFrame from the list of dictionaries
df_feature_analysis = pd.DataFrame(data_rows)

In [27]:
df_feature_analysis

Unnamed: 0,Feature,Accuracy,Hallucination Rate
0,Hole (Through / Blind Hole),0.253555,0.113475
1,Chamfer / Bevel (Sharp Edge),0.5,0.909091
2,Slot (Through / Blind / T-Slot / Dovetail),0.282353,0.52
3,Step (Through / Blind Step),0.208333,0.375
4,Pocket (Blind / Through / Circular End Pocket),0.38,0.654545
5,Fillet / Round (Concave / Convex),0.364486,0.541176
6,Pipe / Tube,0.230769,0.727273
7,Boss (Circular / Obround / Irregular / Rectang...,0.22,0.585366
8,Threaded Feature,0.222222,0.875
9,Gear Teeth,0.0,1.0


In [28]:
def split_by_category(evaluation_results):
    easy_results = [result for result in evaluation_results if result['part_name'].startswith('easy')]
    medium_results = [result for result in evaluation_results if result['part_name'].startswith('medium')]
    hard_results = [result for result in evaluation_results if result['part_name'].startswith('hard')]
    
    return easy_results, medium_results, hard_results

def calculate_and_analyze_all_metrics(evaluation_results):
    easy_results, medium_results, hard_results = split_by_category(evaluation_results)

    # Calculate feature metrics for each category and overall
    overall_metrics = calculate_feature_metrics(evaluation_results)
    easy_metrics = calculate_feature_metrics(easy_results)
    medium_metrics = calculate_feature_metrics(medium_results)
    hard_metrics = calculate_feature_metrics(hard_results)

    # Analyze feature metrics for each category and overall
    overall_analysis = analyze_feature_metrics(overall_metrics)
    easy_analysis = analyze_feature_metrics(easy_metrics)
    medium_analysis = analyze_feature_metrics(medium_metrics)
    hard_analysis = analyze_feature_metrics(hard_metrics)

    # Combine all analyses into a DataFrame
    feature_names = set(overall_analysis.keys()).union(easy_analysis.keys(), medium_analysis.keys(), hard_analysis.keys())
    data_rows = []

    for feature_name in feature_names:
        row = {
            'Feature name': feature_name,
            'Overall Accuracy': overall_analysis.get(feature_name, {}).get('accuracy', 0),
            'Overall Hallucination Rate': overall_analysis.get(feature_name, {}).get('hallucination_rate', 0),
            'Easy Accuracy': easy_analysis.get(feature_name, {}).get('accuracy', 0),
            'Easy Hallucination Rate': easy_analysis.get(feature_name, {}).get('hallucination_rate', 0),
            'Medium Accuracy': medium_analysis.get(feature_name, {}).get('accuracy', 0),
            'Medium Hallucination Rate': medium_analysis.get(feature_name, {}).get('hallucination_rate', 0),
            'Hard Accuracy': hard_analysis.get(feature_name, {}).get('accuracy', 0),
            'Hard Hallucination Rate': hard_analysis.get(feature_name, {}).get('hallucination_rate', 0),
        }
        data_rows.append(row)

    df_feature_analysis = pd.DataFrame(data_rows)
    # Sort the DataFrame by 'Feature' in alphabetical order
    df_feature_analysis = df_feature_analysis.sort_values(by='Feature name').reset_index(drop=True)
    
    return df_feature_analysis

In [29]:
df_feature_analysis = calculate_and_analyze_all_metrics(CAD_parts_results)

df_feature_analysis

Unnamed: 0,Feature name,Overall Accuracy,Overall Hallucination Rate,Easy Accuracy,Easy Hallucination Rate,Medium Accuracy,Medium Hallucination Rate,Hard Accuracy,Hard Hallucination Rate
0,Boss (Circular / Obround / Irregular / Rectang...,0.22,0.585366,0.714286,0.47619,0.047619,0.857143,0.227273,0.615385
1,Chamfer / Bevel (Sharp Edge),0.5,0.909091,0.5,0.85,0.5,0.875,0.0,1.0
2,Draft,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,Fillet / Round (Concave / Convex),0.364486,0.541176,0.166667,0.913043,0.5,0.481481,0.343284,0.342857
4,"Freeform Features (Depression, Protrusion)",0.166667,0.9,0.0,0.0,0.0,1.0,0.2,0.875
5,Gear Teeth,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
6,Gusset,0.083333,0.9,0.0,0.0,0.0,1.0,0.090909,0.875
7,Hole (Through / Blind Hole),0.253555,0.113475,0.456522,0.263158,0.352941,0.109091,0.171206,0.0
8,Neck,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
9,Pipe / Tube,0.230769,0.727273,0.0,0.0,0.0,1.0,0.3,0.625


# Consolidate results for All Model And Experiments

In [31]:
import os
import pandas as pd
import json

# Define the list of models and experiments
models = ["GPT-4o", "Claude-3.5", "Claude-3", "MiniCPM", "MiniCPM-V2.6", "Llava_v1_6_mistral_7b"] # "", "Llava_v1_6_mistral_7b", ,
experiments_all = [
    "Exp_1_Basic_Prompts",
    "Exp_2_Improved_Prompts",
    "Exp_3_Multi_View",
    "Exp_4a_Multi_View_Few_Shot",
    "Exp_4b_Multi_View_Chain_of_Thougths",
    "Exp_5_Multi_View_Few_Shot_Chain_of_thoughts"
]

# Define the base directory for results
Results_dir = os.path.join('..', 'Results')
CSV_Results_dir = os.path.join(Results_dir, "LLM_AFR", "CSV_results")
detailed_results_df.to_csv(os.path.join(CSV_Results_dir, "part_level_statistical_results.csv"))
ground_truth_dir = os.path.join(Results_dir, "LLM_AFR", 'Test_Cases_STEP_Ground_Truth')

In [32]:
# Function to perform the evaluations for each model and experiment
def evaluate_model_and_experiment(model_name, experiment, feature_analysis_dfs, category_stats_dfs):
    predicted_dir = os.path.join(Results_dir, f'LLM_AFR/{experiment}', model_name)
    
    # Load evaluation results
    evaluation_results = load_results_and_compare(ground_truth_dir, predicted_dir, manufacturing_features_names)
    
    # Calculate detailed results and overall metrics
    detailed_results_df, overall_results = calculate_accuracy_and_hallucination(evaluation_results)
    
    # Save detailed results to CSV
    combined_csv_file = os.path.join(CSV_Results_dir,  "CAD_level_analysis", f'CAD_evaluation_{model_name}_{experiment}.csv')
    detailed_results_df.to_csv(combined_csv_file, index=False)
    
    # Calculate feature-level metrics
    df_feature_analysis = calculate_and_analyze_all_metrics(evaluation_results)

    # Add model and experiment information to the DataFrame
    df_feature_analysis['Model'] = model_name
    df_feature_analysis['Experiment'] = experiment
    
    # Save feature-level analysis to CSV
    feature_csv_file_path = os.path.join(CSV_Results_dir,  "Feature_level_analysis", f'feature_analysis_{model_name}_{experiment}.csv')
    df_feature_analysis.to_csv(feature_csv_file_path, index=False)

    # Append to the list of feature analysis DataFrames
    feature_analysis_dfs.append(df_feature_analysis)
    
    # Calculate category stats with overall metrics
    category_stats_with_overall = combine_and_calculate_category_stats(detailed_results_df, overall_results)

    # Add model and experiment information to the DataFrame
    category_stats_with_overall['Model'] = model_name
    category_stats_with_overall['Experiment'] = experiment
    
    # Save category stats with overall to CSV
    category_stats_csv_file_path = os.path.join(CSV_Results_dir, "Overall_performance", f'category_overall_stats_{model_name}_{experiment}.csv')
    category_stats_with_overall.to_csv(category_stats_csv_file_path, index=False)

    # Append to the list of category stats DataFrames
    category_stats_dfs.append(category_stats_with_overall)


In [37]:
# Lists to store DataFrames for concatenation
feature_analysis_dfs = []
category_stats_dfs = []

# Iterate over each model and experiment
for model_name in models:
    
    for experiment in experiments_all:
        evaluate_model_and_experiment(model_name, experiment, feature_analysis_dfs, category_stats_dfs)

# Concatenate all feature analysis DataFrames
df_all_feature_analysis = pd.concat(feature_analysis_dfs, ignore_index=True)

# Concatenate all category stats DataFrames
df_all_category_stats = pd.concat(category_stats_dfs, ignore_index=True)


In [38]:
df_all_feature_analysis

Unnamed: 0,Feature name,Overall Accuracy,Overall Hallucination Rate,Easy Accuracy,Easy Hallucination Rate,Medium Accuracy,Medium Hallucination Rate,Hard Accuracy,Hard Hallucination Rate,Model,Experiment
0,Boss (Circular / Obround / Irregular / Rectang...,0.480000,0.142857,1.0,0.125000,0.190476,0.200000,0.590909,0.133333,GPT-4o,Exp_1_Basic_Prompts
1,Chamfer / Bevel (Sharp Edge),0.416667,0.444444,0.5,0.000000,0.333333,0.500000,0.000000,1.000000,GPT-4o,Exp_1_Basic_Prompts
2,Draft,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,GPT-4o,Exp_1_Basic_Prompts
3,Fillet / Round (Concave / Convex),0.570093,0.129870,1.0,0.235294,0.392857,0.142857,0.567164,0.086957,GPT-4o,Exp_1_Basic_Prompts
4,"Freeform Features (Depression, Protrusion)",0.500000,0.555556,1.0,0.000000,0.000000,1.000000,0.400000,0.666667,GPT-4o,Exp_1_Basic_Prompts
...,...,...,...,...,...,...,...,...,...,...,...
547,Rib,0.000000,1.000000,0.0,1.000000,0.000000,1.000000,0.000000,1.000000,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
548,Sheet Metal Features,0.125000,0.977778,0.0,1.000000,0.000000,1.000000,0.166667,0.944444,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
549,Slot (Through / Blind / T-Slot / Dovetail),0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
550,Step (Through / Blind Step),0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts


In [39]:
df_all_category_stats

Unnamed: 0,category,true_positive_quantity,ground_truth_quantity,hallucinated_quantity,predicted_quantity,total_absolute_error,total_correctly_identified_names,total_ground_truth_names,accuracy,feature_name_matching_rate,hallucination_rate,mae,Model,Experiment
0,easy,85,115,11,111,56,53,67,73.913043,79.104478,9.909910,0.835821,GPT-4o,Exp_1_Basic_Prompts
1,hard,199,448,48,253,303,73,121,44.419643,60.330579,18.972332,2.504132,GPT-4o,Exp_1_Basic_Prompts
2,medium,142,241,17,162,119,65,104,58.921162,62.500000,10.493827,1.144231,GPT-4o,Exp_1_Basic_Prompts
3,overall,426,804,76,526,478,191,292,52.985075,65.410959,14.448669,4.780000,GPT-4o,Exp_1_Basic_Prompts
4,easy,82,115,13,110,61,50,67,71.304348,74.626866,11.818182,0.910448,GPT-4o,Exp_2_Improved_Prompts
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,overall,218,804,307,549,917,176,292,27.114428,60.273973,55.919854,9.170000,Llava_v1_6_mistral_7b,Exp_4b_Multi_View_Chain_of_Thougths
140,easy,14,115,162,180,267,11,67,12.173913,16.417910,90.000000,3.985075,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
141,hard,35,448,174,210,588,22,121,7.812500,18.181818,82.857143,4.859504,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts
142,medium,31,241,131,171,350,23,104,12.863071,22.115385,76.608187,3.365385,Llava_v1_6_mistral_7b,Exp_5_Multi_View_Few_Shot_Chain_of_thoughts


In [40]:
# Save concatenated DataFrames to CSV for later visualization
all_feature_analysis_csv_path = os.path.join(CSV_Results_dir, 'all_feature_analysis.csv')
df_all_feature_analysis.to_csv(all_feature_analysis_csv_path, index=False)

all_category_stats_csv_path = os.path.join(CSV_Results_dir, 'all_category_stats.csv')
df_all_category_stats.to_csv(all_category_stats_csv_path, index=False)