In [1]:
import json
import csv
import matplotlib.pyplot as plt
from collections import Counter

# Load the JSON data from a file
def load_data(file_path):
    """Load and return the parsed JSON content of a file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly instead of relying on the locale's default encoding.
    # 'utf-8-sig' reads plain UTF-8 unchanged and also strips a leading BOM,
    # which json would otherwise reject.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

# Extract attributes and values from the model_response in the data
def extract_attributes_values(data):
    """Group the truthy ``model_response`` values of all products by attribute.

    Args:
        data: Iterable of product records; each record must support
            ``.get('model_response')``.

    Returns:
        dict: Maps each attribute name to the list of its truthy values, in
        the order the products were visited. Attributes that only ever had
        falsy values do not appear.
    """
    attributes_values = {}
    for product in data:
        model_response = product.get('model_response')
        # A missing key yields None, and a record may also store null or some
        # other non-dict value; none of these has attributes to iterate over,
        # so skip the record instead of failing on ``.items()``.
        if not isinstance(model_response, dict):
            continue
        for attr, value in model_response.items():
            # Truthiness filter: drops empty strings/containers and None, and
            # also 0 and False.
            if value:
                attributes_values.setdefault(attr, []).append(value)
    return attributes_values

# Helper function to process attribute values for comparison
def process_attribute_values_for_comparison(value):
    """Convert a value into a hashable, order-independent comparable form.

    Lists and tuples are converted recursively into a sorted tuple of their
    converted elements; every other value is converted with ``str``.

    Args:
        value: The attribute value to normalise.

    Returns:
        A ``str`` for non-sequence values, or a ``tuple`` (possibly nested)
        for lists and tuples.
    """
    if isinstance(value, (list, tuple)):
        normalized = [process_attribute_values_for_comparison(v) for v in value]
        try:
            return tuple(sorted(normalized))
        except TypeError:
            # A sequence mixing scalars (now str) with nested sequences (now
            # tuple) cannot be ordered with '<'. Sort by repr so such values
            # still get a deterministic, order-independent form.
            return tuple(sorted(normalized, key=repr))
    return str(value)  # Convert everything else to string

# Process all attribute values for uniform comparison
def process_all_attributes_for_comparison(attributes_values):
    """Normalise every value of every attribute for comparison.

    Args:
        attributes_values (dict): Maps attribute names to lists of raw values.

    Returns:
        dict: Same keys, each mapped to a new list holding the result of
        ``process_attribute_values_for_comparison`` for every raw value.
    """
    return {
        name: list(map(process_attribute_values_for_comparison, raw_values))
        for name, raw_values in attributes_values.items()
    }

# Count occurrences of each attribute
def count_attributes(attributes_values):
    """Return how many values were collected for each attribute.

    Args:
        attributes_values (dict): Maps attribute names to sized collections
            of values.

    Returns:
        dict: Maps each attribute name to the length of its collection.
    """
    tally = {}
    for name, collected in attributes_values.items():
        tally[name] = len(collected)
    return tally

# Calculate the percentage of each attribute within each category
def calculate_categorywise_percentages(attribute_counts_list, category_names):
    """Compute each attribute's share of its category's total count.

    Args:
        attribute_counts_list (list of dicts): One dict per category, mapping
            attribute names to their counts in that category.
        category_names (list): Category labels, looked up by the position of
            the matching dict in ``attribute_counts_list``.

    Returns:
        dict: Maps each category label to a dict of
        ``{attribute: {'Count': count, 'Percentage': percentage}}``.
        The percentage is 0 when the category's total is not positive.
    """
    def _share(part, whole):
        # Guard against dividing by a zero (or negative) category total.
        if whole > 0:
            return (part / whole) * 100
        return 0

    breakdown = {}
    for position, per_attr_counts in enumerate(attribute_counts_list):
        grand_total = sum(per_attr_counts.values())
        breakdown[category_names[position]] = {
            name: {'Count': n, 'Percentage': _share(n, grand_total)}
            for name, n in per_attr_counts.items()
        }
    return breakdown

# Export the results to a CSV file
def export_results_to_csv(result, filename="categorywise_results.csv"):
    """Write the category-wise counts and percentages to a CSV file.

    One header row is written, followed by one row per (category, attribute)
    pair. Percentages are formatted with two decimal places. A confirmation
    message is printed once the file has been written.

    Args:
        result (dict): Maps category -> attribute -> dict with 'Count' and
            'Percentage' entries.
        filename (str): Path of the CSV file to create or overwrite.
    """
    header = ["Category", "Attribute", "Count", "Percentage"]

    # newline='' lets the csv module control line endings; UTF-8 keeps
    # non-ASCII attribute names intact.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(
            [cat, name, stats['Count'], f"{stats['Percentage']:.2f}"]
            for cat, attrs in result.items()
            for name, stats in attrs.items()
        )

    print(f"Results have been exported to {filename}")
# Main method to process multiple files and generate comparison
def analyze_product_data(files, category_names):
    """Analyse attribute frequencies across several result files.

    Each file is loaded and its attribute values are extracted; the values
    are counted per attribute and turned into per-category percentages. The
    breakdown is printed, exported to CSV under the exporter's default
    filename, and returned.

    Args:
        files: Paths of the JSON result files, one per category.
        category_names: Category labels, in the same order as ``files``.

    Returns:
        dict: The category-wise counts and percentages.
    """
    # Load every file and collect its attribute values first.
    extracted_per_file = [extract_attributes_values(load_data(path)) for path in files]

    # Then reduce each collection to per-attribute counts.
    counts_per_file = list(map(count_attributes, extracted_per_file))

    categorywise_data = calculate_categorywise_percentages(counts_per_file, category_names)

    # Report the breakdown on stdout.
    print("\nAttribute Counts and Percentages by Category:")
    for category, attributes in categorywise_data.items():
        print(f"{category}:")
        for attr, data in attributes.items():
            print(f"  {attr}: Count = {data['Count']}, Percentage = {data['Percentage']:.2f}%")

    # Persist the same breakdown as CSV.
    export_results_to_csv(categorywise_data)

    return categorywise_data


# Each category label is declared next to its results file so the two
# sequences handed to the analysis cannot drift out of step.
category_files = {
    'grocery': 'results1_gro_chatgpt4omini.json',
    'home': 'results1_hom_chatgpt4omini.json',
    'jewelry': 'results1_jew_chatgpt4omini.json',
    'office': 'results1_off_chatgpt4omini.json',
}

files = list(category_files.values())
category_names = list(category_files)

# Run the full analysis: prints the breakdown, writes the CSV, keeps the data.
result = analyze_product_data(files, category_names)



Attribute Counts and Percentages by Category:
grocery:
  product_title: Count = 3, Percentage = 15.79%
  attributes: Count = 2, Percentage = 10.53%
  brand: Count = 1, Percentage = 5.26%
  product_code: Count = 1, Percentage = 5.26%
  pack_size: Count = 1, Percentage = 5.26%
  carton_quantity: Count = 1, Percentage = 5.26%
  category: Count = 1, Percentage = 5.26%
  gluten_free: Count = 1, Percentage = 5.26%
  food_allergen_info: Count = 1, Percentage = 5.26%
  fiber_content: Count = 1, Percentage = 5.26%
  whole_grain_content: Count = 1, Percentage = 5.26%
  sodium_content: Count = 1, Percentage = 5.26%
  cookie_type: Count = 1, Percentage = 5.26%
  chocolate_type: Count = 1, Percentage = 5.26%
  recommended_by: Count = 1, Percentage = 5.26%
  smart_snacks_approved: Count = 1, Percentage = 5.26%
home:
  product_title: Count = 8, Percentage = 15.69%
  attributes: Count = 4, Percentage = 7.84%
  categories: Count = 1, Percentage = 1.96%
  brand: Count = 4, Percentage = 7.84%
  type: Co