In [None]:
'''
@Time    :   03/16/2025 13:58:00
@Author  :   Ran Zhang 
@Email   :   ran0925@bu.edu / ran.zhang.77@hotmail.com
'''

## Preparation

In [1]:
!pip install openai gradio pandas networkx matplotlib



In [2]:
import base64
import json
import os
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import openai
import pandas as pd
from tqdm.autonotebook import tqdm as notebook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the API key from the environment so the secret is never stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here with a clear message instead of on the first API request.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
client = openai  # Using the openai module as our client
chatgpt_model = "gpt-4o-mini"  # model name used for the chat completion requests

## AI Agent

### Step 1: initial insight

In [4]:
    #   "Assume the user is a U.S.-based small business owner with at most some college education. "
    #     "Use plain, easy-to-understand English and avoid technical jargon."

def get_chatgpt_insight_from_dataurl(data_url):
    """Ask the chat model what the chart behind ``data_url`` is about.

    Args:
        data_url: URL of the chart image; it is forwarded unchanged as the
            ``image_url`` part of the user message.

    Returns:
        The model's reply with surrounding whitespace stripped, or the string
        ``"Error: <exception>"`` if the request or the reply handling raises.
    """
    question_part = {"type": "text", "text": "what is this chart about? "}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    conversation = [{"role": "user", "content": [question_part, image_part]}]

    try:
        reply = client.chat.completions.create(
            model=chatgpt_model,
            messages=conversation,
            max_tokens=500,
        )
        # Stripping stays inside the try so a reply without text is also
        # reported as an "Error: ..." string rather than raised.
        return reply.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {e}"


### Step 2: self-evaluation

In [5]:
def evaluate_insight(insight_text):
    """Score an insight on five criteria by asking the chat model for JSON ratings.

    The prompt asks for a 0-5 rating of Accuracy, Relevance, Clarity,
    Actionability and Conciseness, returned as one JSON object.

    NOTE: this notebook defines ``evaluate_insight`` again in a later cell;
    after Run All the later definition is the one in effect.

    Args:
        insight_text: The insight to rate; it is embedded verbatim in the prompt.

    Returns:
        dict mapping each criterion name found in the reply to a numeric
        score, or an empty dict if the request fails or the reply does not
        contain a JSON object whose values are numbers.
    """
    eval_prompt = f"""
Please evaluate the following insight according to these criteria. Provide a rating (0 to 5) for each category in JSON format:
Accuracy (5: Fully accurate identification of the chart type and key data insights. 3-4: Minor errors but generally correct. 1-2: Significant inaccuracies or missing key details. 0: Completely incorrect),
Relevance (5: Highlights the most critical insights with no unnecessary details. 3-4: Covers relevant points but includes minor off-topic information. 1-2: Misses key insights or includes too much irrelevant detail. 0: Completely irrelevant),
Clarity (5: Very easy to understand with no jargon. 3-4: Mostly clear but has some complex wording. 1-2: Hard to understand due to technical terms or vague phrasing. 0: Unclear or confusing),
Actionability (5: Clear, practical actions that are directly useful. 3-4: Somewhat useful but could be more specific. 1-2: Actions are too vague or impractical. 0: No actionable insights),
Conciseness (5: Under 50 words, concise, and to the point. 3-4: Slightly over 50 words but still clear. 1-2: Too wordy or contains unnecessary details. 0: Far exceeds word limit or is too brief to be meaningful).

Insight: {insight_text}

Please output the result as:
{{"Accuracy": x, "Relevance": x, "Clarity": x, "Actionability": x, "Conciseness": x}}
    """
    chat_messages = [{"role": "user", "content": eval_prompt}]
    try:
        chat_response = client.chat.completions.create(
            model=chatgpt_model,
            messages=chat_messages,
            temperature=0
        )
        eval_text = (chat_response.choices[0].message.content or "").strip()
        # The reply may wrap the JSON in a Markdown fence or surround it with
        # prose, so parse only the outermost {...} span, not the whole reply.
        start = eval_text.find("{")
        end = eval_text.rfind("}")
        if start == -1 or end < start:
            raise ValueError(f"no JSON object found in reply: {eval_text!r}")
        parsed = json.loads(eval_text[start:end + 1])
        # Callers average the values, so accept numeric strings such as "4"
        # and let anything non-numeric fail here (handled below as {}).
        eval_scores = {
            criterion: score if isinstance(score, (int, float)) else float(score)
            for criterion, score in parsed.items()
        }
    except Exception as e:
        # Best effort: report the problem and signal failure with an empty dict.
        print("Error in evaluation:", e)
        eval_scores = {}
    return eval_scores


### Step 3: score

In [6]:
def evaluate_insight(insight_text):
    """Ask the chat model to rate an insight and return the ratings as a dict.

    The prompt requests a 0-5 score for Accuracy, Relevance, Clarity,
    Actionability and Conciseness in a single JSON object.

    NOTE: a function with this same name is also defined in the earlier
    "Step 2" cell; this later definition replaces it when the notebook is run
    top to bottom.

    Args:
        insight_text: The insight to rate; it is inserted verbatim into the prompt.

    Returns:
        dict of criterion name -> numeric score taken from the model's reply,
        or an empty dict when the request fails or no JSON object with numeric
        values can be read from the reply.
    """
    eval_prompt = f"""
Please evaluate the following insight according to these criteria. Provide a rating (0 to 5) for each category in JSON format:
Accuracy (5: Fully accurate identification of the chart type and key data insights. 3-4: Minor errors but generally correct. 1-2: Significant inaccuracies or missing key details. 0: Completely incorrect),
Relevance (5: Highlights the most critical insights with no unnecessary details. 3-4: Covers relevant points but includes minor off-topic information. 1-2: Misses key insights or includes too much irrelevant detail. 0: Completely irrelevant),
Clarity (5: Very easy to understand with no jargon. 3-4: Mostly clear but has some complex wording. 1-2: Hard to understand due to technical terms or vague phrasing. 0: Unclear or confusing),
Actionability (5: Clear, practical actions that are directly useful. 3-4: Somewhat useful but could be more specific. 1-2: Actions are too vague or impractical. 0: No actionable insights),
Conciseness (5: Under 50 words, concise, and to the point. 3-4: Slightly over 50 words but still clear. 1-2: Too wordy or contains unnecessary details. 0: Far exceeds word limit or is too brief to be meaningful).

Insight: {insight_text}

Please output the result as:
{{"Accuracy": x, "Relevance": x, "Clarity": x, "Actionability": x, "Conciseness": x}}
    """
    chat_messages = [{"role": "user", "content": eval_prompt}]
    try:
        chat_response = client.chat.completions.create(
            model=chatgpt_model,
            messages=chat_messages,
            temperature=0
        )
        eval_text = (chat_response.choices[0].message.content or "").strip()
        # Replies are not always bare JSON: they can be fenced with ``` or
        # have sentences around them. Slice from the first "{" to the last "}"
        # and parse only that.
        first_brace = eval_text.find("{")
        last_brace = eval_text.rfind("}")
        if first_brace == -1 or last_brace < first_brace:
            raise ValueError(f"no JSON object found in reply: {eval_text!r}")
        raw_scores = json.loads(eval_text[first_brace:last_brace + 1])
        # The scores are averaged by the caller, so numeric strings ("4") are
        # converted and non-numeric values raise here and yield {} below.
        eval_scores = {}
        for criterion, score in raw_scores.items():
            if not isinstance(score, (int, float)):
                score = float(score)
            eval_scores[criterion] = score
    except Exception as e:
        # Best effort: print the reason and return an empty dict as the failure signal.
        print("Error in evaluation:", e)
        eval_scores = {}
    return eval_scores


### Step 4: refine prompt

In [7]:
def refine_prompt(insight_text, eval_scores):
    """Turn evaluation scores into a prompt asking for a better insight.

    Every criterion scoring below 5 contributes one piece of advice; a
    criterion missing from ``eval_scores`` counts as a 5 and adds nothing.

    Args:
        insight_text: The insight that was evaluated.
        eval_scores: Mapping of criterion name to its score.

    Returns:
        A refinement prompt that lists the advice followed by the insight, or
        ``insight_text`` unchanged when no criterion is below 5.
    """
    # (criterion, advice) pairs, in the order the advice appears in the prompt.
    advice_by_criterion = (
        ("Accuracy", "improve the accuracy by better identifying the chart type and key data insights"),
        ("Relevance", "focus more on the most critical insights and remove unnecessary details"),
        ("Clarity", "simplify the language to improve clarity and avoid technical jargon"),
        ("Actionability", "suggest clearer, more practical and actionable item"),
        ("Conciseness", "make the response less than 50 words, keep this critiria as the first priority"),
    )
    suggestions = [
        advice
        for criterion, advice in advice_by_criterion
        if eval_scores.get(criterion, 5) < 5
    ]
    if suggestions:
        joined = "; ".join(suggestions)
        return f"Based on the following feedback: {joined}, please refine the following insight: {insight_text}"
    return insight_text


### Step 5: threshold & iteration

In [8]:
def iterative_insight_process(data_url, max_iterations=5, target_avg_score=4.5):
    """Generate an insight for a chart and refine it until it scores well enough.

    The first insight comes from ``get_chatgpt_insight_from_dataurl``. Each
    iteration scores the current insight with ``evaluate_insight``; if the
    average is below the target, ``refine_prompt`` builds a refinement prompt
    and the model's answer becomes the new insight. After the loop the last
    insight is evaluated once more and recorded as the "Final" row.

    Args:
        data_url: URL of the chart image, forwarded to the initial-insight call.
        max_iterations: Maximum number of evaluate/refine rounds.
        target_avg_score: Average score at which refinement stops early.

    Returns:
        Tuple ``(current_insight, df)``: the last insight produced and a
        DataFrame with one row per recorded evaluation (columns "Iteration",
        "Insight", "Average" plus one column per score key).
    """
    eval_records = []
    current_insight = get_chatgpt_insight_from_dataurl(data_url)
    print("Initial Insight:", current_insight)
    # Row 0 is the first draft before any evaluation, so it carries no scores.
    eval_records.append({
        "Iteration": 0,
        "Insight": current_insight,
        "Average": None,
        "Accuracy": None,
        "Relevance": None,
        "Clarity": None,
        "Actionability": None,
        "Conciseness": None
    })

    for iteration in range(1, max_iterations + 1):
        print(f"\n--- Iteration {iteration} ---")
        eval_scores = evaluate_insight(current_insight)
        if not eval_scores:
            # evaluate_insight signals failure with an empty dict.
            print("Evaluation failed. Exiting iteration.")
            break
        avg_score = sum(eval_scores.values()) / len(eval_scores)
        print("Evaluation Scores:")
        for key, value in eval_scores.items():
            print(f"  {key}: {value}")
        print("Average Score:", avg_score)
        record = {"Iteration": iteration, "Insight": current_insight, "Average": avg_score}
        record.update(eval_scores)
        eval_records.append(record)
        if avg_score >= target_avg_score:
            print("Target average score reached. Stopping iterations.")
            break
        refinement_prompt = refine_prompt(current_insight, eval_scores)
        print("Refinement Prompt:", refinement_prompt)
        # NOTE(review): this request is text-only; the chart image itself is
        # not sent with the refinement prompt.
        chat_messages = [{"role": "user", "content": refinement_prompt}]
        try:
            chat_response = client.chat.completions.create(
                model=chatgpt_model,  # use the configured model, like the other requests
                messages=chat_messages,
                temperature=0
            )
            current_insight = chat_response.choices[0].message.content.strip()
            print("New Insight:", current_insight)
        except Exception as e:
            print("Error during refinement:", e)
            break

    # Evaluate whatever insight the loop ended with and record it as "Final".
    final_eval_scores = evaluate_insight(current_insight)
    if final_eval_scores:
        final_avg = sum(final_eval_scores.values()) / len(final_eval_scores)
        print("\nFinal Evaluation Scores:")
        for key, value in final_eval_scores.items():
            print(f"  {key}: {value}")
        final_record = {"Iteration": "Final", "Insight": current_insight, "Average": final_avg}
        final_record.update(final_eval_scores)
        eval_records.append(final_record)
    else:
        print("Final evaluation failed, no scores available.")

    df = pd.DataFrame(eval_records)
    print("\nEvaluation DataFrame:")
    print(df)
    return current_insight, df


## Gradio

In [9]:
import gradio as gr
from io import BytesIO
import base64
import pandas as pd

def process_chart(image):
    """Run the iterative insight process on an uploaded chart and summarise it.

    The image is saved as PNG into memory, base64-encoded into a data URL and
    passed to ``iterative_insight_process``.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method, or
            ``None`` when nothing was uploaded.

    Returns:
        A string with the lowest-scored insight, the highest-scored insight
        and the final insight; or a short request to upload an image when
        ``image`` is ``None``.
    """
    # Gradio passes None when the form is submitted without an image.
    if image is None:
        return "Please upload a chart image first."

    # Convert the uploaded PIL image to a data URL
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    b64_image = base64.b64encode(img_bytes).decode("utf-8")
    mime_type = "image/png"  # matches the PNG format used by image.save above
    data_url = f"data:{mime_type};base64,{b64_image}"

    # Run the iterative process to get the final insight and evaluation DataFrame
    final_insight, eval_df = iterative_insight_process(data_url)

    # Keep only rows that have an average score (the unevaluated first draft has none)
    df_valid = eval_df[eval_df["Average"].notnull()]
    if not df_valid.empty:
        # Find the record with the lowest average score
        lowest_row = df_valid.loc[df_valid["Average"].idxmin()]
        # Find the record with the highest average score
        highest_row = df_valid.loc[df_valid["Average"].idxmax()]

        lowest_str = (f"Lowest Score Insight (Score: {lowest_row['Average']:.2f}):\n"
                      f"{lowest_row['Insight']}\n")
        highest_str = (f"Highest Score Insight (Score: {highest_row['Average']:.2f}):\n"
                       f"{highest_row['Insight']}\n")
    else:
        lowest_str = "No evaluation data available for lowest score."
        highest_str = "No evaluation data available for highest score."

    # Combine the results into a single output string
    result = f"{lowest_str}\n{highest_str}\nFinal Insight:\n{final_insight}"
    return result

# Define the Gradio interface
# Gradio app: one chart image in, one text summary out.
chart_upload = gr.Image(type="pil", label="Upload Chart Image")
summary_box = gr.Textbox(label="Insight Summary")
app_description = (
    "Upload your chart image to receive AI-generated business insights along with evaluation metrics. "
    "The system iteratively refines its output and displays the lowest-scored and highest-scored insights "
    "to illustrate improvement over iterations, as well as the final refined insight."
)

interface = gr.Interface(
    title="AI-Powered Chart Insight Agent",
    description=app_description,
    fn=process_chart,
    inputs=chart_upload,
    outputs=summary_box,
)

# share=True also requests a public gradio.live link in addition to the local URL.
interface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f579252051d00991c6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Initial Insight: This chart appears to be a control chart related to packaging performance. The vertical axis represents "Late Packaging," indicating the number of late packaging instances, while the horizontal axis shows "Total Order," which likely refers to the total number of orders processed.

The trend line seems to indicate a decrease in late packaging incidents as the total orders increase, suggesting an improvement in the packaging process over time. The red dots represent individual data points, showing the number of late packaging instances corresponding to each total order value. Overall, the chart likely aims to track and analyze the efficiency of the packaging process.

--- Iteration 1 ---
Evaluation Scores:
  Accuracy: 4
  Relevance: 4
  Clarity: 4
  Actionability: 3
  Conciseness: 4
Average Score: 3.8
Refinement Prompt: Based on the following feedback: improve the accuracy by better identifying the chart type and key data insights; focus more on the most critical insight