In [2]:
#!pip install gymnasium

In [None]:
import sys
import os

# Add the project root directory to Python path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(project_root)

#--------------------------------------------------------------------------------------------#


In [6]:
import gymnasium as gym
from langchain.output_parsers import RegexParser
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)
import numpy as np
from langchain_openai import ChatOpenAI
from utils.LLM_utils import get_completion_gpt4
from actor_agents.Invoice_extractor import generate_invoice_agent, baseline_invoice_agent 
from actor_agents.document_classifier import classify_document_with_llm
from actor_agents.schema_builder import schema_building_with_llm
from meta_prompting_agent import adjust_prompt
from evaluation.scoring import calculate_exact_match, calculate_similarity




In [None]:


class DataExtractionEnv(gym.Env):
    def __init__(self, invoice, schema, groundtruth):
        super(DataExtractionEnv, self).__init__()
        self.action_space = gym.spaces.Discrete(5)  # 5 possible prompt adjustments
        # Expanded observation space to include perplexity and linear probability
        self.observation_space = gym.spaces.Box(
            low=0, 
            high=1, 
            shape=(4,),  # [Exact Match, Similarity, Perplexity Score, Linear Probability]
            dtype=np.float32
        )
        
        # Store document extraction related data
        self.invoice = invoice
        self.schema = schema
        self.groundtruth = groundtruth
        self.current_prompt = generate_invoice_agent(invoice, schema)
        self.task_type = 'form-like document extraction'
        
        self.state = None
        self.reset()

    def get_perplexity_score(self, response):
        """Calculate perplexity score from response logprobs"""
        logprobs = [token.logprob for token in response.choices[0].logprobs.content]
        perplexity_score = np.exp(-np.mean(logprobs))
        # Normalize perplexity score to [0,1] range
        normalized_perplexity = 1 / (1 + perplexity_score)
        return normalized_perplexity

    def get_linear_probability(self, response):
        """Calculate linear probability score"""
        logprobs = [token.logprob for token in response.choices[0].logprobs.content]
        linear_prob = np.exp(np.sum(logprobs))
        # Normalize to [0,1] range
        normalized_linear_prob = np.clip(linear_prob, 0, 1)
        return normalized_linear_prob

    def step(self, action):
        # Get updated prompt using meta-prompting agent
        updated_prompt = adjust_prompt(
            actor_prompt=self.current_prompt,
            task_type=self.task_type,
            state=self.state,
            action=action,
            generated_output=self.last_output if hasattr(self, 'last_output') else None,
            groundtruth=self.groundtruth
        )
        self.current_prompt = updated_prompt

        # Generate new output using the updated prompt with logprobs enabled
        response = get_completion_gpt4(
            [{"role": "user", "content": updated_prompt}],
            logprobs=True
        )
        self.last_output = response.choices[0].message.content

        # Calculate all scores
        exact_match_score = calculate_exact_match(self.last_output, self.groundtruth)
        similarity_score = calculate_similarity(self.last_output, self.groundtruth)
        perplexity_score = self.get_perplexity_score(response)
        linear_prob_score = self.get_linear_probability(response)

        # Update state with all scores
        self.state = np.array([
            exact_match_score,
            similarity_score,
            perplexity_score,
            linear_prob_score
        ], dtype=np.float32)

        # Calculate comprehensive reward
        reward = (
            0.4 * exact_match_score +
            0.3 * similarity_score +
            0.15 * perplexity_score +
            0.15 * linear_prob_score -
            abs(action - 2) * 0.05  # Action penalty
        )

        # Check if task is complete (adjusted thresholds)
        done = bool(
            exact_match_score >= 0.95 and 
            similarity_score >= 0.95 and
            perplexity_score >= 0.7 and
            linear_prob_score >= 0.7
        )

        info = {
            'exact_match': exact_match_score,
            'similarity': similarity_score,
            'perplexity': perplexity_score,
            'linear_probability': linear_prob_score
        }

        return self.state, reward, done, info

    def reset(self):
        # Reset prompt to initial state
        self.current_prompt = generate_invoice_agent(self.invoice, self.schema)
        self.last_output = None
        # Initialize state with all four metrics
        self.state = np.random.uniform(0.2, 0.4, size=(4,))
        return np.array(self.state)

In [None]:
class GymnasiumAgent:
    # ... (previous code remains the same until interact method)
    @classmethod
    def get_docs(cls, env):
        return env.unwrapped.__doc__

    def __init__(self, model, env):
        self.model = model
        self.env = env
        self.docs = self.get_docs(env)

        self.instructions = """
Your goal is to maximize your return, i.e., the sum of the rewards you receive.
I will give you an observation, reward, termination flag, truncation flag, and the return so far, formatted as:

Observation: <observation>
Reward: <reward>
Termination: <termination>
Truncation: <truncation>
Return: <sum_of_rewards>

You will respond with an action, formatted as:

Action: <action>

where you replace <action> with your actual action.
"""
        self.action_parser = RegexParser(
            regex=r"Action: (.*)", output_keys=["action"],
        )

    def interact(self):
        observation, _ = self.env.reset()
        terminated = False
        total_reward = 0

        while not terminated:
            # Format observation for better readability
            obs_dict = {
                'Exact Match': observation[0],
                'Similarity': observation[1],
                'Perplexity': observation[2],
                'Linear Probability': observation[3]
            }
            print("\nCurrent State:")
            for metric, value in obs_dict.items():
                print(f"{metric}: {value:.4f}")

            # Generate a response (action) using the model
            response = self.model([
                SystemMessage(content=self.instructions),
                HumanMessage(content=f"""
                    Observation: {obs_dict}
                    Reward: {total_reward:.4f}
                    Termination: {terminated}
                    Truncation: False
                    Return: {total_reward:.4f}
                """)
            ])

            action = int(self.action_parser.parse(response.content)['action'])

            # Perform action in the environment
            observation, reward, terminated, info = self.env.step(action)
            total_reward += reward

            print(f"\nAction taken: {action}")
            print(f"Reward: {reward:.4f}")
            print(f"Total Return: {total_reward:.4f}")
            print("Metrics:", info)

        print("\nTask completed successfully!")

In [2]:
class GymnasiumAgent:
    @classmethod
    def get_docs(cls, env):
        return env.unwrapped.__doc__

    def __init__(self, model, env):
        self.model = model
        self.env = env
        self.docs = self.get_docs(env)

        self.instructions = """
Your goal is to maximize your return, i.e., the sum of the rewards you receive.
I will give you an observation, reward, termination flag, truncation flag, and the return so far, formatted as:

Observation: <observation>
Reward: <reward>
Termination: <termination>
Truncation: <truncation>
Return: <sum_of_rewards>

You will respond with an action, formatted as:

Action: <action>

where you replace <action> with your actual action.
"""
        self.action_parser = RegexParser(
            regex=r"Action: (.*)", output_keys=["action"],
        )

    def interact(self):
        observation, _ = self.env.reset()
        terminated = False
        total_reward = 0

        while not terminated:
            print(f"Observation: {observation}")

            # Generate a response (action) using the model
            response = self.model([
                SystemMessage(content=self.instructions),
                HumanMessage(content=f"Observation: {observation}\nReward: {total_reward}\nTermination: {terminated}\nTruncation: False\nReturn: {total_reward}")
            ])

            action = int(self.action_parser.parse(response.content)['action'])

            # Perform action in the environment
            observation, reward, terminated, _ = self.env.step(action)
            total_reward += reward

            print(f"Action: {action}, Reward: {reward}, Total Return: {total_reward}")

        print("Task completed successfully!")


In [None]:
# Initialize environment with example data
# Example initial state and actions
invoice = f"""
-----------------Invoice------------------
                              Page 1 of 3

Invoice Number: INV-12345
Customer: XYZ Corp
Invoice Date: 2024-06-01


Item    Quantity    Price     Total
item_1     5         $100      500
item_2     10        $50       500
item_3     6         $10       60

					Subtotal: 1060
					Total GST: 500
					Total Amount: $1560
--------------------------------------------
"""
schema = schema = {
"invoice_number": "string",
"customer": "string",
"invoice_date": "yyyy-mm-dd",
"sub_total": "number",
"total_GST": "number",
"total_amount": "number",
"Line_Items": [
    {
        "item": "string",
        "quantity": "number",
        "price": "number",
        "total": "number"
    }
] 
}

generated_output = {
"invoice_number": "INV-12345",
"invoice_date": "2024-06-01",
"sub_total": 1060,
"total_amount": 1560, 
"Line_Items": [
    {
    "item": "item_1",
    "quantity": 5,
    "price": "$100",
    "total": 500
    },
    {
    "item": "item_2",
    "quantity": 10,
    "price": "$50",
    "total": 500
    }
]
} 
groundtruth = {
"invoice_number": "INV-12345",
"customer": "XYZ Corp",
"invoice_date": "2024-06-01",
"sub_total": 1060,
"total_GST":500,
"total_amount": 1560,
"Line_Items": [
    {
    "item": "item_1",
    "quantity": 5,
    "price": 100,
    "total": 500
    },
    {
    "item": "item_2",
    "quantity": 10,
    "price": 50,
    "total": 500
    },
    {
    "item": "item_3",
    "quantity": 6,
    "price": 10,
    "total": 60
    }
]
}
task_type ='form-like document extraction'



In [6]:
# Create environment and agent
env = DataExtractionEnv(invoice=invoice, schema=schema, groundtruth=groundtruth)
agent = GymnasiumAgent(model=ChatOpenAI(temperature=0.2), env=env)

# Run the interaction
agent.interact()

Observation: 0.34170806294078704


  response = self.model([


Action: 0, Reward: 0.046947020198249756, Total Return: 0.046947020198249756
Observation: [0.35548219 0.41337379]


ValueError: invalid literal for int() with base 10: 'Buy '