In [None]:
!pip install "langchain==v0.0.147"

In [1]:
from langchain.agents import Tool
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage,
)

In [2]:
example_data_list = [
    {
        "description": "dataframe with timestamp_column_name, prediction_score_column_name, prediction_label_column_name, and actual_label_column_name",
        "dataframe": """pd.DataFrame([
    [pd.to_datetime('2023-03-01 02:02:19'), 0.91, 'click', 'click'],
    [pd.to_datetime('2023-02-17 23:45:48'), 0.37, 'no_click', 'no_click'],
    [pd.to_datetime('2023-01-30 15:30:03'), 0.54, 'click', 'no_click'],
    [pd.to_datetime('2023-02-03 19:56:09'), 0.74, 'click', 'click'],
    [pd.to_datetime('2023-02-24 04:23:43'), 0.37, 'no_click', 'click']
], columns=['timestamp', 'prediction_score', 'prediction', 'target'])""",
        "schema": """px.Schema(
    timestamp_column_name="timestamp",
    prediction_score_column_name="prediction_score",
    prediction_label_column_name="prediction",
    actual_label_column_name="target",
)""",
    },
    {
        "description": "dataframe with prediction_label_column_name, actual_label_column_name, feature_column_names, tag_column_names",
        "dataframe": """pd.DataFrame({
    'fico_score': [578, 507, 656, 414, 512],
    'merchant_id': ['Scammeds', 'Schiller Ltd', 'Kirlin and Sons', 'Scammeds', 'Champlin and Sons'],
    'loan_amount': [4300, 21000, 18000, 18000, 20000],
    'annual_income': [62966, 52335, 94995, 32034, 46005],
    'home_ownership': ['RENT', 'RENT', 'MORTGAGE', 'LEASE', 'OWN'],
    'num_credit_lines': [110, 129, 31, 81, 148],
    'inquests_in_last_6_months': [0, 0, 0, 2, 1],
    'months_since_last_delinquency': [0, 23, 0, 0, 0],
    'age': [25, 78, 54, 34, 49],
    'gender': ['male', 'female', 'female', 'male', 'male'],
    'predicted': ['not_fraud', 'not_fraud', 'uncertain', 'fraud', 'uncertain'],
    'target': ['fraud', 'not_fraud', 'uncertain', 'not_fraud', 'uncertain']
})""",
        "schema": """px.Schema(
    prediction_label_column_name="predicted",
    actual_label_column_name="target",
    feature_column_names=[
        "fico_score",
        "merchant_id",
        "loan_amount",
        "annual_income",
        "home_ownership",
        "num_credit_lines",
        "inquests_in_last_6_months",
        "months_since_last_delinquency",
    ],
    tag_column_names=[
        "age",
        "gender",
    ],
)""",
    },
    {
        "description": "example with prediction_label_column_name, actual_label_column_name, (embedding_feature_column_names with vector_column_name)",
        "dataframe": """pd.DataFrame({
    'predicted': ['fraud', 'fraud', 'not_fraud', 'not_fraud', 'uncertain'],
    'target': ['not_fraud', 'not_fraud', 'not_fraud', 'not_fraud', 'uncertain'],
    'embedding_vector': [[-0.97, 3.98, -0.03, 2.92], [3.20, 3.95, 2.81, -0.09], [-0.49, -0.62, 0.08, 2.03], [1.69, 0.01, -0.76, 3.64], [1.46, 0.69, 3.26, -0.17]],
    'fico_score': [604, 612, 646, 560, 636],
    'merchant_id': ['Leannon Ward', 'Scammeds', 'Leannon Ward', 'Kirlin and Sons', 'Champlin and Sons'],
    'loan_amount': [22000, 7500, 32000, 19000, 10000],
    'annual_income': [100781, 116184, 73666, 38589, 100251],
    'home_ownership': ['RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'MORTGAGE'],
    'num_credit_lines': [108, 42, 131, 131, 10],
    'inquests_in_last_6_months': [0, 2, 0, 0, 0],
    'months_since_last_delinquency': [0, 56, 0, 0, 3]
})""",
        "schema": """px.Schema(
    prediction_label_column_name="predicted",
    actual_label_column_name="target",
    embedding_feature_column_names={
        "transaction_embeddings": px.EmbeddingColumnNames(
            vector_column_name="embedding_vector"
        ),
    },
)""",
    },    
#     {
#         "description": "dataframe with actual_label_column_name, (embedding_feature_column_names with vector_column_name and link_to_data_column_name)",
#         "dataframe": """pd.DataFrame({
#     'defective': ['okay', 'defective', 'okay', 'defective', 'okay'],
#     'image': ['https://www.example.com/image0.jpeg', 'https://www.example.com/image1.jpeg', 'https://www.example.com/image2.jpeg', 'https://www.example.com/image3.jpeg', 'https://www.example.com/image4.jpeg'],
#     'image_vector': [[1.73, 2.67, 2.91, 1.79, 1.29], [2.18, -0.21, 0.87, 3.84, -0.97], [3.36, -0.62, 2.40, -0.94, 3.69], [2.77, 2.79, 3.36, 0.60, 3.10], [1.79, 2.06, 0.53, 3.58, 0.24]]
# })""",
#         "schema": """px.Schema(
#     actual_label_column_name="defective",
#     embedding_feature_column_names={
#         "image_embedding": px.EmbeddingColumnNames(
#             vector_column_name="image_vector",
#             link_to_data_column_name="image",
#         ),
#     },
# )""",
#     },
#     {
#         "description": "dataframe with actual_label_column_name, feature_column_names, tag_column_names, (embedding_feature_column_names with vector_column_name and raw_data_column_name)",
#         "dataframe": """pd.DataFrame({
#     'defective': ['okay', 'defective', 'okay', 'defective', 'okay'],
#     'image': ['https://www.example.com/image0.jpeg', 'https://www.example.com/image1.jpeg', 'https://www.example.com/image2.jpeg', 'https://www.example.com/image3.jpeg', 'https://www.example.com/image4.jpeg'],
#     'image_vector': [[1.73, 2.67, 2.91, 1.79, 1.29], [2.18, -0.21, 0.87, 3.84, -0.97], [3.36, -0.62, 2.40, -0.94, 3.69], [2.77, 2.79, 3.36, 0.60, 3.10], [1.79, 2.06, 0.53, 3.58, 0.24]]
# })""",
#         "schema": """px.Schema(
#     actual_label_column_name="sentiment",
#     feature_column_names=[
#         "category",
#     ],
#     tag_column_names=[
#         "name",
#     ],
#     embedding_feature_column_names={
#         "product_review_embeddings": px.EmbeddingColumnNames(
#             vector_column_name="text_vector",
#             raw_data_column_name="text",
#         ),
#     },
# )""",
#     },
]

examples = ""
for example_data in example_data_list:
    examples += f"""Example: {example_data["description"]}
Dataframe:

```python
{example_data["dataframe"]}
```

Schema:

```python
{example_data["schema"]}
```
"""
print(examples)

Example: dataframe with timestamp_column_name, prediction_score_column_name, prediction_label_column_name, and actual_label_column_name
Dataframe:

```python
pd.DataFrame([
    [pd.to_datetime('2023-03-01 02:02:19'), 0.91, 'click', 'click'],
    [pd.to_datetime('2023-02-17 23:45:48'), 0.37, 'no_click', 'no_click'],
    [pd.to_datetime('2023-01-30 15:30:03'), 0.54, 'click', 'no_click'],
    [pd.to_datetime('2023-02-03 19:56:09'), 0.74, 'click', 'click'],
    [pd.to_datetime('2023-02-24 04:23:43'), 0.37, 'no_click', 'click']
], columns=['timestamp', 'prediction_score', 'prediction', 'target'])
```

Schema:

```python
px.Schema(
    timestamp_column_name="timestamp",
    prediction_score_column_name="prediction_score",
    prediction_label_column_name="prediction",
    actual_label_column_name="target",
)
```
Example: dataframe with prediction_label_column_name, actual_label_column_name, feature_column_names, tag_column_names
Dataframe:

```python
pd.DataFrame({
    'fico_score': [578, 50

In [3]:
with open("/Users/xandersong/phoenix/tutorials/api_reference.md") as f:
    api_reference = f.read()

In [4]:
import pandas as pd

dataframe = pd.read_parquet("https://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/cv/human-actions/human_actions_training.parquet")

sampled_dataframe = dataframe.head(1)
column_to_type = {}
for column in sampled_dataframe.columns:
    column_to_type[column] = repr(type(sampled_dataframe[column].iloc[0]))[8:-2]
dataframe_column_to_type = "\n".join([f"{column}: {type_string}" for column, type_string in column_to_type.items()])
print(dataframe_column_to_type)

prediction_id: str
prediction_ts: numpy.float64
url: str
image_vector: numpy.ndarray
actual_action: str
predicted_action: str


In [5]:
template = """You are a helpful chatbot. Your goal is to create a Phoenix schema that describes the user's input dataframe. Each of your messages should end with the schema itself (syntactic Python code inside of a markdown cell). You should proactively interact with the user to discover the correct schema. You should also help them understand the meaning of each of the fields of phoenix.Schema if they seem confused. When the user explicitly acknowledges that a schema you have suggested correctly describes their dataframe, you should call the launch-phoenix tool. Do not use the backtick symbol (`) in your response.

API reference:

{api_reference}

Examples:

{examples}
"""
print(template)

You are a helpful chatbot. Your goal is to create a Phoenix schema that describes the user's input dataframe. Each of your messages should end with the schema itself (syntactic Python code inside of a markdown cell). You should proactively interact with the user to discover the correct schema. You should also help them understand the meaning of each of the fields of phoenix.Schema if they seem confused. When the user explicitly acknowledges that a schema you have suggested correctly describes their dataframe, you should call the launch-phoenix tool. Do not use the backtick symbol (`) in your response.

API reference:

{api_reference}

Examples:

{examples}



In [6]:
system_message_prompt_template = SystemMessagePromptTemplate.from_template(template)
system_message = system_message_prompt_template.format(
    api_reference=api_reference,
    examples=examples,
)
print(system_message.content)

You are a helpful chatbot. Your goal is to create a Phoenix schema that describes the user's input dataframe. Each of your messages should end with the schema itself (syntactic Python code inside of a markdown cell). You should proactively interact with the user to discover the correct schema. You should also help them understand the meaning of each of the fields of phoenix.Schema if they seem confused. When the user explicitly acknowledges that a schema you have suggested correctly describes their dataframe, you should call the launch-phoenix tool. Do not use the backtick symbol (`) in your response.

API reference:

# phoenix.Dataset

```python
class Dataset(
    dataframe: pandas.DataFrame,
    schema: Schema,
    name: Optional[str] = None,
)
```

A dataset containing a split or cohort of data to be analyzed independently or compared to another cohort. Common examples include training, validation, test, or production datasets.

## Parameters

* **dataframe** (pandas.DataFrame): The

In [7]:
tools = [
    Tool(
        name="launch-phoenix",
        func=lambda: print("🚀 Launching Phoenix"),
        description="This tool launches the Phoenix app. It should only be run when the user has acknowledged that the schema for their data is correct.",
    ),
]

In [8]:
input_message_prompt_template = """Input Dataframe Columns to Data Type:

{dataframe_column_to_type}"""
human_message = HumanMessagePromptTemplate.from_template(input_message_prompt_template).format(dataframe_column_to_type=dataframe_column_to_type)
print(human_message.content)

Input Dataframe Columns to Data Type:

prediction_id: str
prediction_ts: numpy.float64
url: str
image_vector: numpy.ndarray
actual_action: str
predicted_action: str


In [9]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory.chat_memory.messages.append(system_message)
memory

ConversationBufferMemory(chat_memory=ChatMessageHistory(messages=[SystemMessage(content='You are a helpful chatbot. Your goal is to create a Phoenix schema that describes the user\'s input dataframe. Each of your messages should end with the schema itself (syntactic Python code inside of a markdown cell). You should proactively interact with the user to discover the correct schema. You should also help them understand the meaning of each of the fields of phoenix.Schema if they seem confused. When the user explicitly acknowledges that a schema you have suggested correctly describes their dataframe, you should call the launch-phoenix tool. Do not use the backtick symbol (`) in your response.\n\nAPI reference:\n\n# phoenix.Dataset\n\n```python\nclass Dataset(\n    dataframe: pandas.DataFrame,\n    schema: Schema,\n    name: Optional[str] = None,\n)\n```\n\nA dataset containing a split or cohort of data to be analyzed independently or compared to another cohort. Common examples include tra

In [14]:
# llm = OpenAI(temperature=0.0)
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name, temperature=0.0)
# messages = [
#     system_message,
# #     human_message,
# #     HumanMessage(content="What should my Phoenix schema look like?")
# ]
agent_chain = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)
agent_chain.run(
#     input=messages,
    input=human_message,
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "px.Schema(\n    prediction_id_column_name='prediction_id',\n    timestamp_column_name='prediction_ts',\n    feature_column_names=['url', 'image_vector'],\n    prediction_label_column_name='predicted_action',\n    actual_label_column_name='actual_action'\n)"
}[0m

[1m> Finished chain.[0m


ValidationError: 1 validation error for HumanMessage
content
  str type expected (type=type_error.str)