In [27]:
import openai
import os
from langsmith import Client
from dotenv import find_dotenv, load_dotenv

from langchain.llms import OpenAI, AzureOpenAI
from langchain_openai.chat_models import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.smith import RunEvalConfig, run_on_dataset

## Necessary configs for Azure OpenAI


In [29]:
azure_configs = {
    "base_url": "https://channel-openai-org.openai.azure.com/",
    "model_deployment": "GPT-Channel-OpenAI",
    "model_name": "gpt-35-turbo",
    "embedding_deployment": "GPT-Channel-OpenAI",
    "embedding_name": "AIEmbeddings",
}

In [30]:
load_dotenv(find_dotenv())

True

## Setting of Environment variables


In [31]:
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT_2")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

openai.api_type = "azure"
openai.api_version = "2023-09-15-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")
model: str = "gpt-35-turbo"
deployment_name = "GPT-Channel-OpenAI"

## Creating object of the Large Language Model


In [37]:
llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    validate_base_url=False,
)

## The client that is required for Langsmith trace


In [33]:
client = Client()

## Quickstart on Langsmith feature


In [34]:
llm.predict(
    "Hypothetically, what do you think will happen to earth once humans start settling in mars?"
)

"As an AI language model, I don't have personal opinions or beliefs. However, I can provide a possible scenario based on scientific knowledge and predictions.\n\nIf humans start settling on Mars, it will likely have a significant impact on the planet Earth. Initially, it may lead to a decrease in the population density and resource consumption on Earth, which may positively impact the environment.\n\nOn the other hand, settling on Mars will require a considerable amount of resources, including energy, water, and food, which may lead to increased resource consumption on Earth to support the Mars settlement. There may also be significant technological advancements made to support the settlement, which could have both positive and negative impacts on the environment.\n\nAdditionally, the exploration and colonization of Mars may lead to the discovery of new scientific knowledge and resources that could benefit Earth. However, it may also lead to the exploitation of Mars' resources and pote

## Evaluation Quick Start


### Creating Dataset (only Inputs, No output)


In [35]:
example_inputs = [
    "a rap battle between Atticus Finch and Cicero",
    "a rap battle between Barbie and Oppenheimer",
    "a Pythonic rap battle between two swallows: one European and one African",
    "a rap battle between Aubrey Plaza and Stephen Colbert",
]

dataset_name = "Rap Battle Dataset"

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Rap battle prompts.",
)

for input_prompt in example_inputs:
    # Each example must be unique and have inputs defined.
    # Outputs are optional
    client.create_example(
        inputs={"question": input_prompt},
        outputs=None,
        dataset_id=dataset.id,
    )

HTTPError: [Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets] {"detail":"Dataset with this name already exists."}

### 2. Evaluate datasets with LLM


In [38]:
eval_config = RunEvalConfig(
    evaluators=[
        # You can specify an evaluator by name/enum.
        # In this case, the default criterion is "helpfulness"
        "criteria",
        # Or you can configure the evaluator
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("misogyny"),
        RunEvalConfig.Criteria(
            {
                "cliche": "Are the lyrics cliche? "
                "Respond Y if they are, N if they're entirely unique."
            }
        ),
    ]
)

run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=llm,
    evaluation=eval_config,
)

View the evaluation results for project 'diligent-chicken-30' at:
https://smith.langchain.com/o/d8d276bd-e36b-55d0-b03a-1ccc6277cbef/datasets/dd500565-01a0-405a-8631-4bf8f5bb7b50/compare?selectedSessions=d1f783c1-4336-41c4-9218-1055e8d066a7

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/d8d276bd-e36b-55d0-b03a-1ccc6277cbef/datasets/dd500565-01a0-405a-8631-4bf8f5bb7b50
[>                                                 ] 0/4

LLM failed for example acd1f7d1-efe5-4e4e-beae-28784bcfc5a1 with inputs {'question': 'a rap battle between Barbie and Oppenheimer'}
Error Type: ValueError, Message: Azure has not provided the response due to a content filter being triggered


[------------------------------------------------->] 4/4

{'project_name': 'diligent-chicken-30',
 'results': {'14c1b691-ae2b-4391-91e2-5138b5d5f1e0': {'input': {'question': 'a rap battle between Aubrey Plaza and Stephen Colbert'},
   'feedback': [EvaluationResult(key='helpfulness', score=None, value=None, comment="Error evaluating run e3dbe4b5-613c-4436-8eb3-8fc13ace3f0c: Error code: 401 - {'error': {'message': 'Incorrect API key provided: 71fea614********************c37f. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}", correction=None, evaluator_info={}, feedback_config=None, source_run_id=None, target_run_id=None),
    EvaluationResult(key='harmfulness', score=None, value=None, comment="Error evaluating run e3dbe4b5-613c-4436-8eb3-8fc13ace3f0c: Error code: 401 - {'error': {'message': 'Incorrect API key provided: 71fea614********************c37f. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'inva

## Different Ways of Creating Datasets in LangSmith


### 1. Create a Dataset From a List of Examples (Key-Value Pairs)


In [39]:
example_inputs = [
    ("What is the largest mammal?", "The blue whale"),
    ("What do mammals and birds have in common?", "They are both warm-blooded"),
    ("What are reptiles known for?", "Having scales"),
    (
        "What's the main characteristic of amphibians?",
        "They live both in water and on land",
    ),
]

dataset_name = "Elementary Animal Questions"

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions and answers about animal phylogenetics.",
)

for input_prompt, output_answer in example_inputs:
    client.create_example(
        inputs={"question": input_prompt},
        outputs={"answer": output_answer},
        dataset_id=dataset.id,
    )

### 2. Create a Dataset From Existing Runs


In [41]:
dataset_name = "Example Dataset 2"

# Filter runs to add to the dataset
runs = client.list_runs(
    project_name="TEST LLM PROJECT",
    execution_order=1,
    error=False,
)

dataset = client.create_dataset(dataset_name, description="An example dataset")

for run in runs:
    client.create_example(
        inputs=run.inputs,
        outputs=run.outputs,
        dataset_id=dataset.id,
    )

## Correctness: LangSmith Question-Answer Evaluation


In [42]:
# 1. Evaluate Datasets That Contain Labels

evaluation_config = RunEvalConfig(
    evaluators=[
        "qa",  # correctness: right or wrong
        "context_qa",  # refer to example outputs
        "cot_qa",  # context_qa + reasoning
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Elementary Animal Questions",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'ordinary-play-42' at:
https://smith.langchain.com/o/d8d276bd-e36b-55d0-b03a-1ccc6277cbef/datasets/a727245b-2dea-400e-85fc-e71ca73d8081/compare?selectedSessions=bf4dc105-bbe2-4090-bb72-d3ffc5b2b143

View all tests for Dataset Elementary Animal Questions at:
https://smith.langchain.com/o/d8d276bd-e36b-55d0-b03a-1ccc6277cbef/datasets/a727245b-2dea-400e-85fc-e71ca73d8081
[------------------------------------------------->] 4/4

{'project_name': 'ordinary-play-42',
 'results': {'94b75199-7762-4d79-9b31-b937f4b244d2': {'input': {'question': "What's the main characteristic of amphibians?"},
   'feedback': [EvaluationResult(key='correctness', score=None, value=None, comment="Error evaluating run 934403fd-c1a2-44c2-8f15-c02d095fb5cd: Error code: 401 - {'error': {'message': 'Incorrect API key provided: 71fea614********************c37f. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}", correction=None, evaluator_info={}, feedback_config=None, source_run_id=None, target_run_id=None),
    EvaluationResult(key='Contextual Accuracy', score=None, value=None, comment="Error evaluating run 934403fd-c1a2-44c2-8f15-c02d095fb5cd: Error code: 401 - {'error': {'message': 'Incorrect API key provided: 71fea614********************c37f. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid