# Summarization
Reference tutorial (Arize Phoenix docs): https://docs.arize.com/phoenix/datasets-and-experiments/use-cases-datasets/summarization

## Install Dependencies and Import Libraries

In [6]:
from typing import Any, Dict

import nest_asyncio
import pandas as pd

# Allow the asyncio event loop to be re-entered; needed for concurrent evals in
# notebook environments.
nest_asyncio.apply()
# Remove the column-width limit so dataframe cells are displayed in full.
pd.set_option("display.max_colwidth", None)

In [7]:
## Instrument Your Application

In [8]:
from openinference.instrumentation.openai import OpenAIInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

# OTLP/HTTP trace endpoint that spans are exported to.
endpoint = "http://phoenix:6006/v1/traces"
tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))

# Make the cell idempotent. Calling instrument() on an already-instrumented
# instrumentor only logs "Attempting to instrument while already instrumented"
# and keeps the previous tracer provider, so a re-run (e.g. after editing
# `endpoint`) would silently have no effect. Undo the earlier instrumentation
# first so the provider built above is the one in use.
openai_instrumentor = OpenAIInstrumentor()
if openai_instrumentor.is_instrumented_by_opentelemetry:
    openai_instrumentor.uninstrument()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

Attempting to instrument while already instrumented


In [9]:
## Create Your Dataset

In [10]:
import phoenix as px
from datetime import datetime

from datasets import load_dataset

# Fetch version 3.0.0 of the CNN/DailyMail corpus through the `datasets` library.
hf_ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Draw a seeded (reproducible) sample of 10 rows from the test split ...
df = hf_ds["test"].to_pandas().sample(n=10, random_state=0)
# ... index it by "id" and expose the "highlights" column under the name "summary".
df = df.set_index("id").rename(columns={"highlights": "summary"})

# Timestamp suffix used in the dataset name below.
now = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

# Upload the sample: "article" is the example input, "summary" the expected output.
dataset = px.Client(endpoint="http://phoenix:6006").upload_dataset(
    dataset_name=f"news-article-summaries-{now}",
    dataframe=df,
    input_keys=["article"],
    output_keys=["summary"],
)

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

📤 Uploading dataset...
💾 Examples uploaded: http://phoenix:6006/datasets/RGF0YXNldDo0/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Ng==


## Define Your Experiment Task

In [11]:
import openai
# from phoenix.experiments import Example
import os

# OpenAI client configured from the environment: OPENAI_API_KEY supplies the key and
# OPENAI_API_BASE supplies the custom API base URL.
openai_client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_API_BASE"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

async def summarize_article_openai(example, prompt_template: str, model: str) -> str:
    """Summarize the article of one dataset example with an OpenAI chat model.

    Args:
        example: Dataset example; ``example.input["article"]`` is substituted
            into the prompt.
        prompt_template: ``str.format`` template containing an ``{article}``
            placeholder.
        model: Model name forwarded to the chat completions API.

    Returns:
        The text content of the first choice in the response.

    Raises:
        ValueError: If the response contains no choices, or the first choice
            carries no text content.
    """
    import asyncio  # local import so this cell does not depend on another cell's imports

    prompt = prompt_template.format(article=example.input["article"])
    # `openai_client` is a synchronous client: calling it directly inside a coroutine
    # would block the event loop for the whole HTTP round trip and serialize every
    # concurrently scheduled task. Run the call in a worker thread instead.
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model=model,
        messages=[
            # The instruction is a request made TO the model, so it is a "user"
            # message; "assistant" would present it as the model's own prior output.
            {"role": "user", "content": prompt},
        ],
    )
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not response.choices:
        raise ValueError("OpenAI response contained no choices")
    summary = response.choices[0].message.content
    if summary is None:
        # Fail here with a clear message rather than handing None to the evaluators.
        raise ValueError("OpenAI response choice contained no text content")
    return summary

In [12]:
import textwrap
from functools import partial

# Prompt template; the `{article}` placeholder is filled in by
# `summarize_article_openai` via `str.format`.
template = """
Summarize the article in two to four sentences:

ARTICLE
=======
{article}

SUMMARY
=======
"""
# Bind the template and model so that the task only takes the dataset example.
task = partial(summarize_article_openai, prompt_template=template, model="o1")
# Smoke-test the task on the first dataset example before running an experiment.
test_example = dataset[0]
print(textwrap.fill(await task(test_example), width=100))

Tiger Woods' return to competitive golf at Augusta was marked by inconsistency. Despite his
impressive short game, Woods struggled with accuracy off the tee and experienced moments of
frustration. While he showed glimpses of his former brilliance with remarkable shots, his overall
performance reflected a golfer still battling against age and past injuries.  Woods finished the
first round one over par, highlighting both his resilience and the challenges he faces in regaining
his dominant form.


## Define Your Evaluators

In [13]:
import tiktoken
from rouge import Rouge


# convenience functions
def _rouge_1(hypothesis: str, reference: str) -> Dict[str, Any]:
    scores = Rouge().get_scores(hypothesis, reference)
    return scores[0]["rouge-1"]


def _rouge_1_f1_score(hypothesis: str, reference: str) -> float:
    """Return the "f" (F1) component of the ROUGE-1 scores for the given pair."""
    rouge_1_scores = _rouge_1(hypothesis, reference)
    return rouge_1_scores["f"]


def _rouge_1_precision(hypothesis: str, reference: str) -> float:
    """Return the "p" (precision) component of the ROUGE-1 scores for the given pair."""
    rouge_1_scores = _rouge_1(hypothesis, reference)
    return rouge_1_scores["p"]


def _rouge_1_recall(hypothesis: str, reference: str) -> float:
    """Return the "r" (recall) component of the ROUGE-1 scores for the given pair."""
    rouge_1_scores = _rouge_1(hypothesis, reference)
    return rouge_1_scores["r"]


# evaluators
def rouge_1_f1_score(output: str, expected: Dict[str, Any]) -> float:
    """Evaluator: ROUGE-1 F1 of the task output against ``expected["summary"]``."""
    reference_summary = expected["summary"]
    return _rouge_1_f1_score(hypothesis=output, reference=reference_summary)


def rouge_1_precision(output: str, expected: Dict[str, Any]) -> float:
    """Evaluator: ROUGE-1 precision of the task output against ``expected["summary"]``."""
    reference_summary = expected["summary"]
    return _rouge_1_precision(hypothesis=output, reference=reference_summary)


def rouge_1_recall(output: str, expected: Dict[str, Any]) -> float:
    """Evaluator: ROUGE-1 recall of the task output against ``expected["summary"]``."""
    reference_summary = expected["summary"]
    return _rouge_1_recall(hypothesis=output, reference=reference_summary)


def num_tokens(output: str) -> int:
    """Evaluator: number of tokens in the task output under the "gpt-4o" encoding."""
    token_ids = tiktoken.encoding_for_model("gpt-4o").encode(output)
    return len(token_ids)


# Evaluators passed to run_experiment: three ROUGE-1 metrics plus output length in tokens.
EVALUATORS = [rouge_1_f1_score, rouge_1_precision, rouge_1_recall, num_tokens]

## Run Experiments and Iterate on Your Prompt Template

In [14]:
from phoenix.experiments import run_experiment

# Run `task` over the uploaded dataset and score the outputs with EVALUATORS.
# NOTE(review): the "model" metadata below is a hand-written label; keep it in sync
# with the model bound into `task`.
experiment_results = run_experiment(
    dataset,
    task,
    experiment_name="initial-template",
    experiment_description="first experiment using a simple prompt template",
    experiment_metadata={"vendor": "openai", "model": "o1"},
    evaluators=EVALUATORS,
)

🧪 Experiment started.
📺 View dataset experiments: http://phoenix:6006/datasets/RGF0YXNldDo0/experiments
🔗 View this experiment: http://phoenix:6006/datasets/RGF0YXNldDo0/compare?experimentId=RXhwZXJpbWVudDo3


running tasks |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s

✅ Task runs completed.
🧠 Evaluation started.


running experiment evaluations |          | 0/40 (0.0%) | ⏳ 00:00<? | ?it/s


🔗 View this experiment: http://phoenix:6006/datasets/RGF0YXNldDo0/compare?experimentId=RXhwZXJpbWVudDo3

Experiment Summary (03/10/25 06:50 PM +0000)
--------------------------------------------
           evaluator   n  n_scores   avg_score
0         num_tokens  10        10  108.500000
1   rouge_1_f1_score  10        10    0.299874
2  rouge_1_precision  10        10    0.254611
3     rouge_1_recall  10        10    0.377769

Tasks Summary (03/10/25 06:50 PM +0000)
---------------------------------------
   n_examples  n_runs  n_errors
0          10      10         0
