
Commit d7631e6

Add a --json mode to evals (#55)
2 parents: 392c27b + 41f4662

File tree: 5 files changed, +439 -72 lines

README.md

Lines changed: 14 additions & 0 deletions
@@ -66,6 +66,20 @@ Run the extension with output from a command. This uses single-shot mode.
 cat README.md | gh models run openai/gpt-4o-mini "summarize this text"
 ```
 
+#### Evaluating prompts
+
+Run evaluation tests against a model using a `.prompt.yml` file:
+```shell
+gh models eval my_prompt.prompt.yml
+```
+
+The evaluation will run test cases defined in the prompt file and display results in a human-readable format. For programmatic use, you can output results in JSON format:
+```shell
+gh models eval my_prompt.prompt.yml --json
+```
+
+The JSON output includes detailed test results, evaluation scores, and summary statistics that can be processed by other tools or CI/CD pipelines.
+
 ## Notice
 
 Remember when interacting with a model you are experimenting with AI, so content mistakes are possible. The feature is
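
For reference, the `--json` output follows the shape of the `EvaluationSummary` and `Summary` structs added in `cmd/eval/eval.go` below. A sketch with illustrative values only (the `testResults` entries are elided; their exact fields come from the `TestResult` struct):

```json
{
  "name": "my evaluation",
  "description": "example prompt evaluation",
  "model": "openai/gpt-4o-mini",
  "testResults": [],
  "summary": {
    "totalTests": 4,
    "passedTests": 2,
    "failedTests": 2,
    "passRate": 50
  }
}
```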

cmd/eval/eval.go

Lines changed: 125 additions & 68 deletions
@@ -3,6 +3,7 @@ package eval
 
 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
@@ -15,9 +16,22 @@ import (
 	"github.com/spf13/cobra"
 )
 
-// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation
-// It extends the base prompt.File with evaluation-specific fields
-type EvaluationPromptFile = prompt.File
+// EvaluationSummary represents the overall evaluation summary
+type EvaluationSummary struct {
+	Name        string       `json:"name"`
+	Description string       `json:"description"`
+	Model       string       `json:"model"`
+	TestResults []TestResult `json:"testResults"`
+	Summary     Summary      `json:"summary"`
+}
+
+// Summary represents the evaluation summary statistics
+type Summary struct {
+	TotalTests  int     `json:"totalTests"`
+	PassedTests int     `json:"passedTests"`
+	FailedTests int     `json:"failedTests"`
+	PassRate    float64 `json:"passRate"`
+}
 
 // TestResult represents the result of running a test case
 type TestResult struct {
@@ -61,12 +75,23 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
 		  - name: contains-hello
 		    string:
 		      contains: "hello"
+
+By default, results are displayed in a human-readable format. Use the --json flag
+to output structured JSON data for programmatic use or integration with CI/CD pipelines.
+
+See https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories#supported-file-format for more information.
 		`),
 		Example: "gh models eval my_prompt.prompt.yml",
 		Args:    cobra.ExactArgs(1),
 		RunE: func(cmd *cobra.Command, args []string) error {
 			promptFilePath := args[0]
 
+			// Get the json flag
+			jsonOutput, err := cmd.Flags().GetBool("json")
+			if err != nil {
+				return err
+			}
+
 			// Load the evaluation prompt file
 			evalFile, err := loadEvaluationPromptFile(promptFilePath)
 			if err != nil {
@@ -75,25 +100,28 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
 
 			// Run evaluation
 			handler := &evalCommandHandler{
-				cfg:      cfg,
-				client:   cfg.Client,
-				evalFile: evalFile,
+				cfg:        cfg,
+				client:     cfg.Client,
+				evalFile:   evalFile,
+				jsonOutput: jsonOutput,
 			}
 
 			return handler.runEvaluation(cmd.Context())
 		},
 	}
 
+	cmd.Flags().Bool("json", false, "Output results in JSON format")
 	return cmd
 }
 
 type evalCommandHandler struct {
-	cfg      *command.Config
-	client   azuremodels.Client
-	evalFile *EvaluationPromptFile
+	cfg        *command.Config
+	client     azuremodels.Client
+	evalFile   *prompt.File
+	jsonOutput bool
 }
 
-func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
+func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
 	evalFile, err := prompt.LoadFromFile(filePath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load prompt file: %w", err)
@@ -103,23 +131,31 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
 }
 
 func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
-	h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
-	h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
-	h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
-	h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
-	h.cfg.WriteToOut("\n")
+	// Print header info only for human-readable output
+	if !h.jsonOutput {
+		h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
+		h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
+		h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
+		h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
+		h.cfg.WriteToOut("\n")
+	}
 
+	var testResults []TestResult
 	passedTests := 0
 	totalTests := len(h.evalFile.TestData)
 
 	for i, testCase := range h.evalFile.TestData {
-		h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
+		if !h.jsonOutput {
+			h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
+		}
 
 		result, err := h.runTestCase(ctx, testCase)
 		if err != nil {
 			return fmt.Errorf("test case %d failed: %w", i+1, err)
 		}
 
+		testResults = append(testResults, result)
+
 		// Check if all evaluators passed
 		testPassed := true
 		for _, evalResult := range result.EvaluationResults {
@@ -131,48 +167,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
 
 		if testPassed {
 			passedTests++
-			h.cfg.WriteToOut(" ✓ PASSED\n")
-		} else {
-			h.cfg.WriteToOut(" ✗ FAILED\n")
-			// Show the first 100 characters of the model response when test fails
-			preview := result.ModelResponse
-			if len(preview) > 100 {
-				preview = preview[:100] + "..."
-			}
-			h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
 		}
 
-		// Show evaluation details
-		for _, evalResult := range result.EvaluationResults {
-			status := "✓"
-			if !evalResult.Passed {
-				status = "✗"
-			}
-			h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
-				status, evalResult.EvaluatorName, evalResult.Score))
-			if evalResult.Details != "" {
-				h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
-			}
+		if !h.jsonOutput {
+			h.printTestResult(result, testPassed)
 		}
-		h.cfg.WriteToOut("\n")
 	}
 
+	// Calculate pass rate
+	passRate := 100.0
+	if totalTests > 0 {
+		passRate = float64(passedTests) / float64(totalTests) * 100
+	}
+
+	if h.jsonOutput {
+		// Output JSON format
+		summary := EvaluationSummary{
+			Name:        h.evalFile.Name,
+			Description: h.evalFile.Description,
+			Model:       h.evalFile.Model,
+			TestResults: testResults,
+			Summary: Summary{
+				TotalTests:  totalTests,
+				PassedTests: passedTests,
+				FailedTests: totalTests - passedTests,
+				PassRate:    passRate,
+			},
+		}
+
+		jsonData, err := json.MarshalIndent(summary, "", "  ")
+		if err != nil {
+			return fmt.Errorf("failed to marshal JSON: %w", err)
+		}
+
+		h.cfg.WriteToOut(string(jsonData) + "\n")
+	} else {
+		// Output human-readable format summary
+		h.printSummary(passedTests, totalTests, passRate)
+	}
+
+	return nil
+}
+
+func (h *evalCommandHandler) printTestResult(result TestResult, testPassed bool) {
+	if testPassed {
+		h.cfg.WriteToOut(" ✓ PASSED\n")
+	} else {
+		h.cfg.WriteToOut(" ✗ FAILED\n")
+		// Show the first 100 characters of the model response when test fails
+		preview := result.ModelResponse
+		if len(preview) > 100 {
+			preview = preview[:100] + "..."
+		}
+		h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
+	}
+
+	// Show evaluation details
+	for _, evalResult := range result.EvaluationResults {
+		status := "✓"
+		if !evalResult.Passed {
+			status = "✗"
+		}
+		h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
+			status, evalResult.EvaluatorName, evalResult.Score))
+		if evalResult.Details != "" {
+			h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
+		}
+	}
+	h.cfg.WriteToOut("\n")
+}
+
+func (h *evalCommandHandler) printSummary(passedTests, totalTests int, passRate float64) {
 	// Summary
 	h.cfg.WriteToOut("Evaluation Summary:\n")
 	if totalTests == 0 {
-		h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n")
+		h.cfg.WriteToOut("Passed: 0/0 (0.00%)\n")
 	} else {
-		h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n",
-			passedTests, totalTests, float64(passedTests)/float64(totalTests)*100))
+		h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.2f%%)\n",
+			passedTests, totalTests, passRate))
 	}
 
 	if passedTests == totalTests {
 		h.cfg.WriteToOut("🎉 All tests passed!\n")
 	} else {
 		h.cfg.WriteToOut("❌ Some tests failed.\n")
 	}
-
-	return nil
 }
 
 func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) {
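
As a worked example of the pass-rate math above: with 2 of 4 tests passing, `passRate` is `float64(2) / float64(4) * 100 = 50`, so `printSummary` would write roughly:

```
Evaluation Summary:
Passed: 2/4 (50.00%)
❌ Some tests failed.
```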
@@ -210,16 +289,9 @@ func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) (
 			return nil, fmt.Errorf("failed to template message content: %w", err)
 		}
 
-		var role azuremodels.ChatMessageRole
-		switch strings.ToLower(msg.Role) {
-		case "system":
-			role = azuremodels.ChatMessageRoleSystem
-		case "user":
-			role = azuremodels.ChatMessageRoleUser
-		case "assistant":
-			role = azuremodels.ChatMessageRoleAssistant
-		default:
-			return nil, fmt.Errorf("unknown message role: %s", msg.Role)
+		role, err := prompt.GetAzureChatMessageRole(msg.Role)
+		if err != nil {
+			return nil, err
 		}
 
 		messages = append(messages, azuremodels.ChatMessage{
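
The role-mapping switch removed above now lives in the shared prompt package, whose implementation is not part of this diff. A minimal sketch of what `prompt.GetAzureChatMessageRole` presumably looks like, reconstructed from the deleted code (assumes the same `azuremodels` constants and the `fmt`/`strings` imports used by the surrounding file):

```go
// Sketch only: the real helper is defined in the prompt package and is not shown in this commit.
func GetAzureChatMessageRole(role string) (azuremodels.ChatMessageRole, error) {
	switch strings.ToLower(role) {
	case "system":
		return azuremodels.ChatMessageRoleSystem, nil
	case "user":
		return azuremodels.ChatMessageRoleUser, nil
	case "assistant":
		return azuremodels.ChatMessageRoleAssistant, nil
	default:
		var zero azuremodels.ChatMessageRole
		return zero, fmt.Errorf("unknown message role: %s", role)
	}
}
```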
@@ -236,22 +308,7 @@ func (h *evalCommandHandler) templateString(templateStr string, data map[string]
 }
 
 func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) {
-	req := azuremodels.ChatCompletionOptions{
-		Messages: messages,
-		Model:    h.evalFile.Model,
-		Stream:   false,
-	}
-
-	// Apply model parameters
-	if h.evalFile.ModelParameters.MaxTokens != nil {
-		req.MaxTokens = h.evalFile.ModelParameters.MaxTokens
-	}
-	if h.evalFile.ModelParameters.Temperature != nil {
-		req.Temperature = h.evalFile.ModelParameters.Temperature
-	}
-	if h.evalFile.ModelParameters.TopP != nil {
-		req.TopP = h.evalFile.ModelParameters.TopP
-	}
+	req := h.evalFile.BuildChatCompletionOptions(messages)
 
 	resp, err := h.client.GetChatCompletionStream(ctx, req)
 	if err != nil {
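
The request construction deleted above is presumably what `BuildChatCompletionOptions` now encapsulates on `prompt.File`. A minimal sketch reconstructed from the removed inline code (the actual method lives in the prompt package, outside this diff):

```go
// Sketch only: reconstructed from the inline code this commit deletes, not the actual implementation.
func (f *File) BuildChatCompletionOptions(messages []azuremodels.ChatMessage) azuremodels.ChatCompletionOptions {
	req := azuremodels.ChatCompletionOptions{
		Messages: messages,
		Model:    f.Model,
		Stream:   false,
	}

	// Apply optional model parameters only when they are set in the prompt file.
	if f.ModelParameters.MaxTokens != nil {
		req.MaxTokens = f.ModelParameters.MaxTokens
	}
	if f.ModelParameters.Temperature != nil {
		req.Temperature = f.ModelParameters.Temperature
	}
	if f.ModelParameters.TopP != nil {
		req.TopP = f.ModelParameters.TopP
	}
	return req
}
```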
