@@ -3,6 +3,7 @@ package eval
3
3
4
4
import (
5
5
"context"
6
+ "encoding/json"
6
7
"errors"
7
8
"fmt"
8
9
"strings"
@@ -15,9 +16,22 @@ import (
15
16
"github.com/spf13/cobra"
16
17
)
17
18
18
- // EvaluationPromptFile represents the structure of a prompt.yml file for evaluation
19
- // It extends the base prompt.File with evaluation-specific fields
20
- type EvaluationPromptFile = prompt.File
19
+ // EvaluationSummary represents the overall evaluation summary
20
+ type EvaluationSummary struct {
21
+ Name string `json:"name"`
22
+ Description string `json:"description"`
23
+ Model string `json:"model"`
24
+ TestResults []TestResult `json:"testResults"`
25
+ Summary Summary `json:"summary"`
26
+ }
27
+
28
// Summary holds the aggregate statistics of an evaluation run.
type Summary struct {
	// TotalTests is the number of test cases in the evaluation.
	TotalTests int `json:"totalTests"`
	// PassedTests is the number of test cases in which every evaluator passed.
	PassedTests int `json:"passedTests"`
	// FailedTests is the number of test cases that did not pass.
	FailedTests int `json:"failedTests"`
	// PassRate is the share of passed test cases expressed as a percentage.
	PassRate float64 `json:"passRate"`
}
21
35
22
36
// TestResult represents the result of running a test case
23
37
type TestResult struct {
@@ -61,12 +75,23 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
61
75
- name: contains-hello
62
76
string:
63
77
contains: "hello"
78
+
79
+ By default, results are displayed in a human-readable format. Use the --json flag
80
+ to output structured JSON data for programmatic use or integration with CI/CD pipelines.
81
+
82
+ See https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories#supported-file-format for more information.
64
83
` ),
65
84
Example : "gh models eval my_prompt.prompt.yml" ,
66
85
Args : cobra .ExactArgs (1 ),
67
86
RunE : func (cmd * cobra.Command , args []string ) error {
68
87
promptFilePath := args [0 ]
69
88
89
+ // Get the json flag
90
+ jsonOutput , err := cmd .Flags ().GetBool ("json" )
91
+ if err != nil {
92
+ return err
93
+ }
94
+
70
95
// Load the evaluation prompt file
71
96
evalFile , err := loadEvaluationPromptFile (promptFilePath )
72
97
if err != nil {
@@ -75,25 +100,28 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
75
100
76
101
// Run evaluation
77
102
handler := & evalCommandHandler {
78
- cfg : cfg ,
79
- client : cfg .Client ,
80
- evalFile : evalFile ,
103
+ cfg : cfg ,
104
+ client : cfg .Client ,
105
+ evalFile : evalFile ,
106
+ jsonOutput : jsonOutput ,
81
107
}
82
108
83
109
return handler .runEvaluation (cmd .Context ())
84
110
},
85
111
}
86
112
113
+ cmd .Flags ().Bool ("json" , false , "Output results in JSON format" )
87
114
return cmd
88
115
}
89
116
90
117
type evalCommandHandler struct {
91
- cfg * command.Config
92
- client azuremodels.Client
93
- evalFile * EvaluationPromptFile
118
+ cfg * command.Config
119
+ client azuremodels.Client
120
+ evalFile * prompt.File
121
+ jsonOutput bool
94
122
}
95
123
96
- func loadEvaluationPromptFile (filePath string ) (* EvaluationPromptFile , error ) {
124
+ func loadEvaluationPromptFile (filePath string ) (* prompt. File , error ) {
97
125
evalFile , err := prompt .LoadFromFile (filePath )
98
126
if err != nil {
99
127
return nil , fmt .Errorf ("failed to load prompt file: %w" , err )
@@ -103,23 +131,31 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
103
131
}
104
132
105
133
func (h * evalCommandHandler ) runEvaluation (ctx context.Context ) error {
106
- h .cfg .WriteToOut (fmt .Sprintf ("Running evaluation: %s\n " , h .evalFile .Name ))
107
- h .cfg .WriteToOut (fmt .Sprintf ("Description: %s\n " , h .evalFile .Description ))
108
- h .cfg .WriteToOut (fmt .Sprintf ("Model: %s\n " , h .evalFile .Model ))
109
- h .cfg .WriteToOut (fmt .Sprintf ("Test cases: %d\n " , len (h .evalFile .TestData )))
110
- h .cfg .WriteToOut ("\n " )
134
+ // Print header info only for human-readable output
135
+ if ! h .jsonOutput {
136
+ h .cfg .WriteToOut (fmt .Sprintf ("Running evaluation: %s\n " , h .evalFile .Name ))
137
+ h .cfg .WriteToOut (fmt .Sprintf ("Description: %s\n " , h .evalFile .Description ))
138
+ h .cfg .WriteToOut (fmt .Sprintf ("Model: %s\n " , h .evalFile .Model ))
139
+ h .cfg .WriteToOut (fmt .Sprintf ("Test cases: %d\n " , len (h .evalFile .TestData )))
140
+ h .cfg .WriteToOut ("\n " )
141
+ }
111
142
143
+ var testResults []TestResult
112
144
passedTests := 0
113
145
totalTests := len (h .evalFile .TestData )
114
146
115
147
for i , testCase := range h .evalFile .TestData {
116
- h .cfg .WriteToOut (fmt .Sprintf ("Running test case %d/%d...\n " , i + 1 , totalTests ))
148
+ if ! h .jsonOutput {
149
+ h .cfg .WriteToOut (fmt .Sprintf ("Running test case %d/%d...\n " , i + 1 , totalTests ))
150
+ }
117
151
118
152
result , err := h .runTestCase (ctx , testCase )
119
153
if err != nil {
120
154
return fmt .Errorf ("test case %d failed: %w" , i + 1 , err )
121
155
}
122
156
157
+ testResults = append (testResults , result )
158
+
123
159
// Check if all evaluators passed
124
160
testPassed := true
125
161
for _ , evalResult := range result .EvaluationResults {
@@ -131,48 +167,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
131
167
132
168
if testPassed {
133
169
passedTests ++
134
- h .cfg .WriteToOut (" ✓ PASSED\n " )
135
- } else {
136
- h .cfg .WriteToOut (" ✗ FAILED\n " )
137
- // Show the first 100 characters of the model response when test fails
138
- preview := result .ModelResponse
139
- if len (preview ) > 100 {
140
- preview = preview [:100 ] + "..."
141
- }
142
- h .cfg .WriteToOut (fmt .Sprintf (" Model Response: %s\n " , preview ))
143
170
}
144
171
145
- // Show evaluation details
146
- for _ , evalResult := range result .EvaluationResults {
147
- status := "✓"
148
- if ! evalResult .Passed {
149
- status = "✗"
150
- }
151
- h .cfg .WriteToOut (fmt .Sprintf (" %s %s (score: %.2f)\n " ,
152
- status , evalResult .EvaluatorName , evalResult .Score ))
153
- if evalResult .Details != "" {
154
- h .cfg .WriteToOut (fmt .Sprintf (" %s\n " , evalResult .Details ))
155
- }
172
+ if ! h .jsonOutput {
173
+ h .printTestResult (result , testPassed )
156
174
}
157
- h .cfg .WriteToOut ("\n " )
158
175
}
159
176
177
+ // Calculate pass rate
178
+ passRate := 100.0
179
+ if totalTests > 0 {
180
+ passRate = float64 (passedTests ) / float64 (totalTests ) * 100
181
+ }
182
+
183
+ if h .jsonOutput {
184
+ // Output JSON format
185
+ summary := EvaluationSummary {
186
+ Name : h .evalFile .Name ,
187
+ Description : h .evalFile .Description ,
188
+ Model : h .evalFile .Model ,
189
+ TestResults : testResults ,
190
+ Summary : Summary {
191
+ TotalTests : totalTests ,
192
+ PassedTests : passedTests ,
193
+ FailedTests : totalTests - passedTests ,
194
+ PassRate : passRate ,
195
+ },
196
+ }
197
+
198
+ jsonData , err := json .MarshalIndent (summary , "" , " " )
199
+ if err != nil {
200
+ return fmt .Errorf ("failed to marshal JSON: %w" , err )
201
+ }
202
+
203
+ h .cfg .WriteToOut (string (jsonData ) + "\n " )
204
+ } else {
205
+ // Output human-readable format summary
206
+ h .printSummary (passedTests , totalTests , passRate )
207
+ }
208
+
209
+ return nil
210
+ }
211
+
212
+ func (h * evalCommandHandler ) printTestResult (result TestResult , testPassed bool ) {
213
+ if testPassed {
214
+ h .cfg .WriteToOut (" ✓ PASSED\n " )
215
+ } else {
216
+ h .cfg .WriteToOut (" ✗ FAILED\n " )
217
+ // Show the first 100 characters of the model response when test fails
218
+ preview := result .ModelResponse
219
+ if len (preview ) > 100 {
220
+ preview = preview [:100 ] + "..."
221
+ }
222
+ h .cfg .WriteToOut (fmt .Sprintf (" Model Response: %s\n " , preview ))
223
+ }
224
+
225
+ // Show evaluation details
226
+ for _ , evalResult := range result .EvaluationResults {
227
+ status := "✓"
228
+ if ! evalResult .Passed {
229
+ status = "✗"
230
+ }
231
+ h .cfg .WriteToOut (fmt .Sprintf (" %s %s (score: %.2f)\n " ,
232
+ status , evalResult .EvaluatorName , evalResult .Score ))
233
+ if evalResult .Details != "" {
234
+ h .cfg .WriteToOut (fmt .Sprintf (" %s\n " , evalResult .Details ))
235
+ }
236
+ }
237
+ h .cfg .WriteToOut ("\n " )
238
+ }
239
+
240
+ func (h * evalCommandHandler ) printSummary (passedTests , totalTests int , passRate float64 ) {
160
241
// Summary
161
242
h .cfg .WriteToOut ("Evaluation Summary:\n " )
162
243
if totalTests == 0 {
163
- h .cfg .WriteToOut ("Passed: 0/0 (0.0 %)\n " )
244
+ h .cfg .WriteToOut ("Passed: 0/0 (0.00 %)\n " )
164
245
} else {
165
- h .cfg .WriteToOut (fmt .Sprintf ("Passed: %d/%d (%.1f %%)\n " ,
166
- passedTests , totalTests , float64 ( passedTests ) / float64 ( totalTests ) * 100 ))
246
+ h .cfg .WriteToOut (fmt .Sprintf ("Passed: %d/%d (%.2f %%)\n " ,
247
+ passedTests , totalTests , passRate ))
167
248
}
168
249
169
250
if passedTests == totalTests {
170
251
h .cfg .WriteToOut ("🎉 All tests passed!\n " )
171
252
} else {
172
253
h .cfg .WriteToOut ("❌ Some tests failed.\n " )
173
254
}
174
-
175
- return nil
176
255
}
177
256
178
257
func (h * evalCommandHandler ) runTestCase (ctx context.Context , testCase map [string ]interface {}) (TestResult , error ) {
@@ -210,16 +289,9 @@ func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) (
210
289
return nil , fmt .Errorf ("failed to template message content: %w" , err )
211
290
}
212
291
213
- var role azuremodels.ChatMessageRole
214
- switch strings .ToLower (msg .Role ) {
215
- case "system" :
216
- role = azuremodels .ChatMessageRoleSystem
217
- case "user" :
218
- role = azuremodels .ChatMessageRoleUser
219
- case "assistant" :
220
- role = azuremodels .ChatMessageRoleAssistant
221
- default :
222
- return nil , fmt .Errorf ("unknown message role: %s" , msg .Role )
292
+ role , err := prompt .GetAzureChatMessageRole (msg .Role )
293
+ if err != nil {
294
+ return nil , err
223
295
}
224
296
225
297
messages = append (messages , azuremodels.ChatMessage {
@@ -236,22 +308,7 @@ func (h *evalCommandHandler) templateString(templateStr string, data map[string]
236
308
}
237
309
238
310
func (h * evalCommandHandler ) callModel (ctx context.Context , messages []azuremodels.ChatMessage ) (string , error ) {
239
- req := azuremodels.ChatCompletionOptions {
240
- Messages : messages ,
241
- Model : h .evalFile .Model ,
242
- Stream : false ,
243
- }
244
-
245
- // Apply model parameters
246
- if h .evalFile .ModelParameters .MaxTokens != nil {
247
- req .MaxTokens = h .evalFile .ModelParameters .MaxTokens
248
- }
249
- if h .evalFile .ModelParameters .Temperature != nil {
250
- req .Temperature = h .evalFile .ModelParameters .Temperature
251
- }
252
- if h .evalFile .ModelParameters .TopP != nil {
253
- req .TopP = h .evalFile .ModelParameters .TopP
254
- }
311
+ req := h .evalFile .BuildChatCompletionOptions (messages )
255
312
256
313
resp , err := h .client .GetChatCompletionStream (ctx , req )
257
314
if err != nil {
0 commit comments