# Martian SDK Quickstart Guide

In [None]:
# Imports
import json
import logging
import statistics
import sys
from typing import List

import openai
from openai.types.chat import (
    chat_completion,
    chat_completion_message,
)
import pandas as pd
import sklearn.metrics

from martian_apart_hack_sdk import exceptions, judge_specs, martian_client, utils
from martian_apart_hack_sdk.models import judge_evaluation, llm_models, router_constraints

## Load Credentials
You must have a .env file with the following values set:

1. `MARTIAN_API_URL` - withmartian.com/api
1. `MARTIAN_API_KEY` - your personal API key

In [8]:
# Read the API URL and key through the SDK's config loader, then build the
# Martian client that the rest of the notebook uses.
config = utils.load_config()
client = martian_client.MartianClient(api_url=config.api_url, api_key=config.api_key)

In [19]:
# Sanity check: ask the organization endpoint for the remaining credit balance
# and print it, confirming that the client is configured and that we have
# credits before issuing any completion or judge calls.
credit_balance = client.organization.get_credit_balance()
print(credit_balance)

OrganizationBalance(credits=50.0)


## Martian Gateway

You can use Martian as a gateway to access a number of different LLM providers.

To do so, you start by making an OpenAI client with the base_url set to the Martian API URL + "/openai/v2".

Then you can use the client as you would when working with OpenAI.

The available models are:

In [10]:
# Groups of model ids to list, keyed by a display heading. The per-provider
# groups are left commented out; only the combined list is printed.
PROVIDERS = {
    # "OpenAI": llm_models.OPENAI_MODELS,
    # "Anthropic": llm_models.ANTHROPIC_MODELS,
    # "Together": llm_models.TOGETHER_MODELS,
    # "Google Gemini": llm_models.GEMINI_MODELS,
    "ALL": llm_models.ALL_MODELS,
}

for provider_name, model_ids in PROVIDERS.items():
    # Heading, one bullet per model id, then a blank separator line.
    listing = [f'{provider_name}:']
    listing.extend(f'  - {model_id}' for model_id in model_ids)
    listing.append("")
    print("\n".join(listing))
print("You could also pick any model from llm_models.ALL_MODELS")

ALL:
  - together/Qwen/Qwen2.5-72B-Instruct-Turbo
  - together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
  - together/google/gemma-2-27b-it
  - gemini/gemini/gemini-1.5-pro
  - gemini/gemini/gemini-2.0-flash
  - together/mistralai/Mistral-Small-24B-Instruct-2501
  - openai/openai/gpt-4.1
  - openai/openai/gpt-4.5-preview
  - gemini/gemini/gemini-1.5-flash-8b-latest
  - anthropic/anthropic/claude-3-opus-latest
  - together/meta-llama/Llama-3.3-70B-Instruct-Turbo
  - together/deepseek-ai/DeepSeek-V3
  - together/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
  - openai/openai/gpt-4o-mini
  - gemini/gemini/gemini-1.5-flash-8b
  - together/deepseek-ai/DeepSeek-R1
  - gemini/gemini/gemini-1.5-flash-latest
  - anthropic/anthropic/claude-3-5-haiku-latest
  - gemini/gemini/gemini-1.5-flash
  - openai/openai/gpt-4o
  - anthropic/anthropic/claude-3-5-sonnet-latest
  - openai/openai/gpt-4.1-nano
  - gemini/gemini/gemini-1.5-pro-latest
  - together/Qwen/Qwen2.5-Coder-32B-Instruct
  - openai/openai/

In [20]:
# Instructions handed to the rubric judge. The judge is asked to grade a
# response that follows the "ANSWER: ... / REASON: ..." format on a 0-3 scale
# and to show its reasoning. This text is passed as the `rubric` of the judge
# spec, so any edit to it changes how responses are scored.
# NOTE(review): the "0" bullet and the final "Show the logic chain" bullet
# appear to contain an invisible character right after the dash (looks like a
# copy/paste artifact) — confirm before cleaning up.
rubric = """
You are tasked with evaluating whether the answer of a model is correct to the question and whether the reason make sense. You will be doing this by reading the REASON part of the answer and evaluate it with the question to see if the reason make sense and if it actually leads to the answer, and check if the ANSWER part corresponds to the reason.
The scoring is as follows:
- ⁠0 when the reason does not make sense and the answer is wrong.
- 1 when the reason is correct according to the context but has nothing to do with the question, and the answer is wrong.
- 2 when the reason makes sense and lead to the expected answer, but the answer is wrong.
- 3 when the reason makes sense and lead to the expected answer, and the answer is correct and corresponding to the reason.
- ⁠Show the logic chain you use that lead to your judge.
"""

# Settings for the rubric judge: which model does the grading, the rubric text
# it follows, and the score range (0-3) that the rubric defines.
_judge_settings = {
    "model_type": "rubric_judge",
    "rubric": rubric,
    "model": "openai/openai/gpt-4o",
    "min_score": 0,
    "max_score": 3,
}
rubric_judge_spec = judge_specs.RubricJudgeSpec(**_judge_settings)

In [21]:
# Run the judge spec.
# NOTE: this function's name shadows Python's built-in eval() for the rest of
# the notebook; it is kept because later cells call it by this name.
def eval(question, answer):
    """Score one question/answer pair with the rubric judge.

    The answer text is wrapped in a synthetic ChatCompletion (fixed id, model
    and timestamp) so that it can be submitted to the judge together with a
    request payload that carries the question as the single user message.

    Args:
        question: Prompt text that was sent to the model under evaluation.
        answer: Text the model produced in response to ``question``.

    Returns:
        The judge's score converted with ``int()``.
    """
    request_payload = {
        "model": llm_models.GPT_4O_MINI,
        "messages": [{"role": "user", "content": question}],
    }

    # Build the response object inside-out: message -> choice -> completion.
    judged_message = chat_completion_message.ChatCompletionMessage(
        role="assistant",
        content=answer,
    )
    judged_choice = chat_completion.Choice(
        finish_reason="stop",
        index=0,
        message=judged_message,
    )
    synthetic_response = chat_completion.ChatCompletion(
        id="123",
        choices=[judged_choice],
        created=0,
        model="gpt-4o",
        object="chat.completion",
        service_tier=None,
    )

    verdict = client.judges.evaluate_using_judge_spec(
        rubric_judge_spec.to_dict(),
        completion_request=request_payload,
        completion_response=synthetic_response,
    )
    return int(verdict.score)

In [None]:
# Create the OpenAI-compatible client that sends requests through the Martian gateway.
openai_client = openai.OpenAI(
    api_key=config.api_key,
    base_url=config.api_url + "/openai/v2"
)

# Number of leading XQuAD rows scored per language. Every row costs one
# completion call plus one judge call for each candidate model.
SAMPLE_SIZE = 100

# Short label -> gateway model id of each candidate being compared.
CANDIDATE_MODELS = {
    "gemini": "gemini/gemini/gemini-1.5-flash-8b",
    "gpt": "openai/openai/gpt-4.1-nano",
    "llama": "together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
}

# Location of the per-language XQuAD validation split.
XQUAD_PARQUET_URL = "hf://datasets/google/xquad/xquad.{language}/validation-00000-of-00001.parquet"


def _to_records(frame):
    """Return ``[context, question, answers]`` lists, one per row of ``frame``."""
    return [
        [context, question, answers]
        for context, question, answers in zip(
            frame["context"], frame["question"], frame["answers"]
        )
    ]


def _score_models(records, models=None):
    """Query every candidate model on every record and tally the judge scores.

    Args:
        records: Iterable of ``[context, question, answers]`` triples.
        models: Mapping of label -> gateway model id; defaults to
            ``CANDIDATE_MODELS``.

    Returns:
        Dict mapping each label to a histogram ``{"0": n, "1": n, "2": n, "3": n}``.
        A score outside 0-3 is counted under its own key rather than being
        dropped, so each histogram always sums to the number of records.
    """
    if models is None:
        models = CANDIDATE_MODELS
    tallies = {label: {"0": 0, "1": 0, "2": 0, "3": 0} for label in models}
    for context, question, _answers in records:
        prompt = f"Original text:{context}\n{question}\nAnswer in the language of the original text, answer in the following format:\nANSWER: your answer.\nREASON: provide your reason."
        for label, model_id in models.items():
            response = openai_client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
            )
            # `eval` here is the notebook's judge helper, not the Python built-in.
            score_key = str(eval(prompt, response.choices[0].message.content))
            tallies[label][score_key] = tallies[label].get(score_key, 0) + 1
    return tallies


# English split. The `_5` suffix in the frame names is historical; the frames
# hold SAMPLE_SIZE rows.
df_en = pd.read_parquet(XQUAD_PARQUET_URL.format(language="en"))
df_en_5 = df_en.head(SAMPLE_SIZE)
results_en = _to_records(df_en_5)
scores_en = _score_models(results_en)
score_gemini_en = scores_en["gemini"]
score_gpt_en = scores_en["gpt"]
score_llama_en = scores_en["llama"]

# Chinese split, scored exactly the same way.
df_zh = pd.read_parquet(XQUAD_PARQUET_URL.format(language="zh"))
df_zh_5 = df_zh.head(SAMPLE_SIZE)
results_zh = _to_records(df_zh_5)
scores_zh = _score_models(results_zh)
score_gemini_zh = scores_zh["gemini"]
score_gpt_zh = scores_zh["gpt"]
score_llama_zh = scores_zh["llama"]

# Report the score histograms. The GPT label names the model actually
# queried (gpt-4.1-nano).
print("EN")
print(f"Gemini: {score_gemini_en}")
print(f"GPT-4.1-nano: {score_gpt_en}")
print(f"llama: {score_llama_en}")
print("ZH")
print(f"Gemini: {score_gemini_zh}")
print(f"GPT-4.1-nano: {score_gpt_zh}")
print(f"llama: {score_llama_zh}")