# Calling External model endpoints

## Objective

This tutorial calls multiple model endpoints using URL and auth key. It captures requests and responses.

## Open AI - GPT 2 Model

In [62]:
import requests

API_URL = "https://api-inference.huggingface.co/models/openai-community/gpt2"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": "What is the capital of France?",
})

print("\nResponse :")
output


Response :


[{'generated_text': "What is the capital of France?\n\nThe French and the British world's richest are out of power. That has been explained last year by Quentin Tarantino's films, which claim to give us a grand vision of next decade when'modernism' is, historically speaking, replaced by committed pacifism - or from which this current disturbed world is drawn.\n\nBut what is important about France - and how much we can hold on to it, if people like EU Central Bank Governor Jean-Claude Juncker have their way"}]

## Tiny Lama 1.1B - HuggingFace Endpoint

In [52]:
import requests 

API_URL = "https://api-inference.huggingface.co/models/TinyLlama/TinyLlama-1.1B-Chat-v1.0/v1/chat/completions"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	"messages": [{
		"role": "user", 
		"content": "What is the capital of France?"
		}],
	"max_tokens": 500,
	"stream": False
	})

print("\nResponse :")
output


Response :


{'object': 'chat.completion',
 'id': '',
 'created': 1721931339,
 'model': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
 'system_fingerprint': '2.1.1-sha-4dfdb48',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'The capital of France is Paris.\n\nMore accurate: the capital of France is Paris, France.\n\nIncorrect: The capital of France is Tokyo, Japan as it is not mentioned in the original text.'},
   'logprobs': None,
   'finish_reason': 'eos_token'}],
 'usage': {'prompt_tokens': 30, 'completion_tokens': 45, 'total_tokens': 75}}

## Phi3 Mini - Deployed in Azure AI as Serverless endpoint - Model as a Service (MaaS)

In [53]:
import requests 

# serverless

API_URL = "https://Phi-3-mini-4k-instruct-rqvel.eastus2.models.ai.azure.com/v1/chat/completions"
headers = {"Authorization": "Bearer J6HAqLPf6jyC0ApRXkXRE0cdSpdINcgm"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"messages": [{
		"role": "user", 
		"content": "What is the capital of France?"
		}],
	"max_tokens": 500
	})

print("\nResponse :")
output


Response :


{'choices': [{'finish_reason': 'stop',
   'index': 0,
   'message': {'content': " The capital of France is Paris. It's the country's most populous city and has been a major center of finance, diplomacy, commerce, fashion, science, and the arts for many centuries.",
    'role': 'assistant',
    'tool_calls': []}}],
 'created': 1721931339,
 'id': 'cmpl-6834da4e7e12419894618bcc24ed189f',
 'model': 'phi3-mini-4k',
 'object': 'chat.completion',
 'usage': {'completion_tokens': 45, 'prompt_tokens': 10, 'total_tokens': 55}}

## Google T5 Efficient Mini-3

In [54]:
import requests 

API_URL = "https://waqasjaved-5368-qyibl.eastus2.inference.ml.azure.com/score"
headers = {"Authorization": "Bearer F4z7o6YB9Kqd3sl9edTeZpiijf1kWy1M"}


def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
			'inputs': 'What is the capital of France'
		}
)

print("\nResponse :")
output


Response :


[{'generated_text': 'France.  - - - - - -'}]

## Mistral 8x 7B Instruct 01 

In [63]:
import requests 

# serverless

API_URL = "https://mistral-7b-east1092381.eastus2.inference.ml.azure.com/chat/completions"
headers = {"Content-Type" : "application/json", "Authorization": "Bearer lnAZ0Upil4nK279UC7Bv1ASawFzgHyAL"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query(
{ 
  "messages": [ 
    { 
      "content": "What is the capital of France?", 
      "role": "user" 
    } 
  ], 
  "max_tokens": 50
}
)

print("\nResponse :")
output


Response :


{'id': 'cmpl-413ab065722e40e383ea27750db09be2',
 'object': 'chat.completion',
 'created': 1721937556,
 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': " The capital of France is Paris. It's located in the north-central part of the country and is one of the most populous and visited cities in Europe. Known for its iconic landmarks such as the Eiffel Tower",
    'tool_calls': []},
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 15, 'total_tokens': 65, 'completion_tokens': 50}}

## GPT 3.5 Turbo

In [66]:
import requests 

# serverless

API_URL = "https://ai-wjai6180585924556846.openai.azure.com/openai/deployments/gpt-35-turbo/v1/chat/completions"
headers = {"Content-Type" : "application/json", "Authorization": "Bearer 63089f0381494d4d9129fb057f16cb7f"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query(
{ 
  "messages": [ 
    { 
      "content": "What is the capital of France?", 
      "role": "user" 
    } 
  ], 
  "max_tokens": 50
}
)

print("\nResponse :")
output

AuthenticationError: Error code: 401 - {'error': {'code': 'PermissionDenied', 'message': 'Principal does not have access to API/Operation.'}}