# Local Huggingface Endpoint

The idea of a local HF endpoint is to interact with an LLM hosted on a cloud or personal machine in the same way as with a classic HF endpoint.

This notebook goes over how to use a local HF endpoint (for this example, the endpoint is set up on the same machine).

## Simple HF endpoint

Here is a simple HF endpoint set up with FastAPI.

In [None]:
!pip install fastapi
!pip install uvicorn

In [None]:
# Gate for launching the uvicorn server from the main.py cell below.
# Left False here: you should run the API code in a separate (parallel) script.
RUN_API = False

### schema.py

In [None]:
"""
Schemas for the Text API.
"""

from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel


# Keyword arguments applied to the model call when the client sends none.
_DEFAULT_MODEL_KWARGS: Dict[str, Any] = {
    "max_new_tokens": 50,
    "num_return_sequences": 1,
    "temperature": 1,
    "early_stopping": True,
}


class TextCompletionRequest(BaseModel):
    """
    Request body for the text completion route.

    ``inputs`` is either a single prompt or a list of prompts.
    ``model_kwargs`` holds keyword arguments forwarded to the model call;
    when omitted, the defaults in ``_DEFAULT_MODEL_KWARGS`` are used.
    """
    inputs: Union[str, List[str]]
    model_kwargs: Optional[Dict[str, Any]] = _DEFAULT_MODEL_KWARGS

### main.py

In [None]:
import logging

from fastapi import FastAPI, status
from fastapi.exceptions import HTTPException
from transformers import pipeline

import uvicorn

# FastAPI application instance; the startup hook and the /completion route are registered on it.
app = FastAPI()
# Placeholder for the text-generation pipeline; assigned by startup_event().
model = None

@app.on_event("startup")
async def startup_event():
    """
    Load the text-generation pipeline into the module-level ``model``.

    Registered as the application's "startup" hook; progress is reported
    through the "uvicorn" logger.
    """
    global model
    uvicorn_log = logging.getLogger("uvicorn")
    for message in ("Starting up...", "Loading model..."):
        uvicorn_log.info(message)
    model = pipeline(task="text-generation", model="gpt2")
    uvicorn_log.info("Model loaded!")

@app.post(
    "/completion",
    status_code=status.HTTP_200_OK,
    summary="Text completion",
    responses={
        status.HTTP_500_INTERNAL_SERVER_ERROR: {
            "description": "Could not compute prediction",
        },
    },
)
def completion(request: TextCompletionRequest):
    """
    Run the loaded model on the request inputs and return its output.

    Args:
        request: Parsed request body. ``request.inputs`` is passed to the
            model as the positional argument and ``request.model_kwargs``
            is expanded as keyword arguments.

    Returns:
        Whatever the module-level ``model`` callable returns for the inputs
        (presumably the transformers pipeline output - verify against the
        pipeline documentation).

    Raises:
        HTTPException: With status 500 when the model call fails for any
            reason; the original exception is chained as the cause.
    """
    # ``model_kwargs`` is typed Optional, so a client may send an explicit
    # null. Treat that as "no extra keyword arguments" instead of letting
    # ``**None`` raise a TypeError that would surface as a 500.
    generation_kwargs = request.model_kwargs or {}
    try:
        completion_response = model(
            request.inputs,
            **generation_kwargs,
        )
    except Exception as e:
        # Route-level boundary: convert any failure into an HTTP 500 while
        # keeping the original traceback attached for server-side debugging.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Could not compute prediction: {e}",
        ) from e
    return completion_response
    

# Start the server only when RUN_API is enabled; port 8000 matches the
# endpoint URL used by the client example.
# NOTE(review): host "0.0.0.0" presumably listens on all network interfaces -
# confirm that exposure is intended for a "local" endpoint.
if RUN_API:
    uvicorn.run(app, host="0.0.0.0", port=8000)

## Final use case

In [None]:
from langchain.llms import LocalHuggingFaceEndpoint

# Address of the /completion route served by the FastAPI app on port 8000.
endpoint_url = "http://localhost:8000/completion"

# Client pointing at the local endpoint; the request body is sent as JSON.
llm = LocalHuggingFaceEndpoint(
    endpoint_url=endpoint_url,
    headers={"Content-Type": "application/json"},
)

# Last expression of the cell, so the notebook displays the completion.
llm("Say 'hello':")