In [None]:
import threading
import time  # For generating the `created` timestamp
import uuid  # For generating unique completion IDs

import requests  # For making HTTP requests to the Ollama API
from flask import Flask, request, jsonify

# Flask application object; the route handler below registers itself on it
# through the @app.route decorator.
app = Flask(__name__)

# URL of the Ollama server endpoint that completion requests are forwarded to.
# The handler below reads `choices[0].text` from its JSON reply, i.e. it is
# treated as a text-completion (not chat) endpoint.
OLLAMA_API_URL = "http://localhost:11434/v1/completions"  # Update if using a different host or port

# Fallback generation parameters, used when the client omits a field or
# sends it as JSON null (OpenAI-style clients frequently send explicit nulls).
_DEFAULT_MAX_TOKENS = 100
_DEFAULT_TEMPERATURE = 0.7
_DEFAULT_TOP_P = 1
_DEFAULT_MODEL = "llama3.1:8b"

# (connect, read) timeout in seconds for the upstream call. Without a timeout
# `requests` waits forever and a stalled Ollama server would hang this worker.
_OLLAMA_TIMEOUT = (5, 300)


def _param_or_default(data, key, default):
    """Return ``data[key]``, or ``default`` when the key is absent or null."""
    value = data.get(key)
    return default if value is None else value


def _build_prompt(messages):
    """Concatenate the content of every user message into one prompt string.

    Entries that are not dicts, are not from the ``user`` role, or have empty
    content are skipped.
    """
    return "".join(
        f"User: {message['content']}\n"
        for message in messages
        if isinstance(message, dict)
        and message.get('role') == 'user'
        and message.get('content')
    )


def _format_chat_completion(model, prompt, completion_response):
    """Convert an Ollama text-completion payload into a chat-completion dict.

    Raises:
        KeyError, IndexError, TypeError, AttributeError: if the payload does
            not have the expected ``choices[0].text`` structure.
    """
    choice = completion_response['choices'][0]
    completion_text = choice['text']

    # Token counts live in the nested `usage` object of the upstream reply.
    # When they are missing, fall back to whitespace word counts as a rough
    # estimate, keeping total == prompt + completion.
    usage = completion_response.get('usage') or {}
    prompt_tokens = usage.get('prompt_tokens', len(prompt.split()))
    completion_tokens = usage.get('completion_tokens', len(completion_text.split()))
    total_tokens = usage.get('total_tokens', prompt_tokens + completion_tokens)

    return {
        # A random UUID keeps IDs unique even for requests in the same second.
        "id": "chatcmpl-" + uuid.uuid4().hex,
        "object": "chat.completion",
        "created": int(time.time()),  # Unix timestamp
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": completion_text,
                },
                "finish_reason": choice.get('finish_reason', 'stop'),
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        },
    }


@app.route('/v1/chat/completions', methods=['POST'])
def completions():
    """Handle a chat-completion request by proxying it to the Ollama server.

    The JSON body must contain a non-empty ``messages`` list. The content of
    all ``user`` messages is flattened into a single prompt and posted to
    ``OLLAMA_API_URL``; the reply is reshaped into a ``chat.completion``
    object.

    Returns:
        A JSON response: the chat completion on success; ``{"error": ...}``
        with status 400 for an invalid request, the upstream status code when
        Ollama answers with a non-200 status, or 500 when Ollama cannot be
        reached or returns an unexpected payload.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every bad request gets the same JSON-formatted 400 error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    messages = data.get('messages')
    if not messages or not isinstance(messages, list):
        return jsonify({"error": "Invalid or missing 'messages' field"}), 400

    prompt = _build_prompt(messages)
    if not prompt.strip():
        return jsonify({"error": "Prompt is required"}), 400

    model = _param_or_default(data, "model", _DEFAULT_MODEL)
    temperature = _param_or_default(data, "temperature", _DEFAULT_TEMPERATURE)
    top_p = _param_or_default(data, "top_p", _DEFAULT_TOP_P)

    # Anything that is not a positive number (null, string, bool, <= 0)
    # falls back to the default instead of raising on the comparison.
    max_tokens = data.get("max_tokens")
    if (
        isinstance(max_tokens, bool)
        or not isinstance(max_tokens, (int, float))
        or max_tokens <= 0
    ):
        max_tokens = _DEFAULT_MAX_TOKENS

    try:
        upstream = requests.post(
            OLLAMA_API_URL,
            json={
                "model": model,
                "prompt": prompt,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
            },
            timeout=_OLLAMA_TIMEOUT,
        )
    except requests.RequestException as e:
        # Connection refused, DNS failure, timeout, ...
        return jsonify({"error": f"Failed to reach Ollama server: {e}"}), 500

    if upstream.status_code != 200:
        return jsonify({"error": f"Failed to communicate with Ollama server: {upstream.text}"}), upstream.status_code

    try:
        chat_completion = _format_chat_completion(model, prompt, upstream.json())
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # ValueError covers a non-JSON body; the rest cover a JSON body that
        # lacks the expected choices[0].text structure.
        return jsonify({"error": f"Unexpected response from Ollama server: {e!r}"}), 500

    return jsonify(chat_completion)

# Function to run the Flask app in a separate thread
def run_app(host='0.0.0.0', port=1234, debug=False):
    """Run the Flask development server; blocks until the server stops.

    Args:
        host: Interface to bind. The default '0.0.0.0' listens on all
            interfaces, so other machines on the network can reach the server.
        port: TCP port to listen on.
        debug: Enable Flask debug mode. Off by default: with the server bound
            to all interfaces, debug mode would expose the interactive
            debugger, which allows code execution, to the network. Pass
            ``debug=True`` only for local troubleshooting.
    """
    # The reloader is kept off because this function is meant to be run from
    # a background thread rather than the main thread.
    app.run(debug=debug, use_reloader=False, host=host, port=port)

# Start the Flask server in a background thread as soon as this code runs, so
# the caller (e.g. a notebook cell) is not blocked by the server loop.
# NOTE: the thread is created without `daemon=True`, so it inherits the
# daemon flag of the creating thread; when started from the main thread the
# server therefore keeps the process alive until it is stopped.
flask_thread = threading.Thread(target=run_app)
flask_thread.start()
