In [1]:
!pip install -U transformers accelerate flask flask-cors pyngrok safetensors huggingface_hub einops

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM
)
from pyngrok import ngrok
import torch
import gc

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting flask
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.me

In [2]:
# ------------------ SETUP ------------------ #
device = "cuda" if torch.cuda.is_available() else "cpu"

SUPPORTED_MODELS = [
    "microsoft/Phi-4-mini-instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    "Salesforce/codet5-base",
    "uclanlp/plbart-base"
]

SEQ2SEQ_TYPES = ["plbart", "t5"]
CAUSAL_TYPES = ["gpt", "gpt2", "gpt_neo", "codeparrot", "starcoder", "starcoder2", "deepseek", "stable-code", "phi", "phi3", "qwen", "qwen2", "qwen3", "chatglm", "stablelm"]
LLAMA_LIKE_TYPES = ["llama", "gemma"]

model_cache = {}

def get_model_loader(model_type):
    if model_type in SEQ2SEQ_TYPES:
        return AutoModelForSeq2SeqLM
    elif model_type in CAUSAL_TYPES or model_type in LLAMA_LIKE_TYPES:
        return AutoModelForCausalLM
    else:
        return None

In [8]:
# ------------------ ENDPOINTS ------------------ #
app = Flask(__name__)
CORS(app)

@app.route("/")
def index():
    return "✅ PONTIS Translator API is running (Lazy Load Mode)!"

@app.route("/translate", methods=["POST"])
def translate():
    data = request.get_json()

    code = data.get("code")
    model_id = data.get("model")
    source_lang = data.get("source_lang")
    target_lang = data.get("target_lang")

    if not all([code, model_id, source_lang, target_lang]):
        return jsonify({"error": "Missing required fields"}), 400

    if model_id not in SUPPORTED_MODELS:
        return jsonify({"error": f"Model '{model_id}' not supported"}), 400

    try:
        print(f"\n🔍 Requested model: {model_id}")
        print("📊 GPU memory before loading:")
        print(torch.cuda.memory_summary() if torch.cuda.is_available() else "No CUDA available")

        if model_id not in model_cache:
            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
            model_type = config.model_type

            if tokenizer.pad_token is None:
                if tokenizer.eos_token:
                    tokenizer.pad_token = tokenizer.eos_token
                else:
                    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

            model_cache[model_id] = {
                "tokenizer": tokenizer,
                "model_type": model_type,
                "config": config
            }
        else:
            tokenizer = model_cache[model_id]["tokenizer"]
            model_type = model_cache[model_id]["model_type"]
            config = model_cache[model_id]["config"]

        model_loader = get_model_loader(model_type)
        if model_loader is None:
            return jsonify({"error": f"Unsupported model type '{model_type}'"}), 500

        # Load model to GPU
        model = model_loader.from_pretrained(
            model_id,
            config=config,
            device_map="auto",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            trust_remote_code=True
        )
        model.eval()

        # Prompt
        if model_type in SEQ2SEQ_TYPES:
            prompt = f"Translate to {target_lang}: {code}"
        elif model_type in CAUSAL_TYPES:
            prompt = f"translate this {source_lang} code below to {target_lang}. Output only the code without explanation, heading, or comments. \n\n{code}"
        else:
            prompt = f"Translate this {source_lang} code below to {target_lang}:\n\n{code}"

        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
        input_ids = inputs["input_ids"]
        attention_mask = inputs.get("attention_mask", None)

        gen_kwargs = {
            "max_new_tokens": 256,
            "do_sample": False,
            "num_beams": 4,
            "early_stopping": True
        }

        if model_type in CAUSAL_TYPES:
            gen_kwargs["temperature"] = 0.7
            gen_kwargs["pad_token_id"] = tokenizer.eos_token_id or tokenizer.pad_token_id

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs
        )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_code = decoded.split(prompt)[-1].strip() if prompt in decoded else decoded.strip()

        # Hapus model dari GPU dan kosongkan cache
        del model
        torch.cuda.empty_cache()
        gc.collect()

        print("📊 GPU memory after unloading:")
        print(torch.cuda.memory_summary() if torch.cuda.is_available() else "No CUDA available")

        return jsonify({"translated_code": translated_code})

    except Exception as e:
        print("❌ Translation error:", str(e))
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500

In [None]:
# ------------------ SERVER ------------------ #
ngrok.set_auth_token("<Your Ngrok Token>")
public_url = ngrok.connect(80, bind_tls=True, hostname="causal-simply-foal.ngrok-free.app")
print(f" * ngrok tunnel: {public_url}")

app.run(host="0.0.0.0", port=80)

 * ngrok tunnel: NgrokTunnel: "https://causal-simply-foal.ngrok-free.app" -> "http://localhost:80"
 * Serving Flask app '__main__'
 * Debug mode: off

🔍 Requested model: microsoft/Phi-4-mini-instruct
📊 GPU memory before loading:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   8320 KiB |  15573 MiB |   1656 GiB |   1656 GiB |
|       from large pool |   8320 KiB |  15572 MiB |   1589 GiB |   1589 GiB |
|       from small pool |      0 KiB |     40 MiB |     66 GiB |     66 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   8320 KiB |  15573 MiB |   1656 GiB |   16

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


📊 GPU memory after unloading:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   8331 KiB |  15573 MiB |   1685 GiB |   1685 GiB |
|       from large pool |   8320 KiB |  15572 MiB |   1616 GiB |   1616 GiB |
|       from small pool |     11 KiB |     40 MiB |     68 GiB |     68 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   8331 KiB |  15573 MiB |   1685 GiB |   1685 GiB |
|       from large pool |   8320 KiB |  15572 MiB |   1616 GiB |   1616 GiB |
|       from small pool |     11 KiB |     40 MiB |     68 GiB |     68 GiB |
|---------------------------------

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


📊 GPU memory after unloading:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   8330 KiB |  15573 MiB |   1818 GiB |   1818 GiB |
|       from large pool |   8320 KiB |  15572 MiB |   1733 GiB |   1733 GiB |
|       from small pool |     10 KiB |     65 MiB |     85 GiB |     85 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   8330 KiB |  15573 MiB |   1818 GiB |   1818 GiB |
|       from large pool |   8320 KiB |  15572 MiB |   1733 GiB |   1733 GiB |
|       from small pool |     10 KiB |     65 MiB |     85 GiB |     85 GiB |
|---------------------------------