# API para usar las funciones de predicción de LLaMa

In [1]:
# ================================
# 1. Instalar dependencias
# ================================
!pip install transformers accelerate
!npm install -g localtunnel

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
# ================================
# 2. Importar librerías necesarias
# ================================
import threading
import time
from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [3]:
# ================================
# 3. Load the already-merged model (no LoRA)
# ================================
from google.colab import drive

# The checkpoint lives on Google Drive, so Drive has to be mounted before
# anything is read from model_path.
drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/Colab Notebooks/ft_llama_model/final_model/llama_merged"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Text-generation pipeline used by the /predict endpoint. Only the newly
# generated continuation is returned (return_full_text=False), capped at
# 3 new tokens and decoded without sampling; the EOS token doubles as the
# padding token.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=3,
    do_sample=False,
    return_full_text=False,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id,
)

# ================================
# 4. Create the Flask server
# ================================
app = Flask(__name__)

# Intention keywords in match-priority order: the first keyword found in the
# generated text wins.
_INTENTION_KEYWORDS = ("clear", "view", "add", "remove")
# Label returned whenever no keyword can be recognised in the model output.
_DEFAULT_INTENTION = "view"


def _extract_intention(generated_text):
    """Map raw generated text to one of the known intention labels.

    The text is stripped, lower-cased and cleared of "**" markers, then
    searched for each keyword in priority order. Falls back to the default
    label when none is present.
    """
    text = generated_text.strip().lower().replace("**", "")
    for keyword in _INTENTION_KEYWORDS:
        if keyword in text:
            return keyword
    return _DEFAULT_INTENTION


@app.route("/predict", methods=["POST"])
def predict():
    """Classify the intention expressed by the prompt in the POSTed JSON body.

    Expects a JSON object with an optional string field "prompt" (defaults to
    the empty string). The prompt is passed to the text-generation pipeline
    and the generated text is mapped to a label.

    Returns:
        A JSON response {"intention": <"clear" | "view" | "add" | "remove">}.
        "view" is the fallback when the pipeline yields nothing usable or no
        keyword is found. Responds with HTTP 400 and {"error": ...} when the
        body is not a JSON object or "prompt" is not a string.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so every bad payload takes the same 400 path below rather
    # than failing later with an AttributeError (HTTP 500).
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str):
        return jsonify({"error": "'prompt' must be a string."}), 400

    result = pipe(prompt)
    if not result or 'generated_text' not in result[0]:
        return jsonify({"intention": _DEFAULT_INTENTION})

    return jsonify({"intention": _extract_intention(result[0]['generated_text'])})

# ================================
# 5. Run Flask and LocalTunnel
# ================================
def run_flask():
    """Run the Flask development server for `app` on port 5000.

    Used as the target of a background thread so the notebook cell can keep
    executing while the server is up.
    """
    app.run(port=5000)

def run_tunnel():
    """Start localtunnel for port 5000 in the background and print its public URL.

    The `lt` process writes its output to tunnel.txt; once the tunnel is up
    that file contains a line of the form "your url is: https://...". The
    file is polled for that line for up to 15 seconds. localtunnel does not
    serve a local inspection API on port 4040 (that endpoint belongs to
    ngrok), so the URL has to be read from the log file.

    Returns:
        None. The URL line (or a timeout notice) is printed.
    """
    log_path = 'tunnel.txt'
    # Truncate any log left by a previous run so a stale URL is never reported.
    open(log_path, 'w').close()
    get_ipython().system_raw('lt --port 5000 > tunnel.txt 2>&1 &')

    deadline = time.monotonic() + 15
    while time.monotonic() < deadline:
        with open(log_path, encoding='utf-8', errors='replace') as log_file:
            for line in log_file:
                if 'your url is:' in line:
                    print(line.strip())
                    return
        time.sleep(0.5)

    print("localtunnel did not report a URL within 15 seconds; inspect tunnel.txt")

# Serve Flask from a background thread so this cell can continue executing.
threading.Thread(target=run_flask).start()
# Short pause before opening the tunnel — presumably to give the server time
# to bind port 5000; TODO confirm 2 s is sufficient.
time.sleep(2)
run_tunnel()


Mounted at /content/drive


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [4]:
# Relaunch the tunnel and show its complete output
import time
# Start `lt` in the background; stdout and stderr are redirected to tunnel.txt.
get_ipython().system_raw('lt --port 5000 > tunnel.txt 2>&1 &')
time.sleep(5)

# Show everything localtunnel has written so far
!cat tunnel.txt

your url is: https://afraid-spies-walk.loca.lt


In [5]:
# Show everything localtunnel has written to tunnel.txt
!cat tunnel.txt

your url is: https://afraid-spies-walk.loca.lt
