# Install Dependencies
We need these libraries to run the LLM and the web server.

In [1]:
!pip uninstall -y torch torchvision torchaudio

!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
  --index-url https://download.pytorch.org/whl/cu121

!pip install -q -U transformers accelerate bitsandbytes flask flask-cors pyngrok

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.5.1
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
Collecting torchvision==0.20.1
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
Collecting torchaudio==2.5.1
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.5.1+cu

In [2]:
# PyTorch and torchvision are imported first so their versions can be
# reported before the heavier libraries below are loaded.
import torch
import torchvision

# Print the installed versions; these should match the pins in the install cell.
print("Torch:", torch.__version__)
print("TorchVision:", torchvision.__version__)

# Model/tokenizer loaders and the bitsandbytes quantization config.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Web server, cross-origin support, and the ngrok tunnel client.
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
# NOTE(review): threading and json do not appear to be used in the visible
# cells — confirm before removing.
import threading
import json

Torch: 2.5.1+cu121
TorchVision: 0.20.1+cu121


# Load the SLM

- Qwen2.5-Coder is arguably the best "small" model for code and logic at the moment.
- We load it in 4-bit mode so it fits in the roughly 15 GB of GPU memory available on the free Colab tier.

In [7]:
# Checkpoint to serve. The Instruct variant is tuned for chat/command prompts.
# Other candidates noted by the author:
#   Qwen/Qwen2.5-Coder-7B-Instruct
#   Phi-3.5-mini-instruct
MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"

print(f"‚è≥ Loading {MODEL_ID}... this takes about 2-3 minutes...")

# 4-bit NF4 quantization with float16 compute; shrinks the model so it fits
# on the free Colab GPU.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# trust_remote_code=False on both loaders: use the implementation that ships
# with transformers instead of executing code from the model repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=False,
    device_map="auto",  # let accelerate decide where the weights are placed
    quantization_config=bnb_config,
)

print("‚úÖ Model Loaded Successfully!")

‚è≥ Loading Qwen/Qwen2.5-Coder-3B-Instruct... this takes about 2-3 minutes...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

‚úÖ Model Loaded Successfully!


# Start the API Server (Flask + Ngrok)
- Sign up at ngrok.com for free and get your auth token
- Store the token as a Colab secret named `NGROK_AUTH_TOKEN` (the cell below reads it with `userdata.get`); without an auth token the tunnel might disconnect quickly.

In [8]:
from google.colab import userdata

# Read the ngrok auth token from Colab's secret store.
# userdata.get raises (rather than returning None) when the secret does not
# exist or this notebook has not been granted access to it, so both cases are
# mapped to "no token" and reported below instead of crashing the cell.
try:
    NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    NGROK_AUTH_TOKEN = None

if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
else:
    # The token comes from Colab secrets, not from a .env.local file.
    print("‚ö†Ô∏è NGROK_AUTH_TOKEN not found in Colab secrets")

app = Flask(__name__)
# Allow the local Next.js app to call this server cross-origin.
# NOTE(review): CORS(app) with no arguments allows every origin, and the
# server is exposed publicly through ngrok — consider restricting origins.
# The trailing semicolon keeps the CORS object's repr out of the cell output.
CORS(app);

<flask_cors.extension.CORS at 0x7fa06cfcffb0>

# Creating Custom LLM Endpoint Request
- We send both the user prompt and the system prompt to the custom LLM endpoint.
- This ensures the model understands its role and the expected output format.
- We run the custom endpoint and expose it via ngrok for easy access.

In [9]:
def _parse_generate_payload(payload):
    """Validate the decoded JSON body of a /generate request.

    Args:
        payload: The decoded request body, or None if it was missing or not
            valid JSON.

    Returns:
        A ``(user_prompt, system_prompt, error)`` tuple. ``error`` is None
        when the payload is usable; otherwise it is a message suitable for a
        400 response and the two prompts are None.
    """
    if not isinstance(payload, dict):
        return None, None, 'Request body must be a JSON object'

    user_prompt = payload.get('prompt', '')
    system_prompt = payload.get('systemPrompt', "")

    # Missing, empty, or null prompt: same message the endpoint always returned.
    if not user_prompt:
        return None, None, 'No prompt provided'

    # An explicit JSON null for the system prompt is treated like "not given".
    if system_prompt is None:
        system_prompt = ""

    if not isinstance(user_prompt, str) or not isinstance(system_prompt, str):
        return None, None, "'prompt' and 'systemPrompt' must be strings"

    return user_prompt, system_prompt, None


@app.route('/generate', methods=['POST'])
def generate():
    """Generate a completion for a prompt posted as JSON.

    Request body (JSON object):
        prompt: Required non-empty string with the user message.
        systemPrompt: Optional string with the system message (default "").

    Returns:
        200 with ``{"text": <generated text>}`` on success,
        400 with ``{"error": ...}`` when the body or prompt is invalid,
        500 with ``{"error": ...}`` when tokenization or generation fails.
    """
    # silent=True returns None instead of raising on a missing or malformed
    # JSON body, so the client always receives a JSON error rather than an
    # HTML error page.
    payload = request.get_json(silent=True)
    user_prompt, system_prompt, error = _parse_generate_payload(payload)
    if error:
        return jsonify({'error': error}), 400

    print(f"üì© Received Request: {user_prompt[:50]}...")

    # Combine System Prompt + User Prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        # Apply Chat Template (handles the internal formatting for the model)
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Generate Response
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024, # Allow enough space for code
            temperature=0.1,     # Low temp for precise code
            do_sample=True
        )

        # The output sequence starts with the prompt tokens; keep only the
        # newly generated tail. The batch holds exactly one sequence.
        prompt_length = model_inputs.input_ids.shape[1]
        response_text = tokenizer.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True
        )
    except Exception as exc:
        # Log details in the notebook, but return only a generic message:
        # this endpoint is reachable from the public ngrok URL.
        print(f"Generation failed: {exc!r}")
        return jsonify({'error': 'Generation failed'}), 500

    return jsonify({'text': response_text})

# Single source of truth for the local port shared by the tunnel and Flask.
PORT = 5000

# Open a public ngrok tunnel that forwards to the local Flask server.
public_url = ngrok.connect(PORT).public_url
print(f"üöÄ API is LIVE at: {public_url}")
print("‚ö†Ô∏è Copy this URL into your .env.local as CUSTOM_LLM_URL")

# Run Flask. This call blocks the cell until the server is stopped
# (e.g. by interrupting the kernel).
try:
    app.run(port=PORT)
finally:
    # Close the tunnel when the server stops so that re-running this cell
    # does not leave stale tunnels open.
    ngrok.disconnect(public_url)


üöÄ API is LIVE at: https://3d5a725036a1.ngrok-free.app
‚ö†Ô∏è Copy this URL into your .env.local as CUSTOM_LLM_URL
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


üì© Received Request: 
The previous python script you wrote failed with ...


INFO:werkzeug:127.0.0.1 - - [18/Jan/2026 12:45:49] "POST /generate HTTP/1.1" 200 -


üì© Received Request: 
The previous python script you wrote failed with ...


INFO:werkzeug:127.0.0.1 - - [18/Jan/2026 12:46:15] "POST /generate HTTP/1.1" 200 -
