# Install Dependencies
We need these libraries to run the LLM and the web server

In [None]:
!pip uninstall -y torch torchvision torchaudio

!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
  --index-url https://download.pytorch.org/whl/cu121

In [None]:
import torch
import torchvision

# Report the installed versions so a mismatched torch/torchvision pair is
# visible before the model is loaded.
for label, module in (("Torch:", torch), ("TorchVision:", torchvision)):
    print(label, module.__version__)

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import threading
import json

# Load the SLM (Qwen2.5-Coder-7B)
- Qwen2.5-Coder is arguably one of the best small open models for code and logic tasks.
- We load it in 4-bit quantized mode so that it fits inside the 15 GB of GPU memory available for free.

In [None]:
# Hugging Face Hub repository of the model served by this notebook.
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"

print("‚è≥ Loading Model... this takes about 2-3 minutes...")

# Quantization config: store the weights as 4-bit NF4 and run the matmuls in
# float16, which shrinks the memory footprint of the 7B model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load tokenizer and model. `trust_remote_code` is deliberately not set:
# Qwen2.5 uses an architecture that ships with transformers, so there is no
# need to allow execution of Python code downloaded from the model repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place the layers on the available device(s)
)

print("‚úÖ Model Loaded Successfully!")

# Start the API Server (Flask + Ngrok)
- Sign up at ngrok.com for free and get your auth token.
- Supply your token in the cell below (the placeholder value is 'YOUR_NGROK_TOKEN_HERE'); without a valid token the tunnel might disconnect quickly.

In [None]:
import os

# The ngrok auth token is a credential, so it is read from the
# NGROK_AUTH_TOKEN environment variable instead of being stored in the
# notebook. Any token that was previously hard-coded here has been exposed
# and should be revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_TOKEN_HERE")

# Only register the token when a real one was supplied; with the placeholder
# the tunnel is opened without authentication.
if NGROK_AUTH_TOKEN != "YOUR_NGROK_TOKEN_HERE":
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

app = Flask(__name__)
# Allow your local Next.js app to hit this. NOTE: called without arguments,
# flask-cors accepts requests from every origin.
CORS(app)

# SYSTEM PROMPT (Enforcing JSON)

In [None]:
# Default system message for the /generate endpoint; it is used whenever the
# request body does not carry its own 'systemPrompt'. It instructs the model
# to answer with a raw Python script whose last line prints a JSON object in
# one of the 'markdown' / 'chart' / 'table' / 'kpi' shapes described below.
# NOTE(review): the example script calls json.dumps without importing json —
# confirm that whatever executes the generated script pre-imports json.
SYSTEM_PROMPT = """You are a Python Data Analyst.
Your goal is to answer the user's question by writing a VALID PYTHON SCRIPT.

RULES:
1. You have a pandas DataFrame named 'df' ALREADY LOADED. Do not load it yourself.
2. You MUST use 'df' to calculate the answer.
3. The LAST line of your script must print a JSON object.
4. Do NOT wrap code in markdown blocks (like ```python). Just raw code.
5. If the user asks for a chart, return the 'chart' JSON type.
6. If the user asks for a table, return the 'table' JSON type.

EXPECTED JSON OUTPUT STRUCTURE:
type ChartPayload = {
  config: {
    type: 'bar' | 'line' | 'pie';
    title: string;
    xAxisKey: string;
    series: { dataKey: string; label: string; color?: string }[];
  };
  data: any[];
};

type Output =
  | { type: 'markdown'; summary: string }
  | { type: 'chart'; summary: string; data: ChartPayload }
  | { type: 'table'; summary: string; data: { headers: string[]; rows: any[][] } }
  | { type: 'kpi'; summary: string; data: { label: string; value: string; status?: 'positive'|'negative' }[] };

EXAMPLE PYTHON SCRIPT:
monthly = df.groupby('Month')['Revenue'].sum().reset_index()
print(json.dumps({
  "type": "chart",
  "summary": "Revenue peaked in December.",
  "data": {
    "config": { "type": "bar", "title": "Revenue", "xAxisKey": "Month", "series": [{"dataKey": "Revenue", "label": "Rev"}] },
    "data": monthly.to_dict(orient='records')
  }
}))
"""

# Creating Custom LLM Endpoint Request
- We send both the user prompt and the system prompt to the custom LLM endpoint.
- This ensures the model understands its role and the expected output format.
- We run the custom endpoint and expose it via ngrok for easy access.

In [None]:
@app.route('/generate', methods=['POST'])
def generate():
    """Generate a completion for the posted prompt with the loaded model.

    Expects a JSON object body with:
      - 'prompt' (required): non-empty string with the user's question.
      - 'systemPrompt' (optional): string overriding SYSTEM_PROMPT; a missing
        or null value falls back to SYSTEM_PROMPT.

    Returns:
        A JSON response {'text': <generated text>} on success, or
        ({'error': <message>}, 400) when the body is not a JSON object, the
        prompt is missing/empty, or a prompt field is not a string.
    """
    # silent=True returns None for a missing or malformed JSON body instead of
    # raising, so every bad payload ends in the same explicit 400 response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    user_prompt = data.get('prompt', '')
    system_prompt = data.get('systemPrompt')
    if system_prompt is None:
        # Key absent or explicitly null: use the notebook's default prompt.
        system_prompt = SYSTEM_PROMPT

    if not user_prompt:
        return jsonify({'error': 'No prompt provided'}), 400
    if not isinstance(user_prompt, str) or not isinstance(system_prompt, str):
        return jsonify({'error': "'prompt' and 'systemPrompt' must be strings"}), 400

    print(f"üì© Received Request: {user_prompt[:50]}...")

    # Combine System Prompt + User Prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Apply Chat Template (Handles the internal formatting for the model)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate Response
    output_ids = model.generate(
        **model_inputs,
        max_new_tokens=1024,  # Allow enough space for code
        temperature=0.1,      # Low temp for precise code
        do_sample=True,
    )

    # The returned sequence starts with the prompt tokens; slice them off so
    # only the newly generated completion is decoded (batch size is 1 here).
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = output_ids[0][prompt_length:]
    response_text = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return jsonify({'text': response_text})

# Port shared by the Flask server and the ngrok tunnel.
PORT = 5000

# Open the tunnel before starting Flask so the public URL is printed before
# app.run() takes over the cell.
public_url = ngrok.connect(PORT).public_url
print(f"üöÄ API is LIVE at: {public_url}")
print("‚ö†Ô∏è Copy this URL into your .env.local as CUSTOM_LLM_URL")

try:
    # Run Flask (blocks until the server is stopped)
    app.run(port=PORT)
finally:
    # Close the tunnel when the server stops or fails to start, so re-running
    # this cell does not leave stale tunnels open.
    ngrok.disconnect(public_url)
