# Sentinel Forensics — Kaggle GPU Inference Server

This notebook runs the heavy ML models on Kaggle's free GPU (T4) and exposes them
as an HTTP API via ngrok, so your local FastAPI backend can call them instead of
loading models locally.

## Setup steps
1. **Enable GPU**: Notebook settings → Accelerator → **GPU T4 x2**
2. **Add ngrok token**: Add-ons → Secrets → add a secret named `NGROK_TOKEN` and toggle it ON for this notebook
   - Get a free token at https://ngrok.com (sign up → Your Authtoken)
3. **Run all cells** (`Run All`)
4. **Copy the ngrok URL** printed in the last cell output
5. In your local terminal set (Windows `cmd` syntax shown; on macOS/Linux use `export NAME=value` instead of `set NAME=value`):
   ```
   set USE_REMOTE_INFERENCE=true
   set KAGGLE_INFERENCE_URL=https://xxxx.ngrok-free.app
   ```
6. Run `python run_api.py` locally — startup in ~5 seconds!

In [None]:
# Cell 1 — Install dependencies
# IPython shell escape: installs the web stack (fastapi, uvicorn, pyngrok,
# httpx), the Hugging Face / PyTorch stack (transformers, torch, torchvision,
# torchaudio), YOLO (ultralytics) and the image libraries (Pillow,
# opencv-python-headless). `-q` keeps pip's output quiet.
!pip install fastapi uvicorn pyngrok transformers torch torchvision torchaudio \
             ultralytics httpx Pillow opencv-python-headless -q

In [None]:
# Cell 2 — Load all models (4 groups: text, audio, vision, deepfake)
import os, logging, base64, io, tempfile, threading
import torch

# Set framework-selection flags unless the environment already defines them.
# NOTE(review): presumably these steer transformers towards PyTorch and away
# from TensorFlow — confirm against the installed transformers version.
os.environ.setdefault('USE_TORCH', '1')
os.environ.setdefault('TRANSFORMERS_NO_TF', '1')

# Timestamped INFO-level logging for the notebook process.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Two spellings of the same device choice:
#   DEVICE     — integer handed to hf_pipeline(device=...): 0 with CUDA, -1 without
#   DEVICE_STR — torch-style string used with .to(...): 'cuda:0' or 'cpu'
DEVICE = 0 if torch.cuda.is_available() else -1
DEVICE_STR = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'Device: {DEVICE_STR}')

# ── Text models ─────────────────────────────────────────────────────────────
from transformers import pipeline as hf_pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Candidate labels handed to the zero-shot classifier by infer_text.
# 'general / benign' is also infer_text's default label and always maps to
# risk_level 'LOW' there.
THREAT_CATEGORIES = [
    'financial fraud', 'violence or threats', 'drug trafficking',
    'cyber crime', 'identity theft', 'exploitation',
    'corruption', 'weapons', 'general / benign',
]

# Named-entity recognition pipeline. infer_text reads 'word', 'entity_group',
# 'start', 'end' and 'score' from each returned entity.
print('Loading NER...')
ner_pipeline = hf_pipeline('ner', model='dslim/bert-base-NER',
                            aggregation_strategy='simple', device=DEVICE)

# The summarizer is loaded as a bare tokenizer + seq2seq model rather than a
# pipeline; infer_text tokenizes and calls generate() on it directly, so the
# model is moved to the GPU by hand when one is available.
print('Loading summarizer...')
_sum_tok = AutoTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')
_sum_mod = AutoModelForSeq2SeqLM.from_pretrained('sshleifer/distilbart-cnn-12-6')
if DEVICE >= 0:
    _sum_mod = _sum_mod.to(DEVICE_STR)

# Zero-shot classifier scored against THREAT_CATEGORIES in infer_text.
print('Loading zero-shot classifier...')
classifier_pipeline = hf_pipeline('zero-shot-classification',
                                   model='facebook/bart-large-mnli', device=DEVICE)
print('✓ Text models ready')

# ── Audio model ─────────────────────────────────────────────────────────────
# Whisper processes at most 30 seconds of audio per forward pass.
# chunk_length_s=30 enables the pipeline's chunking so longer recordings are
# split, transcribed piece by piece and stitched back together instead of
# being cut off or rejected.
print('Loading Whisper...')
whisper_pipeline = hf_pipeline('automatic-speech-recognition',
                                model='openai/whisper-tiny', device=DEVICE,
                                chunk_length_s=30)
print('✓ Audio model ready')

# ── Vision models ────────────────────────────────────────────────────────────
from ultralytics import YOLO
from transformers import ViTForImageClassification, ViTImageProcessor

# Object detector used by infer_vision.
print('Loading YOLOv8n...')
yolo_model = YOLO('yolov8n.pt')

# Violence classifier used by infer_vision.
print('Loading violence detector...')
_vit_name = 'jaranohaal/vit-base-violence-detection'
# The image processor is configured by hand (224 px, mean/std of 0.5 per
# channel) instead of being loaded from the checkpoint.
# NOTE(review): presumably the checkpoint ships no preprocessor config —
# confirm these values match how the model was trained.
vit_processor = ViTImageProcessor(size=224, image_mean=[0.5,0.5,0.5], image_std=[0.5,0.5,0.5])
vit_model = ViTForImageClassification.from_pretrained(_vit_name)
if DEVICE >= 0:
    vit_model = vit_model.to(DEVICE_STR)
vit_model.eval()  # put the model in eval mode for inference
print('✓ Vision models ready')

# ── Deepfake model ───────────────────────────────────────────────────────────
from transformers import SiglipForImageClassification, AutoImageProcessor

# Image classifier whose predicted label is checked for the substring 'fake'.
print('Loading deepfake detector...')
_df_name = 'prithivMLmods/deepfake-detector-model-v1'
df_processor = AutoImageProcessor.from_pretrained(_df_name)
df_model = SiglipForImageClassification.from_pretrained(_df_name)
# Unlike the ViT model above, this model is not moved to DEVICE_STR here.
# TODO(review): confirm whether keeping it off the GPU is intentional; moving
# it requires its inputs to be placed on the same device.
df_model.eval()  # put the model in eval mode for inference
print('✓ Deepfake model ready')

print('\n=== All models loaded ===')

In [None]:
# Cell 3 — Inference helpers
import numpy as np
from PIL import Image

# ── Text inference ───────────────────────────────────────────────────────────
def infer_text(text: str) -> dict:
    """Run NER, summarization and zero-shot threat classification on *text*.

    The three stages are independent: a failure in one is recorded in
    ``components_status`` and replaced by a neutral default, and the other
    stages still run.

    Args:
        text: Raw input text. NER sees the first 512 characters, the
            classifier the first 1024 characters, and the summarizer the
            first 1024 tokens.

    Returns:
        A dict with ``label`` / ``confidence`` (top classifier category),
        ``tokens_processed`` (whitespace-separated word count), ``entities``,
        ``summary``, ``categories`` (every category with its score),
        ``risk_level`` ('LOW' / 'MEDIUM' / 'HIGH') and ``components_status``
        ('loaded' or 'error: …' per stage).
    """
    status = {}

    # ── Stage 1: named entities ─────────────────────────────────────────────
    entities = []
    try:
        for ent in ner_pipeline(text[:512]):
            entities.append({
                'text': ent['word'],
                'entity_type': ent['entity_group'],
                'start': ent['start'],
                'end': ent['end'],
                'confidence': round(float(ent['score']), 4),
            })
        status['ner'] = 'loaded'
    except Exception as exc:
        # Discard anything collected before the failure.
        entities = []
        status['ner'] = f'error: {exc}'

    # ── Stage 2: abstractive summary ────────────────────────────────────────
    summary = ''
    try:
        encoded = _sum_tok(text, return_tensors='pt', max_length=1024,
                           truncation=True)
        if DEVICE >= 0:
            encoded = {name: tensor.to(DEVICE_STR)
                       for name, tensor in encoded.items()}
        with torch.no_grad():
            generated = _sum_mod.generate(**encoded, max_new_tokens=128)
        summary = _sum_tok.decode(generated[0], skip_special_tokens=True)
        status['summarizer'] = 'loaded'
    except Exception as exc:
        status['summarizer'] = f'error: {exc}'

    # ── Stage 3: zero-shot threat category ──────────────────────────────────
    # Defaults describe harmless text and are kept if the classifier fails.
    top_label = 'general / benign'
    top_score = 0.0
    categories = []
    try:
        scored = classifier_pipeline(text[:1024], THREAT_CATEGORIES)
        categories = [
            {'label': name, 'score': round(score, 4)}
            for name, score in zip(scored['labels'], scored['scores'])
        ]
        top_label = scored['labels'][0]
        top_score = round(float(scored['scores'][0]), 4)
        status['classifier'] = 'loaded'
    except Exception as exc:
        status['classifier'] = f'error: {exc}'

    # A benign top label is always LOW, however confident the classifier is.
    if top_label == 'general / benign':
        risk_level = 'LOW'
    elif top_score > 0.7:
        risk_level = 'HIGH'
    elif top_score > 0.4:
        risk_level = 'MEDIUM'
    else:
        risk_level = 'LOW'

    return {
        'label': top_label,
        'confidence': top_score,
        'tokens_processed': len(text.split()),
        'entities': entities,
        'summary': summary,
        'categories': categories,
        'risk_level': risk_level,
        'components_status': status,
    }


# ── Audio inference ──────────────────────────────────────────────────────────
def infer_audio(data: bytes) -> dict:
    """Transcribe an audio clip with Whisper and report whether it has speech.

    The bytes are written to a temporary file because the pipeline is called
    with a file path. The file is always removed afterwards, including when
    transcription fails.

    Args:
        data: Raw bytes of the audio file.

    Returns:
        A dict with ``label`` ('speech-detected', 'no-speech' or 'error'),
        ``confidence`` (fixed 0.9 / 0.5 / 0.0 — not a model score),
        ``transcription`` and ``duration_seconds`` (always 0.0; the duration
        is not measured). On failure an ``error`` message is added.
    """
    tmp = None
    try:
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            # Record the path before writing so a failed write is cleaned up too.
            tmp = f.name
            f.write(data)
        result = whisper_pipeline(tmp)
        transcription = result.get('text', '').strip()
        has_speech = bool(transcription)
        return {
            'label': 'speech-detected' if has_speech else 'no-speech',
            'confidence': 0.9 if has_speech else 0.5,
            'transcription': transcription,
            'duration_seconds': 0.0,
        }
    except Exception as e:
        return {'label': 'error', 'confidence': 0.0,
                'transcription': '', 'duration_seconds': 0.0, 'error': str(e)}
    finally:
        # delete=False means nobody else removes the file; do it on every path.
        if tmp is not None:
            try:
                os.unlink(tmp)
            except OSError:
                # Best-effort cleanup: a file that is already gone must not
                # mask the inference result.
                pass


# ── Vision inference ─────────────────────────────────────────────────────────
def infer_vision(data: bytes) -> dict:
    """Detect objects and violence in a single encoded image.

    Args:
        data: Encoded image bytes in any format PIL can open.

    Returns:
        A dict with ``label`` / ``confidence`` (first YOLO detection, or the
        violence verdict when nothing was detected), ``detections`` (one
        entry per box with ``class_name``, ``confidence`` and ``bbox`` in
        xyxy order), ``detection_count``, ``violence_detected``,
        ``violence_score`` and ``risk_level``. Any failure yields the same
        keys with neutral values, ``label`` 'error' and an ``error`` message.
    """
    try:
        img = Image.open(io.BytesIO(data)).convert('RGB')

        # YOLO detection. The PIL image is passed as-is: Ultralytics reads
        # NumPy arrays as BGR, so an RGB array built from this image would be
        # evaluated with red and blue swapped.
        # NOTE(review): detection is pinned to the CPU even when a GPU is
        # available — confirm whether that is intentional.
        yolo_results = yolo_model(img, device='cpu', verbose=False)
        detections = []
        for r in yolo_results:
            for box in r.boxes:
                detections.append({
                    'class_name': r.names[int(box.cls)],
                    'confidence': round(float(box.conf), 4),
                    'bbox': [round(float(x), 1) for x in box.xyxy[0].tolist()],
                })

        # Violence detection
        inputs = vit_processor(images=img, return_tensors='pt')
        if DEVICE >= 0:
            inputs = {k: v.to(DEVICE_STR) for k, v in inputs.items()}
        with torch.no_grad():
            logits = vit_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze()
        # Find the class id by label, not by position in the mapping, and
        # ignore case so 'violence' / 'Violence' both match. Equality (not
        # substring) keeps labels such as 'non-violence' from matching.
        # Falls back to class 0 when no such label exists.
        violence_idx = next(
            (int(idx) for idx, name in vit_model.config.id2label.items()
             if str(name).lower() == 'violence'),
            0,
        )
        violence_score = float(probs[violence_idx])
        violence_detected = violence_score > 0.5

        if detections:
            top_label = detections[0]['class_name']
            top_conf = detections[0]['confidence']
        else:
            top_label = 'violence-detected' if violence_detected else 'no-objects'
            top_conf = violence_score
        risk = 'HIGH' if violence_detected or top_conf > 0.8 else \
               ('MEDIUM' if top_conf > 0.5 else 'LOW')

        return {
            'label': top_label, 'confidence': round(top_conf, 4),
            'detections': detections, 'violence_detected': violence_detected,
            'violence_score': round(violence_score, 4),
            'risk_level': risk, 'detection_count': len(detections),
        }
    except Exception as e:
        return {'label': 'error', 'confidence': 0.0, 'detections': [],
                'violence_detected': False, 'violence_score': 0.0,
                'risk_level': 'LOW', 'detection_count': 0, 'error': str(e)}


# ── Deepfake inference ────────────────────────────────────────────────────────
def _classify_frame(frame_img: Image.Image) -> tuple[bool, float, str]:
    """Classify one image with the deepfake model.

    Args:
        frame_img: PIL image to classify.

    Returns:
        ``(is_fake, confidence, label)`` where ``label`` is the lower-cased
        name of the highest-probability class, ``confidence`` is that class's
        softmax probability and ``is_fake`` is True when the label contains
        the substring 'fake'.
    """
    inputs = df_processor(images=frame_img, return_tensors='pt')
    # Put the tensors on whatever device the model lives on. This matches how
    # infer_vision feeds its model and avoids a device mismatch if the
    # deepfake model is ever moved to the GPU.
    model_device = df_model.device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = df_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).squeeze()
    pred_idx = int(torch.argmax(probs))
    label = df_model.config.id2label[pred_idx].lower()
    conf = float(probs[pred_idx])
    return ('fake' in label), conf, label

def infer_deepfake(data: bytes) -> dict:
    """Decide whether a single encoded image looks like a deepfake.

    Args:
        data: Encoded image bytes in any format PIL can open.

    Returns:
        A dict with ``is_deepfake``, ``confidence`` (rounded to 4 places) and
        ``label``. Any failure is reported as ``is_deepfake`` False with
        ``label`` 'inference-error' and an ``error`` message.
    """
    try:
        frame = Image.open(io.BytesIO(data)).convert('RGB')
        verdict, score, class_name = _classify_frame(frame)
        report = {
            'is_deepfake': verdict,
            'confidence': round(score, 4),
            'label': class_name,
        }
        return report
    except Exception as exc:
        failure = {'is_deepfake': False, 'confidence': 0.0}
        failure['label'] = 'inference-error'
        failure['error'] = str(exc)
        return failure

# Marker in the cell output confirming the helper definitions above ran.
print('✓ Inference helpers ready')

In [None]:
# Cell 4 — FastAPI app (async endpoints — handles concurrent requests)
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import asyncio

# The single FastAPI application: the route decorators below register on it
# and run_server() in Cell 5 hands it to uvicorn.
app = FastAPI(title='Sentinel Forensics — Kaggle GPU Inference Server')

class TextRequest(BaseModel):
    """Request body for POST /predict/text."""
    text: str  # raw text; passed unchanged to infer_text

class BytesRequest(BaseModel):
    """Request body shared by the audio, vision and deepfake endpoints."""
    data_b64: str  # base64-encoded file bytes; each endpoint decodes it before inference

@app.get('/health')
def health():
    """Liveness probe: report status, the active torch device and the model groups served."""
    model_groups = ['text', 'audio', 'vision', 'deepfake']
    payload = {'status': 'ok', 'device': DEVICE_STR}
    payload['models'] = model_groups
    return payload

# Async endpoints run inference in a thread pool so FastAPI can accept
# new requests while another is being processed on GPU.
@app.post('/predict/text')
async def predict_text(req: TextRequest):
    """Analyse the submitted text with infer_text on a worker thread.

    Running the blocking inference off the event loop lets the server keep
    accepting requests while this one is being processed.
    """
    analysis = await asyncio.to_thread(infer_text, req.text)
    return analysis

@app.post('/predict/audio')
async def predict_audio(req: BytesRequest):
    """Transcribe base64-encoded audio with infer_audio on a worker thread.

    Raises:
        HTTPException: 400 when ``data_b64`` cannot be base64-decoded, so a
            malformed request is reported as a client error instead of
            surfacing as an unhandled server error.
    """
    # Local import: only FastAPI itself is imported at cell level.
    from fastapi import HTTPException

    try:
        data = base64.b64decode(req.data_b64)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400,
            detail=f'data_b64 is not valid base64: {exc}',
        ) from exc
    return await asyncio.to_thread(infer_audio, data)

@app.post('/predict/vision')
async def predict_vision(req: BytesRequest):
    """Analyse a base64-encoded image with infer_vision on a worker thread.

    Raises:
        HTTPException: 400 when ``data_b64`` cannot be base64-decoded, so a
            malformed request is reported as a client error instead of
            surfacing as an unhandled server error.
    """
    # Local import: only FastAPI itself is imported at cell level.
    from fastapi import HTTPException

    try:
        data = base64.b64decode(req.data_b64)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400,
            detail=f'data_b64 is not valid base64: {exc}',
        ) from exc
    return await asyncio.to_thread(infer_vision, data)

@app.post('/predict/deepfake')
async def predict_deepfake(req: BytesRequest):
    """Check a base64-encoded image with infer_deepfake on a worker thread.

    Raises:
        HTTPException: 400 when ``data_b64`` cannot be base64-decoded, so a
            malformed request is reported as a client error instead of
            surfacing as an unhandled server error.
    """
    # Local import: only FastAPI itself is imported at cell level.
    from fastapi import HTTPException

    try:
        data = base64.b64decode(req.data_b64)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400,
            detail=f'data_b64 is not valid base64: {exc}',
        ) from exc
    return await asyncio.to_thread(infer_deepfake, data)

# Marker in the cell output confirming the app and its routes were defined.
print('✓ FastAPI app created (async endpoints)')


In [None]:
# Cell 5 — Start server and expose via ngrok
from pyngrok import ngrok
import threading, time

# ── Get ngrok token ──────────────────────────────────────────────────────────
# Resolution order: the Kaggle secret NGROK_TOKEN first, then
# NGROK_TOKEN_FALLBACK. A secret that cannot be read *or* comes back empty
# falls through to the fallback, so ngrok is never configured with a blank
# token.
NGROK_TOKEN_FALLBACK = ""  # ← paste your ngrok token here if secret doesn't work

ngrok_token = None
try:
    from kaggle_secrets import UserSecretsClient
    secrets = UserSecretsClient()
    ngrok_token = secrets.get_secret('NGROK_TOKEN')
except Exception as e:
    print(f"⚠ Could not load Kaggle secret: {e}")
else:
    if ngrok_token:
        print("✓ Loaded ngrok token from Kaggle secret")
    else:
        print("⚠ Kaggle secret NGROK_TOKEN is empty")

if not ngrok_token:
    if not NGROK_TOKEN_FALLBACK:
        # Raised outside the except block so the traceback shows only this
        # actionable message, not a chained secret-lookup error.
        raise RuntimeError(
            "No ngrok token found!\n"
            "Fix option 1: Kaggle → Add-ons → Secrets → toggle NGROK_TOKEN ON\n"
            "Fix option 2: Paste your token in NGROK_TOKEN_FALLBACK above"
        )
    ngrok_token = NGROK_TOKEN_FALLBACK
    print("✓ Using fallback token from NGROK_TOKEN_FALLBACK")

ngrok.set_auth_token(ngrok_token)

# ── Start FastAPI ────────────────────────────────────────────────────────────
# Concurrency comes from the endpoints themselves: each one pushes its
# blocking inference call onto a worker thread with asyncio.to_thread, so the
# event loop stays free to accept further requests.
def run_server():
    """Serve the FastAPI app with uvicorn on 0.0.0.0:8000 (blocks until the server stops)."""
    server_options = {
        'host': '0.0.0.0',
        'port': 8000,
        'log_level': 'warning',
        'workers': 1,        # single process (models are in-memory)
        'loop': 'asyncio',
    }
    uvicorn.run(app, **server_options)

# Run uvicorn on a background thread so this cell can continue; daemon=True
# means the thread does not keep the process alive on its own.
thread = threading.Thread(target=run_server, daemon=True)
thread.start()
# Fixed pause before the tunnel is opened.
# NOTE(review): presumably meant to give uvicorn time to bind port 8000;
# nothing here verifies that the server actually started.
time.sleep(2)

# ── Open ngrok tunnel ─────────────────────────────────────────────────────────
# Forward a public ngrok URL to local port 8000. public_url is reused by the
# keep-alive cell.
tunnel = ngrok.connect(8000)
public_url = tunnel.public_url

# Print the connection instructions as one banner.
_rule = '=' * 60
_banner = [
    _rule,
    'KAGGLE GPU INFERENCE SERVER IS RUNNING',
    _rule,
    f'Public URL: {public_url}',
    '',
    'Copy the URL above, then in your LOCAL terminal run:',
    '  set USE_REMOTE_INFERENCE=true',
    f'  set KAGGLE_INFERENCE_URL={public_url}',
    '  python run_api.py',
    _rule,
]
print('\n'.join(_banner))


In [None]:
# Cell 6 — Keep-alive (run this to keep the session alive)
# Kaggle sessions stay active as long as a cell is running.
# This cell never returns on its own: it sleeps 300 seconds (5 minutes),
# prints a numbered heartbeat with the public URL from Cell 5, and repeats
# until the cell is stopped by hand.
import time
print('Keep-alive started. Press Stop to shut down the server.')
i = 0  # number of heartbeats printed so far
while True:
    time.sleep(300)
    i += 1
    print(f'[heartbeat {i}] Server still running at {public_url}')