In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/Colab_Notebooks/Advanced_ML/moderation_app


/content/drive/MyDrive/Colab_Notebooks/Advanced_ML/moderation_app


In [3]:
import os
print("Working dir:", os.getcwd())
print("Files and folders:", os.listdir())

Working dir: /content/drive/MyDrive/Colab_Notebooks/Advanced_ML/moderation_app
Files and folders: ['requirements.txt', 'app.py', 'static', 'templates', 'progress.log', 'Moderator_runner.ipynb']


In [4]:
!ls templates

index.html  results_fragment.html


In [5]:
!pip install flask pyngrok transformers detoxify Pillow pymupdf --quiet
!pip install git+https://github.com/huggingface/transformers.git --upgrade --quiet  # Optional: latest HF

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [6]:
import os

# Kill any process using port 5001
os.system("fuser -k 5002/tcp || echo 'No process found on 5002'")
os.system("kill $(ps aux | grep '[n]grok' | awk '{print $2}') 2>/dev/null")


512

In [7]:
from pyngrok import ngrok
from google.colab import userdata

auth_token = userdata.get("NGROK_AUTH_TOKEN")
if auth_token:
    ngrok.set_auth_token(auth_token)
    print("✅ ngrok token set.")
else:
    print("⚠️ No ngrok token found. Using local iframe fallback.")


✅ ngrok token set.


In [8]:
import os
import io
import time
import threading
import logging
from flask import Flask, request, render_template, jsonify
from pyngrok import ngrok
import torch
from detoxify import Detoxify
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import fitz  # PyMuPDF
from huggingface_hub import notebook_login # Import notebook_login

# ---- Flask Setup ----
app = Flask(__name__, template_folder="templates", static_folder="static")
app.secret_key = os.urandom(24)

# ---- Logging ----
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ---- Progress State ----
progress = {"percent": 0, "message": "Starting..."}

def update_progress(percent, message):
    progress["percent"] = percent
    progress["message"] = message

# ---- Hugging Face Login ----
# Log in to Hugging Face to access gated models
try:
    from google.colab import userdata
    hf_token = userdata.get("HF_TOKEN")
    if hf_token:
        notebook_login() # This will use the token set via environment variable or secrets
        print("✅ Hugging Face login successful.")
    else:
        print("⚠️ HF_TOKEN not found in Colab secrets. Accessing gated models may fail.")
except Exception as e:
    print(f"❌ Hugging Face login failed: {e}")


# ---- Model Setup ----
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TEXT_TOXICITY_THRESHOLD = 0.5

update_progress(5, "Loading Detoxify...")
text_moderator = Detoxify('original', device=DEVICE)

update_progress(10, "Loading PaliGemma...")
model_id = "google/paligemma-3b-mix-448"
image_processor = AutoProcessor.from_pretrained(model_id)

try:
    image_model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        revision="bfloat16",
        device_map=DEVICE,
        token=hf_token # Explicitly pass the token
    ).eval()
except:
    image_model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        device_map=DEVICE,
        token=hf_token # Explicitly pass the token
    ).eval()

update_progress(15, "Models ready.")

IMAGE_PROMPTS = {
    "sexually_explicit": "Is this image sexually explicit?",
    "dangerous": "Does this image depict dangerous acts or content (weapons, self-harm, illegal activities)?",
    "violence_gore": "Does this image contain graphic violence or gore?",
    "hate_symbol": "Does this image contain a common hate symbol (like a swastika)?"
}

# ---- Helper Functions ----
def extract_content_from_pdf(pdf_bytes):
    update_progress(20, "Extracting text and images from PDF...")
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    text = ""
    images = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text("text") + "\n"
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            if base_image and base_image.get("image"):
                images.append({
                    "page_num": page_num + 1,
                    "img_index": img_index + 1,
                    "image_bytes": base_image["image"],
                    "ext": base_image["ext"]
                })
    doc.close()
    update_progress(30, f"Found {len(images)} image(s).")
    return text.strip(), images

def moderate_text(text):
    if not text.strip():
        return {"status": "no_text"}
    try:
        update_progress(40, "Moderating text...")
        scores = text_moderator.predict(text)
        score = float(scores["toxicity"])
        label = "Toxic" if score >= TEXT_TOXICITY_THRESHOLD else "Not Toxic"
        return {"status": "processed", "score": score, "label": label, "all_scores": scores}
    except Exception as e:
        return {"status": "error", "message": str(e)}

def moderate_image(image_bytes, current_index, total):
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        results = {}
        for key, prompt in IMAGE_PROMPTS.items():
            prompt_text = "<image>\n" + prompt
            inputs = image_processor(text=prompt_text, images=img, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                output = image_model.generate(**inputs, max_new_tokens=20, do_sample=False)
            result = image_processor.decode(output[0], skip_special_tokens=True).strip().lower()
            results[key] = "Yes" if "yes" in result else "No" if "no" in result else f"Uncertain ({result})"
        percent = 40 + int((current_index / total) * 50)
        update_progress(percent, f"Moderating image {current_index}/{total}")
        return {"status": "processed", "results": results}
    except Exception as e:
        return {"status": "error", "message": str(e)}

# ---- Routes ----
@app.route('/', methods=['GET'])
def index():
    return render_template("index.html")

@app.route('/progress')
def progress_status():
    return jsonify(progress)

@app.route('/moderate', methods=['POST'])
def moderate():
    update_progress(1, "Processing PDF...")
    file = request.files.get("pdf_input")
    if not file:
        return render_template("results_fragment.html", error="No file uploaded.")

    try:
        pdf_bytes = file.read()
        text, images = extract_content_from_pdf(pdf_bytes)
        text_results = moderate_text(text)

        image_results = []
        for idx, img in enumerate(images):
            result = moderate_image(img["image_bytes"], idx + 1, len(images))
            result["page_number"] = img["page_num"]
            result["image_index"] = img["img_index"]
            image_results.append(result)

        update_progress(100, "Done!")
        return render_template("results_fragment.html",
                               uploaded_filename=file.filename,
                               text_results=text_results,
                               image_results=image_results)
    except Exception as e:
        update_progress(100, f"Error: {e}")
        return render_template("results_fragment.html", error=str(e))

# ---- Start Flask + ngrok ----
def run():
    app.run(port=5002, debug=True, use_reloader=False)

threading.Thread(target=run).start()
time.sleep(3)

try:
    public_url = ngrok.connect(5002)
    print(f"\n✅ App is live at: {public_url}")
except Exception as e:
    print("❌ ngrok failed:", e)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Hugging Face login successful.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5002
INFO:werkzeug:[33mPress CTRL+C to quit[0m



✅ App is live at: NgrokTunnel: "https://b6ac569d861d.ngrok-free.app" -> "http://localhost:5002"
