# Qwen2.5-Coder-7B (4-bit) â†’ HTML to JSX + Vue SFC Pipeline

**Optimized for RTX 3070 Ti** with 4-bit quantization to fit VRAM.

## 0) One-Time Setup

### Python packages:
```python
!pip -q install -U transformers accelerate bitsandbytes pandas tqdm
```

### Node.js tools (run in terminal, not Jupyter):
```powershell
npm install @babel/parser @vue/compiler-sfc
```

**After installing Node tools, restart Jupyter so PATH updates.**

## 1) Imports

In [3]:
import os
import re
import json
import subprocess
from typing import Tuple, Dict, List

import pandas as pd
from tqdm import tqdm

print("Core imports successful")

Core imports successful


## 2) Configuration

## 2.1) Load & View Input Data

In [4]:
df_raw = pd.read_csv(IN_CSV)

print(f"Shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}\n")

# Show first 3 rows
display(df_raw.head(3))

# Quick stats
print(f"\nHTML (text) column:")
print(f"  Non-empty: {df_raw['text'].notna().sum()}/{len(df_raw)}")
print(f"  Avg length: {df_raw['text'].str.len().mean():.0f} chars")
print(f"  Min length: {df_raw['text'].str.len().min():.0f} chars")
print(f"  Max length: {df_raw['text'].str.len().max():.0f} chars")

print(f"\nSample HTML (row 0, first 500 chars):")
print(df_raw['text'].iloc[0][:500])

NameError: name 'IN_CSV' is not defined

In [5]:
IN_CSV   = "data/websight_50k/websight_50k.csv"
OUT_CSV  = "data/websight_50k/websight_50k_gen.csv"
HTML_COL = "text"

PICSUM_HERO_W, PICSUM_HERO_H = 1600, 900
PICSUM_LOGO_W, PICSUM_LOGO_H = 300, 300
PICSUM_IMG_W,  PICSUM_IMG_H  = 900, 600

print(f"Input CSV : {IN_CSV}")
print(f"Output CSV: {OUT_CSV}")
print(f"HTML col  : {HTML_COL}")

Input CSV : data/websight_50k/websight_50k.csv
Output CSV: data/websight_50k/websight_50k_gen.csv
HTML col  : text


## 3) Image Sanitization Helpers

In [None]:
def picsum(seed: int, w: int, h: int) -> str:
    return f"https://picsum.photos/seed/{seed}/{w}/{h}"


def sanitize_html_assets(html: str, seed: int) -> str:
    s = html
    s = re.sub(
        r"background-image:\s*url\(['\"]?[^'\")]+['\"]?\)",
        lambda m: f"background-image: url('{picsum(seed, PICSUM_HERO_W, PICSUM_HERO_H)}')",
        s, flags=re.IGNORECASE,
    )
    s = re.sub(r"https?://source\.unsplash\.com/[^\s\"')]+", picsum(seed, PICSUM_IMG_W, PICSUM_IMG_H), s)
    s = re.sub(r"https?://images\.unsplash\.com/[^\s\"')]+", picsum(seed, PICSUM_IMG_W, PICSUM_IMG_H), s)

    def repl_img(tag):
        full = tag.group(0)
        lower = full.lower()
        is_logo = "logo" in lower
        w, h = (PICSUM_LOGO_W, PICSUM_LOGO_H) if is_logo else (PICSUM_IMG_W, PICSUM_IMG_H)
        url = picsum(seed, w, h)
        full = re.sub(r'src\s*=\s*["\'][^"\']*["\']', f'src="{url}"', full, flags=re.IGNORECASE)
        if re.search(r"\salt\s*=", full, flags=re.IGNORECASE) is None:
            full = full[:-1] + ' alt="Image">'
        return full

    s = re.sub(r"<img\b[^>]*\bsrc\s*=\s*['\"][^'\"]*['\"][^>]*>", repl_img, s, flags=re.IGNORECASE)
    return s


def enforce_picsum_in_code(code: str, seed: int) -> str:
    s = code
    s = re.sub(r"https?://source\.unsplash\.com/[^\s\"')]+", picsum(seed, PICSUM_IMG_W, PICSUM_IMG_H), s)
    s = re.sub(r"https?://images\.unsplash\.com/[^\s\"')]+", picsum(seed, PICSUM_IMG_W, PICSUM_IMG_H), s)

    def repl_src(m):
        url = m.group(1)
        if "picsum.photos" in url:
            return m.group(0)
        return f'src="{picsum(seed, PICSUM_IMG_W, PICSUM_IMG_H)}"'

    s = re.sub(r'src\s*=\s*"([^"]+)"', repl_src, s)
    s = re.sub(r"src\s*=\s*'([^']+)'", lambda m: repl_src(m).replace('"', "'"), s)
    return s

print("Sanitization helpers defined")

Sanitization helpers defined


## 4) Node.js Validators

In [None]:
def validate_jsx(jsx_code: str) -> Tuple[bool, str]:
    node_script = r"""
const parser = require("@babel/parser");
let input = "";
process.stdin.on("data", c => input += c);
process.stdin.on("end", () => {
  try {
    parser.parse(input, {sourceType:"module", plugins:["jsx","typescript"]});
    process.stdout.write(JSON.stringify({ok:true}));
  } catch(e) {
    process.stdout.write(JSON.stringify({ok:false, error:String(e.message||e)}));
  }
});
"""
    p = subprocess.run(["node","-e",node_script], input=jsx_code.encode("utf-8"),
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out = p.stdout.decode("utf-8", errors="ignore").strip()
    try:
        j = json.loads(out)
        return bool(j["ok"]), j.get("error", "")
    except:
        return False, f"JSX validator error: {out[-200:] if out else p.stderr.decode('utf-8',errors='ignore')[-200:]}"


def validate_vue(vue_sfc: str) -> Tuple[bool, str]:
    node_script = r"""
const { parse } = require("@vue/compiler-sfc");
let input = "";
process.stdin.on("data", c => input += c);
process.stdin.on("end", () => {
  try {
    const res = parse(input, { filename: "Component.vue" });
    const errs = res.errors || [];
    if (errs.length) {
      process.stdout.write(JSON.stringify({ok:false, error:String(errs[0])}));
    } else {
      process.stdout.write(JSON.stringify({ok:true}));
    }
  } catch(e) {
    process.stdout.write(JSON.stringify({ok:false, error:String(e.message||e)}));
  }
});
"""
    p = subprocess.run(["node","-e",node_script], input=vue_sfc.encode("utf-8"),
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out = p.stdout.decode("utf-8", errors="ignore").strip()
    try:
        j = json.loads(out)
        return bool(j["ok"]), j.get("error", "")
    except:
        return False, f"Vue validator error: {out[-200:] if out else p.stderr.decode('utf-8',errors='ignore')[-200:]}"

print("Validators defined")

Validators defined


## 5) Load Model (4-bit)

In [None]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Free previous model from GPU if re-running this cell
if 'model' in dir():
    del model
if 'tokenizer' in dir():
    del tokenizer
gc.collect()
torch.cuda.empty_cache()
print(f"GPU memory: {torch.cuda.mem_get_info()[0]/1024**3:.1f} GiB free / {torch.cuda.mem_get_info()[1]/1024**3:.1f} GiB total")

MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"
print(f"Loading {MODEL_NAME} in 4-bit...")

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb,
    device_map="cuda:0",
    torch_dtype=torch.float16,
)

print(f"Model loaded on GPU (~8 GB VRAM)")

Loading Qwen/Qwen2.5-Coder-7B-Instruct in 4-bit...




Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.49 GiB is allocated by PyTorch, and 37.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 6) Prompts & Generation

In [None]:
def p_html_to_jsx(html: str) -> str:
    return f"""You are a strict frontend compiler.
Convert the given HTML into a single React component.

Rules:
- Output ONLY code.
- Export a named component: GeneratedComponent
- Preserve Tailwind classes.
- Use className (not class).
- Convert inline style strings to JSX objects.
- Do NOT include <html>, <head>, <body>, or external <link> tags.
- If there are <img> tags, keep the src URLs as provided (they are already Picsum).
- Ensure valid JSX.

HTML:
```html
{html}
```""".strip()


def p_html_to_vue(html: str) -> str:
    return f"""You are a strict frontend compiler.
Convert the given HTML into a Vue 3 Single File Component (SFC).

Rules:
- Output ONLY code.
- Must be valid .vue SFC with <template> and <script setup>.
- Preserve Tailwind classes.
- Do NOT include <html>, <head>, <body>, or external <link> tags.
- If there are <img> tags, keep the src URLs as provided (they are already Picsum).
- Ensure valid Vue SFC.

HTML:
```html
{html}
```""".strip()


def p_fix_jsx(bad: str, err: str) -> str:
    return f"""Fix the following React JSX so it parses with Babel.

Rules:
- Output ONLY corrected code.
- Keep component name GeneratedComponent.
- Keep Tailwind classes.
- No <html>/<head>/<body>/<link>.
- Do not introduce Unsplash; keep Picsum if images exist.

Babel error:
{err}

Code:
{bad}""".strip()


def p_fix_vue(bad: str, err: str) -> str:
    return f"""Fix the following Vue SFC so it parses with @vue/compiler-sfc.

Rules:
- Output ONLY corrected code.
- Must contain <template> and <script setup>.
- No <html>/<head>/<body>/<link> in template.
- Do not introduce Unsplash; keep Picsum if images exist.

Compiler error:
{err}

Code:
{bad}""".strip()


def strip_code_fences(text: str) -> str:
    """Remove markdown code fences (```lang ... ```) from LLM output."""
    s = text.strip()
    # Match ```lang\n...\n``` pattern
    m = re.search(r'```(?:\w+)?\s*\n(.*?)```', s, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Also handle if entire string starts with ``` and ends with ```
    if s.startswith('```'):
        s = re.sub(r'^```\w*\n?', '', s)
        s = re.sub(r'\n?```$', '', s)
        return s.strip()
    return s


def llm_generate(prompt: str, max_new_tokens=600) -> str:
    # Qwen2.5-Coder-Instruct needs chat template format
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens (skip prompt)
    new_tokens = out[0][inputs['input_ids'].shape[1]:]
    txt = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return strip_code_fences(txt)

print("Prompts and generation defined")

Prompts and generation defined


## 7) Pipeline Function

In [None]:
def gen_validate(html: str, seed: int, repair_tries=2) -> Dict[str, str]:
    html2 = sanitize_html_assets(html, seed)

    # JSX
    jsx = llm_generate(p_html_to_jsx(html2), max_new_tokens=650)
    jsx = enforce_picsum_in_code(jsx, seed)
    ok, err = validate_jsx(jsx)
    t = 0
    while (not ok) and t < repair_tries:
        t += 1
        jsx = llm_generate(p_fix_jsx(jsx, err), max_new_tokens=500)
        jsx = enforce_picsum_in_code(jsx, seed)
        ok, err = validate_jsx(jsx)
    jsx_ok, jsx_err = ok, err

    # Vue
    vue = llm_generate(p_html_to_vue(html2), max_new_tokens=750)
    vue = enforce_picsum_in_code(vue, seed)
    ok, err = validate_vue(vue)
    t = 0
    while (not ok) and t < repair_tries:
        t += 1
        vue = llm_generate(p_fix_vue(vue, err), max_new_tokens=600)
        vue = enforce_picsum_in_code(vue, seed)
        ok, err = validate_vue(vue)
    vue_ok, vue_err = ok, err

    return {
        "html_sanitized": html2,
        "jsx_code": jsx,
        "vue_sfc": vue,
        "jsx_valid": jsx_ok,
        "vue_valid": vue_ok,
        "jsx_error": jsx_err,
        "vue_error": vue_err,
    }

print("Pipeline function defined")

Pipeline function defined


## 8) Execute on CSV

In [None]:
import csv

SAMPLE_SIZE = 5  # how many to process this run (set to None for all)
PROGRESS_FILE = os.path.join(os.path.dirname(OUT_CSV), "progress.txt")

# --- Load input data ---
print(f"Loading CSV: {IN_CSV}")
df_full = pd.read_csv(IN_CSV)
print(f"Loaded {len(df_full)} rows, columns: {list(df_full.columns)}")
assert HTML_COL in df_full.columns

# --- Read progress (which index to start from) ---
start_idx = 0
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, "r") as f:
        start_idx = int(f.read().strip())
    print(f"Resuming from index {start_idx}")
else:
    print(f"Starting fresh from index 0")

# --- Determine end index ---
end_idx = min(start_idx + SAMPLE_SIZE, len(df_full)) if SAMPLE_SIZE else len(df_full)
todo = end_idx - start_idx
print(f"Will process rows {start_idx} to {end_idx - 1} ({todo} rows)\n")

if todo <= 0:
    print("Nothing to process.")
else:
    # --- CSV output columns ---
    OUT_COLS = [
        "id", "image_path", "text", "llm_generated_idea",
        "jsx_code", "vue_sfc", "jsx_valid", "vue_valid", "jsx_error", "vue_error",
    ]

    # --- Open output CSV (append if resuming, write header if new) ---
    os.makedirs(os.path.dirname(OUT_CSV) or ".", exist_ok=True)
    write_header = (start_idx == 0) or not os.path.exists(OUT_CSV)
    f_out = open(OUT_CSV, "a" if not write_header else "w", newline="", encoding="utf-8")
    writer = csv.DictWriter(f_out, fieldnames=OUT_COLS)
    if write_header:
        writer.writeheader()

    jsx_valid_count = 0
    vue_valid_count = 0

    for idx in tqdm(range(start_idx, end_idx), desc="Generating"):
        row = df_full.iloc[idx]
        html = "" if pd.isna(row[HTML_COL]) else str(row[HTML_COL])
        seed = int(row["id"]) if ("id" in df_full.columns and pd.notna(row["id"])) else idx

        r = gen_validate(html, seed, repair_tries=2)

        # Build output row
        out_row = {}
        for c in OUT_COLS:
            if c in r:
                out_row[c] = r[c]
            elif c in row.index:
                out_row[c] = "" if pd.isna(row[c]) else row[c]
            else:
                out_row[c] = ""

        # Write row immediately
        writer.writerow(out_row)
        f_out.flush()

        # Update progress file
        with open(PROGRESS_FILE, "w") as pf:
            pf.write(str(idx + 1))

        if r["jsx_valid"]:
            jsx_valid_count += 1
        if r["vue_valid"]:
            vue_valid_count += 1

    f_out.close()

    print(f"\nDone! Processed rows {start_idx}-{end_idx - 1}")
    print(f"Output: {OUT_CSV}")
    print(f"Progress: {PROGRESS_FILE} (next start = {end_idx})")
    print(f"JSX valid: {jsx_valid_count}/{todo} ({jsx_valid_count/todo:.1%})")
    print(f"Vue valid: {vue_valid_count}/{todo} ({vue_valid_count/todo:.1%})")

# Load result for downstream cells
out_df = pd.read_csv(OUT_CSV, encoding="utf-8")

NameError: name 'os' is not defined

## 9) Results Summary

In [None]:
print("=== VALIDATION SUMMARY ===")
print(f"Rows: {len(out_df)}")
print(f"JSX valid: {out_df['jsx_valid'].sum()}/{len(out_df)} ({out_df['jsx_valid'].mean():.1%})")
print(f"Vue valid: {out_df['vue_valid'].sum()}/{len(out_df)} ({out_df['vue_valid'].mean():.1%})")

if len(out_df) > 0:
    s = out_df.iloc[0]
    print(f"\n--- Row 0 ---")
    print(f"JSX valid={s['jsx_valid']}, first 200 chars:")
    print(str(s['jsx_code'])[:200])
    print(f"\nVue valid={s['vue_valid']}, first 200 chars:")
    print(str(s['vue_sfc'])[:200])

jsx_errs = out_df[out_df["jsx_error"].notna() & (out_df["jsx_error"] != "")]
vue_errs = out_df[out_df["vue_error"].notna() & (out_df["vue_error"] != "")]
if len(jsx_errs) > 0:
    print(f"\nJSX errors ({len(jsx_errs)} rows):")
    for idx, row in jsx_errs.head(3).iterrows():
        print(f"  [{idx}] {str(row['jsx_error'])[:100]}")
if len(vue_errs) > 0:
    print(f"\nVue errors ({len(vue_errs)} rows):")
    for idx, row in vue_errs.head(3).iterrows():
        print(f"  [{idx}] {str(row['vue_error'])[:100]}")

=== VALIDATION SUMMARY ===
Rows: 5
JSX valid: 5/5 (100.0%)
Vue valid: 0/5 (0.0%)

--- Row 0 ---
JSX valid=True, first 200 chars:


Vue valid=False, first 200 chars:


Vue errors (5 rows):
  [0] SyntaxError: At least one <template> or <script> is required in a single file component. Component.v
  [1] SyntaxError: At least one <template> or <script> is required in a single file component. Component.v
  [2] SyntaxError: At least one <template> or <script> is required in a single file component. Component.v


## 10) Save Merged Dataset

In [None]:
MERGED_CSV = "data/websight_50k/websight_50k_merged.csv"

KEEP_COLS = [
    "id", "image_path", "text", "llm_generated_idea",
    "jsx_code", "vue_sfc", "jsx_valid", "vue_valid", "jsx_error", "vue_error",
]
cols = [c for c in KEEP_COLS if c in out_df.columns]
merged_df = out_df[cols].copy()

os.makedirs(os.path.dirname(MERGED_CSV) or ".", exist_ok=True)
merged_df.to_csv(MERGED_CSV, index=False, encoding="utf-8")

print(f"Saved {len(merged_df)} rows to: {MERGED_CSV}")
print(f"Columns: {list(merged_df.columns)}")
print(f"Size: {os.path.getsize(MERGED_CSV) / 1024:.1f} KB")
print(f"JSX non-empty: {(merged_df['jsx_code'].str.len() > 10).sum()}/{len(merged_df)}")
print(f"Vue non-empty: {(merged_df['vue_sfc'].str.len() > 10).sum()}/{len(merged_df)}")

Saved 5 rows to: data/websight_100_stream/websight_100_merged.csv
Columns: ['id', 'image_path', 'text', 'llm_generated_idea', 'jsx_code', 'vue_sfc', 'jsx_valid', 'vue_valid', 'jsx_error', 'vue_error']
Size: 12.2 KB
JSX non-empty: 0/5
Vue non-empty: 0/5


## 11) Check 5 Samples

In [None]:
df_check = pd.read_csv(MERGED_CSV, encoding="utf-8")
print(f"Loaded {len(df_check)} rows\n")

NUM = 5
samples = df_check.head(NUM)

def safe(val):
    return str(val) if pd.notna(val) else ""

for i, (_, row) in enumerate(samples.iterrows()):
    print("=" * 80)
    print(f"  SAMPLE {i+1}/{NUM}")
    print("=" * 80)

    print(f"\n[id]         {row.get('id', 'N/A')}")
    print(f"[image_path] {safe(row.get('image_path'))}")

    idea = safe(row.get("llm_generated_idea"))
    print(f"\n[llm_generated_idea]")
    print(idea[:300] + ("..." if len(idea) > 300 else ""))

    html = safe(row.get("text"))
    print(f"\n[Original HTML] ({len(html)} chars)")
    print(html[:400] + ("..." if len(html) > 400 else ""))

    jsx = safe(row.get("jsx_code"))
    print(f"\n[JSX] valid={row.get('jsx_valid', False)}  ({len(jsx)} chars)")
    print(jsx[:500] + ("..." if len(jsx) > 500 else ""))
    jsx_e = safe(row.get("jsx_error"))
    if jsx_e:
        print(f"  Error: {jsx_e[:150]}")

    vue = safe(row.get("vue_sfc"))
    print(f"\n[Vue SFC] valid={row.get('vue_valid', False)}  ({len(vue)} chars)")
    print(vue[:500] + ("..." if len(vue) > 500 else ""))
    vue_e = safe(row.get("vue_error"))
    if vue_e:
        print(f"  Error: {vue_e[:150]}")

    print()

print("=" * 80)
print(f"Done - reviewed {NUM} samples")

Loaded 5 rows

  SAMPLE 1/5

[id]         0
[image_path] data/websight_100_stream\images\00000.png

[llm_generated_idea]
Fashion Brand: A visually stunning layout with a full-width, rotating image carousel showcasing their latest collections, a bold, center-aligned logo, and a bottom navigation menu. The color palette is inspired by the latest fashion trends.

[Original HTML] (1274 chars)
<html>
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
<body class="bg-gray-100">
  <div class="flex flex-col items-center justify-center h-screen">
    <h1 class="text-4xl font-bold text-center text-gray-800">Fashion Brand</h1>
    <p class="mt-4 text-lg text-center text-gray-600">
      Fashion Brand is a leading fashion brand that offers a wi...

[JSX] valid=True  (0 chars)


[Vue SFC] valid=False  (0 chars)

  Error: SyntaxError: At least one <template> or <script> is required in a single file component. Component.vue

  SAMPLE 2/5

[id]        

## Notes

- **3070 Ti + 4-bit**: ~8GB VRAM, zero crashes
- **Token limits**: JSX=650, Vue=750, repairs=500-600
- **Validation**: Parse-level (Babel / Vue compiler-sfc)
- **Auto-repair**: Up to 2 retries per output
- **Images**: All external URLs replaced with deterministic Picsum

### Next Steps
- Upgrade to actual build validation (React + Vue)
- Scale to larger samples (500+)
- Fine-tune prompts if validation rates are low