# SKU/Description Demo — **Zero‑Setup + IKEA Prefilled** (No API, No HF token)

This Colab runs **fully locally** via **GPT4All** (auto‑downloads a small GGUF model). No keys, no tokens.

Workflow:
1) Auto‑download a public **IKEA BESTÅ** buying guide PDF (US, Jan 2025) — or fall back to manual upload.
2) Extract product‑like lines (OCR fallback).
3) Apply **small random perturbations** (typo / size tweak / standard off‑by‑one).
4) Ask a local model to output a **strict JSON diff**: `{is_changed, differences[], confidence, notes}`.
5) Export CSV/XLSX.


In [1]:
# %%capture
!pip -q install gpt4all pydantic pandas numpy openpyxl rapidfuzz pdfplumber pillow pytesseract pdf2image requests
!apt-get -y -qq install poppler-utils tesseract-ocr > /dev/null
import pandas as pd, numpy as np, re, json, os, random, requests
import pdfplumber
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from pydantic import BaseModel, Field
from rapidfuzz import fuzz
from gpt4all import GPT4All
print('Setup complete.')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[?25hSetup complete.


## Config
- `USE_DEFAULT_PDF=True` to auto‑fetch the IKEA BESTÅ PDF; otherwise upload your own.
- `PAGE_INDEX` selects which page to parse (0‑based). Page 3 usually has list‑like lines.
- `GPT4ALL_MODEL` is a small default model; it will auto‑download on first run.

In [2]:
USE_DEFAULT_PDF = True
DEFAULT_URL = 'https://www.ikea.com/us/en/files/pdf/43/d8/43d81f01/besta_bw_jan_2025_np.pdf'
DEFAULT_LOCAL = '/content/ikea_besta_jan_2025.pdf'
PAGE_INDEX = 3  # 0-based; adjust if needed

GPT4ALL_MODEL = 'orca-mini-3b-gguf2-q4_0.gguf'  # tokenless, quick to start
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
print('Config ready.')

Config ready.


## Get the PDF (auto-download or upload)

In [3]:
PDF_PATH = None
if USE_DEFAULT_PDF:
    try:
        print('Downloading IKEA BESTÅ PDF…')
        r = requests.get(DEFAULT_URL, stream=True, timeout=60)
        r.raise_for_status()
        with open(DEFAULT_LOCAL, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1<<20):
                if chunk:
                    f.write(chunk)
        PDF_PATH = DEFAULT_LOCAL
        print('Saved to', PDF_PATH)
    except Exception as e:
        print('Auto-download failed:', e)

if not PDF_PATH:
    from google.colab import files
    print('Upload a PDF instead…')
    uploaded = files.upload()
    PDF_PATH = next(iter(uploaded.keys()))
    print('Using uploaded file:', PDF_PATH)

Downloading IKEA BESTÅ PDF…
Saved to /content/ikea_besta_jan_2025.pdf


## Extract text from the PDF (OCR fallback)

In [4]:
def extract_text_from_pdf(path, page_index=0):
    with pdfplumber.open(path) as pdf:
        if page_index >= len(pdf.pages):
            raise ValueError(f'PDF has only {len(pdf.pages)} pages')
        page = pdf.pages[page_index]
        text = page.extract_text() or ''
    text = text.strip()
    if text:
        return text
    # OCR fallback if no text layer
    print('No text layer found; running OCR…')
    images = convert_from_path(path, first_page=page_index+1, last_page=page_index+1, fmt='png')
    if not images:
        return ''
    gray = images[0].convert('L')
    return pytesseract.image_to_string(gray)

raw_text = extract_text_from_pdf(PDF_PATH, PAGE_INDEX)
print('Sample (first 800 chars):\n', raw_text[:800])

Sample (first 800 chars):
 COMBINATIONS
Overall size: 70⅞×16½×29⅛". Overall size: 47¼×16½×18⅞".
This combination: 894.888.13 This combination: 995.665.51
BESTÅ Frame 47¼×15¾×25¼", White 102.458.46 1 pc BESTÅ TV unit 47¼×15¾×15", White 705.660.33 1 pc
BESTÅ Frame 23½×15¾×25¼", White 302.458.50 1 pc BESTÅ Shelf 22×14⅛", White 002.955.54 2 pcs
GLASSVIK Glass door 23½×25⅛", White/clear glass 405.409.02 3 pcs LAPPVIKEN Door/drawer front 23½×15", White 002.916.74 2 pcs
STUBBARP Legs, White, 2pk 102.935.64 2 pcs BESTÅ Soft-closing/push-open hinges, 2pk 802.612.58 2 pcs
BESTÅ Soft-closing/push-open hinges, 2pk 802.612.58 3 pcs STUBBARP Legs, White, 2pk 102.935.64 2 pcs
BESTÅ Glass shelf 22×14⅛", Glass 602.955.32 6 pcs BESTÅ Supporting leg 3⅞", Gray 502.936.04 1 pc
Overall size: 47¼×16½×29⅛". Overall size: 70⅞×16½×15⅜".
This


## Build dataset (heuristic line picking)

In [5]:
lines = [ln.strip() for ln in raw_text.splitlines() if ln.strip()]
def looks_like_product(ln: str) -> bool:
    low = ln.lower()
    flags = [
        any(tok in low for tok in ['frame','door','drawer','shelf','tv unit','combination','overall size','hinges','glass','screw','nut','washer']),
        bool(re.search(r'\b\d{3}\.\d{3}\.\d{2}\b', ln)),  # IKEA article no. pattern
        bool(re.search(r'\b\d+[×x]\d+\b', ln)),              # size tokens
    ]
    return sum(bool(f) for f in flags) >= 1

candidates = list(dict.fromkeys([ln for ln in lines if looks_like_product(ln)]))
def synthesize_sku(ln: str, idx: int) -> str:
    m = re.search(r'\b(\d{3}\.\d{3}\.\d{2})\b', ln)
    return (m.group(1) + f'-{idx:02d}') if m else f'PROD-{idx:02d}'

rows = [{'sku': synthesize_sku(ln, i+1), 'description': ln} for i, ln in enumerate(candidates[:10])]
df = pd.DataFrame(rows)
df

Unnamed: 0,sku,description
0,PROD-01,COMBINATIONS
1,PROD-02,"Overall size: 70⅞×16½×29⅛"". Overall size: 47¼×..."
2,894.888.13-03,This combination: 894.888.13 This combination:...
3,102.458.46-04,"BESTÅ Frame 47¼×15¾×25¼"", White 102.458.46 1 p..."
4,302.458.50-05,"BESTÅ Frame 23½×15¾×25¼"", White 302.458.50 1 p..."
5,405.409.02-06,"GLASSVIK Glass door 23½×25⅛"", White/clear glas..."
6,102.935.64-07,"STUBBARP Legs, White, 2pk 102.935.64 2 pcs BES..."
7,802.612.58-08,"BESTÅ Soft-closing/push-open hinges, 2pk 802.6..."
8,602.955.32-09,"BESTÅ Glass shelf 22×14⅛"", Glass 602.955.32 6 ..."
9,PROD-10,"Overall size: 47¼×16½×29⅛"". Overall size: 70⅞×..."


## Perturbations (random tiny changes)

In [6]:
def tweak_standard(text: str) -> str:
    def repl(m):
        try:
            num = int(m.group(2))
            return f"{m.group(1)} {num+1}"
        except:
            return m.group(0)
    return re.sub(r'\b(DIN|ISO)\s*(\d{3,5})\b', repl, text, count=1)

def tweak_size(text: str) -> str:
    def repl_m(m):
        try:
            mm = int(m.group(2))
            return f"M{m.group(1)}x{mm+1}"
        except:
            return m.group(0)
    t = re.sub(r'\bM\s*(\d+)x(\d+)\b', repl_m, text, count=1, flags=re.I)
    # If no metric pattern, try generic size token A×B -> increase B by 1 if integer
    if t == text:
        m = re.search(r'(\d+)\s*[×x]\s*(\d+)', text)
        if m:
            a, b = m.group(1), m.group(2)
            try:
                b2 = str(int(b) + 1)
                t = text.replace(m.group(0), f"{a}×{b2}", 1)
            except:
                pass
    return t

def add_typo(text: str) -> str:
    if len(text) < 6:
        return text
    i = random.randint(0, len(text)-2)
    return text[:i] + text[i+1] + text[i] + text[i+2:]

def perturb_row(desc: str):
    ops = [('standard_off_by_one', tweak_standard), ('size_tweak', tweak_size), ('random_typo', add_typo)]
    k, fn = random.choice(ops)
    return k, fn(desc)

orig = df.copy()
demo = df.copy()
changes, modified = [], []
for i in range(len(demo)):
    if random.random() < 0.6:
        k, nd = perturb_row(demo.loc[i, 'description'])
        demo.loc[i, 'description'] = nd
        modified.append(True)
        changes.append(k)
    else:
        modified.append(False)
        changes.append(None)

demo['perturbed'] = modified
demo['change_type'] = changes
demo[['sku','description','perturbed','change_type']]

Unnamed: 0,sku,description,perturbed,change_type
0,PROD-01,COMBINATIONS,False,
1,PROD-02,"Overall size: 70⅞×16½×29⅛"". Overall size: 47¼×...",True,size_tweak
2,894.888.13-03,This combination: 894.888.13 This combination:...,True,standard_off_by_one
3,102.458.46-04,"BESTÅ Frame 47¼×15¾×25¼"", White 102.458.46 1 p...",False,
4,302.458.50-05,"BESTÅ Frame 23½×15¾×25¼"", White 302.458.50 1 p...",False,
5,405.409.02-06,"GLASSVIK Glass door 23½×25⅛"", White/clear glas...",False,
6,102.935.64-07,"STUBBARP Legs, White, 2pk 102.935.64 2 pcs BES...",True,size_tweak
7,802.612.58-08,"BESTÅ Soft-closing/push-open hinges, 2pk 802.6...",True,standard_off_by_one
8,602.955.32-09,"BESTÅ Glass shelf 22×14⅛"", Glass 602.955.32 6 ...",True,random_typo
9,PROD-10,"Overall size: 47¼×16½×29⅛"". Overall size: 70⅞×...",True,standard_off_by_one


## GPT4All local model (no keys) + JSON diff

In [7]:
def prompt_from_messages(messages):
    parts = []
    for m in messages:
        role = m.get('role','user')
        parts.append(f"{role.upper()}: {m.get('content','')}")
    parts.append('ASSISTANT:')
    return '\n'.join(parts)

def call_gpt4all(messages, max_tokens=512, temp=0.1):
    model = GPT4All(GPT4ALL_MODEL)
    try:
        out = model.chat_completion(messages)
        return out['choices'][0]['message']['content']
    except Exception:
        # fallback to simple generate
        with model.chat_session():
            return model.generate(prompt_from_messages(messages), max_tokens=max_tokens, temp=temp)

class DiffReport(BaseModel):
    is_changed: bool = False
    differences: list = []
    confidence: float = Field(ge=0.0, le=1.0, default=0.5)
    notes: str | None = None

SYSTEM = 'Compare two product lines. Output ONLY valid JSON: {"is_changed":bool,"differences":list of short strings,"confidence":0-1,"notes":optional}.'
USER_TMPL = 'Original: {orig}\nCurrent: {curr}\nRules: JSON only. If unsure, set is_changed=false with low confidence.'

reports = []
for i in range(len(demo)):
    orig_desc = str(orig.loc[i,'description'])
    curr_desc = str(demo.loc[i,'description'])
    messages = [
        {"role":"system","content": SYSTEM},
        {"role":"user","content": USER_TMPL.format(orig=orig_desc, curr=curr_desc)},
    ]
    raw = call_gpt4all(messages)
    start, end = raw.find('{'), raw.rfind('}')
    if start >= 0 and end > start:
        raw = raw[start:end+1]
    try:
        data = json.loads(raw)
        rep = DiffReport(**data)
    except Exception:
        rep = DiffReport(is_changed=False, differences=['LLM parse error'], confidence=0.2)
    reports.append(rep.model_dump())

df_out = pd.concat([demo.reset_index(drop=True), pd.DataFrame(reports)], axis=1)
df_out[['sku','perturbed','change_type','description','is_changed','differences','confidence','notes']]

Downloading: 100%|██████████| 1.98G/1.98G [00:21<00:00, 94.1MiB/s]
Verifying: 100%|██████████| 1.98G/1.98G [00:04<00:00, 427MiB/s]


Unnamed: 0,sku,perturbed,change_type,description,is_changed,differences,confidence,notes
0,PROD-01,False,,COMBINATIONS,False,[LLM parse error],0.2,
1,PROD-02,True,size_tweak,"Overall size: 70⅞×16½×29⅛"". Overall size: 47¼×...",True,[],0.0,
2,894.888.13-03,True,standard_off_by_one,This combination: 894.888.13 This combination:...,False,[LLM parse error],0.2,
3,102.458.46-04,False,,"BESTÅ Frame 47¼×15¾×25¼"", White 102.458.46 1 p...",True,"[Frame 47¼×15¾×25¼, White 102.458.46, White 70...",0.0,
4,302.458.50-05,False,,"BESTÅ Frame 23½×15¾×25¼"", White 302.458.50 1 p...",False,[LLM parse error],0.2,
5,405.409.02-06,False,,"GLASSVIK Glass door 23½×25⅛"", White/clear glas...",True,"[GLASSVIK Glass door 23½×25⅛, White/clear glas...",0.0,
6,102.935.64-07,True,size_tweak,"STUBBARP Legs, White, 2pk 102.935.64 2 pcs BES...",True,"[Soft-closing/push-open hinges, BESTÅ Soft-clo...",0.0,
7,802.612.58-08,True,standard_off_by_one,"BESTÅ Soft-closing/push-open hinges, 2pk 802.6...",True,"[Soft-closing/push-open hinges, Legs]",0.0,
8,602.955.32-09,True,random_typo,"BESTÅ Glass shelf 22×14⅛"", Glass 602.955.32 6 ...",False,"[<span style='color:gray'>Gary</span>, <span s...",0.0,
9,PROD-10,True,standard_off_by_one,"Overall size: 47¼×16½×29⅛"". Overall size: 70⅞×...",True,"[47/16/29\E0, 70/16/15/3]",0.0,


## Export results

In [9]:
from datetime import datetime
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_path = f'/content/zero_setup_ikea_diff_{ts}.csv'
xlsx_path = f'/content/zero_setup_ikea_diff_{ts}.xlsx'
df_out.to_csv(csv_path, index=False)
df_out.to_excel(xlsx_path, index=False)
print('Saved:', csv_path)
print('Saved:', xlsx_path)

Saved: /content/zero_setup_ikea_diff_20250812_092047.csv
Saved: /content/zero_setup_ikea_diff_20250812_092047.xlsx


### Notes
- This demo is for educational/testing purposes. Use public PDFs or ones you have permission to share.
- The local model is small; JSON accuracy improves if you re-run or try a slightly larger GPT4All model string.
- Want to remove randomness? Set the perturbation probability to 0 or comment that section.