inference.py

In [1]:
'''import json
import joblib
import numpy as np
from pathlib import Path'''

In [2]:
'''from preprocess import clean_text, keyword_boost'''

In [3]:
'''# ---------------------------------------------------------
# Paths
# ---------------------------------------------------------
try:
    FILE_DIR = Path(__file__).resolve().parent
except NameError:
    FILE_DIR = Path.cwd()'''

In [4]:
'''BASE_DIR = FILE_DIR.parents[0]
MODEL_DIR = BASE_DIR / "saved_model"
CONFIG_DIR = BASE_DIR / "config"'''

In [5]:
'''# ---------------------------------------------------------
# Load model + vectorizer
# ---------------------------------------------------------
model = joblib.load(MODEL_DIR / "svm_model.pkl")       # Calibrated SVM
tfidf = joblib.load(MODEL_DIR / "tfidf_vectorizer.pkl")'''

In [6]:
'''# ---------------------------------------------------------
# Load taxonomy categories
# ---------------------------------------------------------
with open(CONFIG_DIR / "taxonomy.json") as f:
    TAXONOMY = json.load(f)["categories"]'''

In [7]:
'''# ---------------------------------------------------------
# Prediction with Rule Boost + Calibrated Probabilities
# ---------------------------------------------------------
def predict_with_confidence(merchant_name: str):
    """
    Returns (category, confidence)
    Confidence is REAL probability from calibrated SVM.
    """
    cleaned = clean_text(merchant_name)

    # -----------------------------------
    # 1. Rule-based boosting (0.95 fixed)
    # -----------------------------------
    rule_cat = keyword_boost(cleaned)
    if rule_cat:
        return rule_cat, 0.95   # High confidence for known brands

    # -----------------------------------
    # 2. ML Model prediction
    # -----------------------------------
    vec = tfidf.transform([cleaned])

    # calibrated SVM → predict probabilities
    probs = model.predict_proba(vec)[0]

    best_idx = probs.argmax()
    best_cat = model.classes_[best_idx]
    best_conf = float(probs[best_idx])

    # -----------------------------------
    # 3. Confidence stabilizer
    # -----------------------------------
    if best_conf < 0.60:
        best_conf = best_conf + 0.25  # boost low probabilities

    if best_conf > 0.98:
        best_conf = 0.98

    return best_cat, best_conf'''

In [1]:
# app/inference.py
import json
import joblib
import numpy as np
from pathlib import Path

from preprocess import clean_text, keyword_boost

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
# Directory containing this module. Interactive sessions (notebook / REPL)
# define no __file__, so the current working directory is used instead.
try:
    FILE_DIR = Path(__file__).resolve().parent
except NameError:
    FILE_DIR = Path.cwd()

# Model artifacts and configuration live one level above FILE_DIR.
BASE_DIR = FILE_DIR.parent
MODEL_DIR = BASE_DIR.joinpath("saved_model")
CONFIG_DIR = BASE_DIR.joinpath("config")

# ------------------------------------------------------------
# Load model + vectorizer
# ------------------------------------------------------------
# Deserialize the trained classifier and the fitted TF-IDF vectorizer that
# were saved next to each other under MODEL_DIR.
model = joblib.load(MODEL_DIR.joinpath("svm_model.pkl"))  # OneVsRestClassifier(LinearSVC)
vectorizer = joblib.load(MODEL_DIR.joinpath("tfidf_vectorizer.pkl"))

# Category labels come from the "categories" key of taxonomy.json; when the
# key is absent, the classes the model exposes are used instead.
with (CONFIG_DIR / "taxonomy.json").open("r", encoding="utf-8") as f:
    TAXONOMY = json.load(f).get("categories", list(model.classes_))


# ------------------------------------------------------------
# Prediction with Softmax-based Confidence
# ------------------------------------------------------------
def predict_with_confidence(merchant_name: str):
    """Classify a merchant name and return ``(category, confidence)``.

    A keyword rule match from ``keyword_boost`` wins outright and is reported
    with a fixed confidence of 0.95. Otherwise the cleaned text is vectorized,
    the model's ``decision_function`` margins are converted to a softmax
    distribution, and the highest-scoring class is returned.

    The confidence is a heuristic score (softmax over margins followed by the
    stabilizer below), not a calibrated probability: values under 0.55 are
    raised by 0.25, and the final value never exceeds 0.95.

    Args:
        merchant_name: Raw merchant text to classify.

    Returns:
        A ``(category, confidence)`` tuple where ``confidence`` is a float.
    """
    cleaned = clean_text(merchant_name)

    # 1. Rule engine (strong matches) short-circuits the model entirely.
    rule_category = keyword_boost(cleaned)
    if rule_category:
        return rule_category, 0.95

    # 2. ML model prediction
    vec = vectorizer.transform([cleaned])

    # decision_function yields one row per sample, i.e. a 2-D (1, n_classes)
    # array here. Flatten it so the argmax index and the probability lookup
    # below address the same axis (indexing the 2-D array by the flat argmax
    # raised IndexError / TypeError).
    scores = np.asarray(model.decision_function(vec), dtype=float).ravel()

    # A binary model reports a single margin for classes_[1]; expand it into
    # one score per class so classes_[0] can win on a negative margin.
    if scores.size == 1 and len(model.classes_) == 2:
        scores = np.array([-scores[0], scores[0]])

    # Convert margins to softmax weights; subtracting the max keeps exp()
    # from overflowing without changing the result.
    exp_scores = np.exp(scores - scores.max())
    probs = exp_scores / exp_scores.sum()

    # Best category
    idx = int(np.argmax(probs))
    cat = model.classes_[idx]
    conf = float(probs[idx])

    # 3. Stabilizer: lift weak scores, then cap everything at 0.95.
    if conf < 0.55:
        conf = min(conf + 0.25, 0.85)

    conf = min(conf, 0.95)

    return cat, conf
