# LightGBM Ads Tutorial

End-to-end tutorial using synthetic keyword-ads performance data.

**Models covered:**
1. CTR prediction (regression)
2. Conversion prediction (binary classification)
3. Keyword ranking function (score-based)
4. Learning-to-Rank with LambdaMART (group-split version)
5. Feature importance

## 0) Install dependencies

In [None]:
# !pip install lightgbm scikit-learn pandas numpy sentence-transformers

## 1) Create synthetic ads dataset

Each row represents one `(given_word, keyword)` pair with features:
- `similarity` – cosine similarity between the sentence-embedding vectors of the two words, clipped to [0, 1]
- `competition`, `impressions`, `clicks`, `cpc`, `cost`, `device`, `hour`

Targets:
- `ctr` – click-through rate (regression)
- `has_conversion` – did it convert at least once? (binary classification)

In [None]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Single seeded generator shared by every random draw in the dataset builder.
rng = np.random.default_rng(42)

# Load word lists from files.
# JSON text is UTF-8; pass the encoding explicitly so non-ASCII words are not
# mis-decoded on platforms whose default locale encoding differs.
with open("given_words.json", encoding="utf-8") as f:
    given_words = np.array(json.load(f))

with open("keywords.json", encoding="utf-8") as f:
    keywords = np.array(json.load(f))

print(f"given_words : {len(given_words)}")
print(f"keywords    : {len(keywords)}")

# ── Precompute real similarities ───────────────────────────────────────────────
print("Encoding embeddings...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every distinct word exactly once; `vec_index` maps word -> unit vector
# (normalize_embeddings=True), so later similarity lookups are plain dot products.
all_words = np.unique(np.concatenate([given_words, keywords]))
vecs      = embed_model.encode(all_words.tolist(), normalize_embeddings=True, show_progress_bar=True)
vec_index = dict(zip(all_words, vecs))

def real_similarity(given: np.ndarray, kw: np.ndarray) -> np.ndarray:
    """Return the pairwise cosine similarity of two aligned word arrays.

    Element ``i`` of the result compares ``given[i]`` with ``kw[i]``. Every
    word must already be a key of ``vec_index``; negative similarities are
    clipped to 0 so the output lies in [0, 1].
    """
    given_matrix = np.stack([vec_index[word] for word in given])
    keyword_matrix = np.stack([vec_index[word] for word in kw])
    # The stored vectors are already normalised, so the dot product of each
    # row pair is the cosine similarity.
    dots = np.sum(given_matrix * keyword_matrix, axis=1)
    return np.clip(dots, 0.0, 1.0)

# Progress marker: the embedding lookup and `real_similarity` are now defined.
print("Done.")

def make_ads_dataset(n=100_000):
    """Simulate ``n`` rows of keyword-ad performance data.

    Each row pairs a randomly drawn given word with a randomly drawn keyword
    and derives the remaining columns from their embedding similarity plus
    noise. All randomness comes from the module-level ``rng``.

    Parameters
    ----------
    n : Number of rows to generate.

    Returns
    -------
    DataFrame with columns ``given_word``, ``keyword``, ``similarity``,
    ``competition``, ``impressions``, ``clicks``, ``cpc``, ``cost``,
    ``device``, ``hour``, ``ctr`` (observed clicks / impressions),
    ``has_conversion`` (0/1) and ``conversions`` (count).
    """
    given = rng.choice(given_words, size=n)
    kw    = rng.choice(keywords, size=n)

    similarity  = real_similarity(given, kw)

    impressions = rng.integers(50, 20000, size=n)
    device      = rng.choice(["mobile", "desktop"], size=n, p=[0.7, 0.3])
    hour        = rng.integers(0, 24, size=n)
    competition = rng.uniform(0.1, 1.0, size=n)

    # Cost per click rises with competition and falls with relevance; floor at 0.05.
    cpc = np.clip(
        0.2 + 2.0 * competition + 0.5 * (1 - similarity) + rng.normal(0, 0.15, size=n),
        0.05, None
    )

    # Latent (true) CTR: driven by similarity, with small mobile and evening boosts.
    device_boost = np.where(device == "mobile", 0.02, 0.0)
    hour_boost   = np.where((hour >= 19) & (hour <= 23), 0.01, 0.0)
    ctr = np.clip(
        0.01 + 0.10 * similarity + device_boost + hour_boost + rng.normal(0, 0.01, size=n),
        0.0005, 0.30
    )

    clicks = rng.binomial(impressions, p=ctr)
    cost   = clicks * cpc

    # Per-click conversion probability (logistic in similarity and cpc).
    conv_p = 1 / (1 + np.exp(-(-2.0 + 4.0 * similarity - 0.4 * cpc)))
    # A conversion requires a click, so the number of trials is exactly
    # `clicks`; a zero-click row gets zero trials and therefore zero conversions.
    conversions    = rng.binomial(clicks, p=np.clip(conv_p, 0.0001, 0.8))
    has_conversion = (conversions > 0).astype(int)

    return pd.DataFrame({
        "given_word":     given,
        "keyword":        kw,
        "similarity":     similarity,
        "competition":    competition,
        "impressions":    impressions,
        "clicks":         clicks,
        "cpc":            cpc,
        "cost":           cost,
        "device":         device,
        "hour":           hour,
        # Observed CTR (not the latent one above); impressions are >= 50 here,
        # the guard only protects against a zero denominator.
        "ctr":            np.where(impressions > 0, clicks / impressions, 0.0),
        "has_conversion": has_conversion,
        "conversions":    conversions,
    })

# Build the synthetic table once; later cells read from `df`.
df = make_ads_dataset(n=100_000)
print(f"\nDataset shape: {df.shape}")
df.head()

In [None]:
# Summary statistics of the generated dataset (notebook display output).
df.describe()

## 2) Prepare features

LightGBM handles categorical features natively when the columns have the pandas `category` dtype.

In [None]:
from sklearn.model_selection import train_test_split

# `clicks` and `cost` are deliberately excluded: the dataset defines
# ctr = clicks / impressions and cost = clicks * cpc, so using them as inputs
# leaks the target, and they are not known before an ad is served.
FEATURE_COLS = [
    "given_word", "keyword", "similarity", "competition",
    "impressions", "cpc", "device", "hour",
]
CAT_COLS = ["given_word", "keyword", "device"]

# Cast before splitting so train and test share the same category sets.
X = df[FEATURE_COLS].copy()
for c in CAT_COLS:
    X[c] = X[c].astype("category")

y_ctr  = df["ctr"].values
y_conv = df["has_conversion"].values

# Split features and both targets in one call so the rows stay aligned
# regardless of the DataFrame's index (no label-vs-position indexing).
X_train, X_test, y_ctr_train, y_ctr_test, y_conv_train, y_conv_test = train_test_split(
    X, y_ctr, y_conv, test_size=0.2, random_state=42
)

print(f"Train: {X_train.shape}  |  Test: {X_test.shape}")
print(f"Conversion rate (train): {y_conv_train.mean():.3f}")

## 3) Model A — CTR prediction (regression)

Observed CTR (`clicks / impressions`) is continuous and bounded in [0, 1]. We weight each sample by `impressions` so that high-volume rows, whose CTR estimate is less noisy, have more influence.

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Gradient-boosted regressor for observed CTR.
reg = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    # Row subsampling is only active when the bagging frequency is non-zero;
    # with the default subsample_freq=0 the subsample=0.8 setting is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

# NOTE: the test split doubles as the early-stopping validation set, so the
# RMSE reported below is slightly optimistic.
reg.fit(
    X_train, y_ctr_train,
    sample_weight=X_train["impressions"],
    eval_set=[(X_test, y_ctr_test)],
    eval_sample_weight=[X_test["impressions"]],
    eval_metric="l2",
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False),
               lgb.log_evaluation(period=200)],
)

pred_ctr = reg.predict(X_test)
# Unweighted RMSE on the held-out rows (training itself is impression-weighted).
rmse = mean_squared_error(y_ctr_test, pred_ctr) ** 0.5
print(f"\nCTR RMSE : {rmse:.6f}")
print(f"Best iter: {reg.best_iteration_}")

## 4) Model B — Conversion prediction (binary classification)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Gradient-boosted binary classifier for "at least one conversion".
clf = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    # Row subsampling is only active when the bagging frequency is non-zero;
    # with the default subsample_freq=0 the subsample=0.8 setting is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

# NOTE: the test split doubles as the early-stopping validation set, so the
# metrics reported below are slightly optimistic.
clf.fit(
    X_train, y_conv_train,
    eval_set=[(X_test, y_conv_test)],
    eval_metric="auc",
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False),
               lgb.log_evaluation(period=200)],
)

# Column 1 of predict_proba is the probability of the positive class.
proba = clf.predict_proba(X_test)[:, 1]
print(f"\nAUC   : {roc_auc_score(y_conv_test, proba):.4f}")
print(f"PR-AUC: {average_precision_score(y_conv_test, proba):.4f}")

## 5) Rank keywords for a given word

For a new `given_word`, score a list of candidate keywords using:
- `pred_ctr` from the regression model
- `pred_conv_prob` from the classifier
- `score = pred_ctr × pred_conv_prob` (customize to ROAS, profit, etc.)

In [None]:
def rank_keywords_for_given(given_word: str, candidates: dict, base_features: dict) -> pd.DataFrame:
    """Score and rank candidate keywords for a given word.

    Builds one feature row per candidate, predicts CTR with ``reg`` and
    conversion probability with ``clf``, and ranks by their product.

    Parameters
    ----------
    given_word    : The query / seed word.
    candidates    : Dict of {keyword: similarity_score} pairs. May be empty.
    base_features : Dict of feature values shared across all candidates
                    (all cols except given_word, keyword, similarity).
                    Keys that are not in ``FEATURE_COLS`` are ignored.

    Returns
    -------
    DataFrame with columns ``given_word``, ``keyword``, ``similarity``,
    ``pred_ctr``, ``pred_conv_prob`` and ``score``, sorted by score
    descending. Empty (columns only) when ``candidates`` is empty.
    """
    output_columns = [
        "given_word", "keyword", "similarity",
        "pred_ctr", "pred_conv_prob", "score",
    ]
    # Nothing to score: return an empty frame with the usual columns instead of
    # failing on the feature-column selection below.
    if not candidates:
        return pd.DataFrame(columns=output_columns)

    rows = [
        {**base_features, "given_word": given_word, "keyword": kw, "similarity": sim}
        for kw, sim in candidates.items()
    ]
    # astype returns a new frame, so no assignment into a column-selected slice.
    Xcand = pd.DataFrame(rows)[FEATURE_COLS].astype({c: "category" for c in CAT_COLS})

    ctr_hat  = reg.predict(Xcand)
    conv_hat = clf.predict_proba(Xcand)[:, 1]

    return pd.DataFrame({
        "given_word":     given_word,
        "keyword":        list(candidates.keys()),
        "similarity":     list(candidates.values()),
        "pred_ctr":       ctr_hat,
        "pred_conv_prob": conv_hat,
        "score":          ctr_hat * conv_hat,
    }).sort_values("score", ascending=False).reset_index(drop=True)


# Hand-picked similarity per candidate keyword for the demo
# (in practice, compute these from word embeddings).
candidates = {
    "white sneakers": 0.95,
    "running shoes": 0.82,
    "canvas shoes": 0.74,
    "hiking boots": 0.55,
    "yoga mat": 0.20,
    "leather wallet": 0.10,
    "wireless earbuds": 0.08,
    "gaming mouse": 0.05,
}

# Context shared by every candidate row.
# NOTE(review): clicks/cost are zero here, presumably because they are not
# known before the ad is served — confirm.
base = dict(
    competition=0.6,
    impressions=5000,
    clicks=0,
    cpc=2.0,
    cost=0.0,
    device="mobile",
    hour=21,
)

ranked = rank_keywords_for_given(given_word="sneakers", candidates=candidates, base_features=base)
ranked

## 6) Learning-to-Rank with LambdaMART

A proper LambdaMART setup requires:
1. **Group-based train/test split** — keep all rows for a `given_word` in the same split.
2. **Group sizes array** — number of candidate keywords per query, in order.
3. **Relevance labels** — here we use `ctr`, binned into integer grades by quantile; in production use ROAS or conversions.

In [None]:
from lightgbm import LGBMRanker

# ── 6a) Group-based train/test split ──────────────────────────────────────────
# Shuffle the distinct query words with a dedicated seed, then cut 80/20 on
# words (not rows) so every row of a query lands on one side of the split.
rng_split    = np.random.default_rng(0)
unique_given = df["given_word"].unique()
rng_split.shuffle(unique_given)

split_idx   = int(len(unique_given) * 0.8)
train_words = set(unique_given[:split_idx])
test_words  = set(unique_given[split_idx:])

# Sorting by query keeps each query's rows contiguous, which the group-size
# arrays built later rely on.
df_rank = df.sort_values("given_word").copy()

mask_train = df_rank["given_word"].isin(train_words)
df_r_train = df_rank.loc[mask_train].copy()
df_r_test  = df_rank.loc[~mask_train].copy()

print(f"Ranker train rows: {len(df_r_train)}  |  test rows: {len(df_r_test)}")
print(f"Train given_words: {sorted(train_words)}")
print(f"Test  given_words: {sorted(test_words)}")

In [None]:
# ── 6b) Build feature matrices and group size arrays ──────────────────────────
def bin_ctr(ctr_values: np.ndarray, n_bins: int = 5) -> np.ndarray:
    """Convert continuous CTR into integer relevance grades (0 to n_bins-1).

    Grades are quantile buckets of ``ctr_values`` itself. Duplicate quantile
    edges are collapsed, so heavily tied input yields fewer distinct grades
    (constant input maps entirely to grade 0).

    Parameters
    ----------
    ctr_values : Array-like of CTR values; may be empty.
    n_bins     : Target number of quantile buckets.

    Returns
    -------
    Integer array with the same shape as ``ctr_values``.
    """
    values = np.asarray(ctr_values, dtype=float)
    # Quantiles of an empty array are undefined; there is nothing to grade.
    if values.size == 0:
        return np.zeros(values.shape, dtype=int)

    edges = np.unique(np.quantile(values, np.linspace(0, 1, n_bins + 1)))
    # Only interior edges are thresholds: dropping min and max keeps the
    # smallest value in grade 0 and the largest in the top grade.
    return np.digitize(values, edges[1:-1]).astype(int)

def build_rank_arrays(subset: pd.DataFrame):
    """Return ``(features, labels, group_sizes)`` for one ranking split.

    ``features`` holds ``FEATURE_COLS`` with ``CAT_COLS`` cast to category,
    ``labels`` are the binned CTR grades, and ``group_sizes`` lists the row
    count per ``given_word`` in sorted word order.
    """
    category_dtypes = {col: "category" for col in CAT_COLS}
    features = subset[FEATURE_COLS].astype(category_dtypes)
    # LambdaMART needs integer relevance grades rather than raw CTR.
    labels = bin_ctr(subset["ctr"].values)
    per_query = subset.groupby("given_word", sort=True).size()
    return features, labels, per_query.tolist()

# Features, graded labels and per-query group sizes for each ranking split.
Xr_train, yr_train, groups_train = build_rank_arrays(subset=df_r_train)
Xr_test, yr_test, groups_test = build_rank_arrays(subset=df_r_test)

print(f"Label range: {yr_train.min()} – {yr_train.max()}  (grades 0–4)")
print(f"Group sizes (train): {groups_train}")
print(f"Group sizes (test) : {groups_test}")

In [None]:
# ── 6c) Train LambdaMART ranker ───────────────────────────────────────────────
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    # Row subsampling is only active when the bagging frequency is non-zero;
    # with the default subsample_freq=0 the subsample=0.8 setting is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

# `group` / `eval_group` give the number of consecutive rows per query, so the
# row order of the feature matrices must match those size lists.
ranker.fit(
    Xr_train, yr_train,
    group=groups_train,
    eval_set=[(Xr_test, yr_test)],
    eval_group=[groups_test],
    eval_at=[3, 5, 10],
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False),
               lgb.log_evaluation(period=200)],
)

print(f"\nBest iteration: {ranker.best_iteration_}")

In [None]:
# ── 6d) Inspect ranker scores for one test given_word ─────────────────────────
# Pick the alphabetically first test word: iterating a set of strings has no
# stable order across interpreter runs, which would make this cell's output
# change from run to run.
sample_word = min(test_words)
df_sample   = df_r_test[df_r_test["given_word"] == sample_word].copy()

Xs = df_sample[FEATURE_COLS].copy()
for c in CAT_COLS:
    Xs[c] = Xs[c].astype("category")

# Higher ranker score means the model places the keyword higher for this query.
df_sample["ranker_score"] = ranker.predict(Xs)
df_sample[["given_word", "keyword", "ctr", "ranker_score"]] \
    .sort_values("ranker_score", ascending=False) \
    .reset_index(drop=True)

## 7) Feature importance

Using **gain** (total reduction in loss attributed to each feature).

In [None]:
def show_importance(model, title: str):
    """Print and return a fitted model's gain-based feature importances.

    Parameters
    ----------
    model : Fitted estimator exposing ``feature_name_`` and ``booster_``.
    title : Heading printed above the table.

    Returns
    -------
    DataFrame with ``feature`` and ``importance`` columns, sorted by
    importance descending with a fresh 0..n-1 index.
    """
    gains = model.booster_.feature_importance(importance_type="gain")
    table = pd.DataFrame({"feature": model.feature_name_, "importance": gains})
    fi = table.sort_values("importance", ascending=False).reset_index(drop=True)

    print(f"\n=== {title} ===")
    print(fi.to_string(index=False))
    return fi

# One gain-importance report per fitted model; each table is kept for later use.
fi_reg = show_importance(model=reg, title="CTR Regression")
fi_clf = show_importance(model=clf, title="Conversion Classifier")
fi_ranker = show_importance(model=ranker, title="LambdaMART Ranker")

## Quick-reference: choosing the right setup

| Success metric | Target variable | LightGBM objective | Eval metric |
|---|---|---|---|
| CTR | `ctr` (float) | `regression` | RMSE / MAE |
| Conversion | `has_conversion` (0/1) | `binary` | AUC / PR-AUC |
| ROAS / Profit | continuous value | `regression` or `tweedie` | RMSE |
| Click volume | `clicks` (count) | `poisson` | — |
| Keyword ranking | any relevance label | `lambdarank` | NDCG@k |