# LightGBM Ads Tutorial

End-to-end tutorial using synthetic keyword-ads performance data.

**Models covered:**
1. CTR prediction (regression)
2. Conversion prediction (binary classification)
3. Keyword ranking function (score-based)
4. Learning-to-Rank with LambdaMART (group-split version)
5. Feature importance

## 0) Install dependencies

In [1]:
# !pip install lightgbm scikit-learn pandas numpy

## 1) Create synthetic ads dataset

Each row represents one `(given_word, keyword)` pair with features:
- `similarity` – cosine-like similarity between the two words
- `competition`, `impressions`, `clicks`, `cpc`, `cost`, `device`, `hour`

Targets:
- `ctr` – click-through rate (regression)
- `has_conversion` – did it convert at least once? (binary classification)

In [2]:
import json
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Load word lists from files
with open("given_words.json") as f:
    given_words = np.array(json.load(f))

with open("keywords.json") as f:
    keywords = np.array(json.load(f))

print(f"given_words : {len(given_words)}")
print(f"keywords    : {len(keywords)}")

def make_ads_dataset(n=100_000):
    given = rng.choice(given_words, size=n)
    kw    = rng.choice(keywords, size=n)

    # Similarity: random base (no hardcoded pair boosts with large vocabularies)
    similarity = rng.uniform(0.05, 0.95, size=n)

    impressions = rng.integers(50, 20000, size=n)
    device      = rng.choice(["mobile", "desktop"], size=n, p=[0.7, 0.3])
    hour        = rng.integers(0, 24, size=n)

    competition = rng.uniform(0.1, 1.0, size=n)
    cpc = np.clip(
        0.2 + 2.0 * competition + 0.5 * (1 - similarity) + rng.normal(0, 0.15, size=n),
        0.05, None
    )

    device_boost = np.where(device == "mobile", 0.02, 0.0)
    hour_boost   = np.where((hour >= 19) & (hour <= 23), 0.01, 0.0)
    ctr = np.clip(
        0.01 + 0.10 * similarity + device_boost + hour_boost + rng.normal(0, 0.01, size=n),
        0.0005, 0.30
    )

    clicks = rng.binomial(impressions, p=ctr)
    cost   = clicks * cpc

    conv_p = 1 / (1 + np.exp(-(-2.0 + 4.0 * similarity - 0.4 * cpc)))
    conversions    = rng.binomial(np.maximum(clicks, 1), p=np.clip(conv_p, 0.0001, 0.8))
    has_conversion = (conversions > 0).astype(int)

    df = pd.DataFrame({
        "given_word":     given,
        "keyword":        kw,
        "similarity":     similarity,
        "competition":    competition,
        "impressions":    impressions,
        "clicks":         clicks,
        "cpc":            cpc,
        "cost":           cost,
        "device":         device,
        "hour":           hour,
        "ctr":            np.where(impressions > 0, clicks / impressions, 0.0),
        "has_conversion": has_conversion,
        "conversions":    conversions,
    })
    return df

df = make_ads_dataset(100_000)
print(f"\nDataset shape: {df.shape}")
df.head()

given_words : 50
keywords    : 1030

Dataset shape: (100000, 13)


Unnamed: 0,given_word,keyword,similarity,competition,impressions,clicks,cpc,cost,device,hour,ctr,has_conversion,conversions
0,watch,vacuum for pet hair,0.894986,0.901347,15103,1514,2.182333,3304.051789,desktop,1,0.100245,1,1035
1,supplement,slide guitar,0.90286,0.538472,8103,742,1.105999,820.65149,desktop,7,0.091571,1,566
2,mattress,horseshoe necklace,0.532406,0.753773,18052,1179,1.812916,2137.427676,desktop,4,0.065311,1,395
3,sunglasses,monitor arm,0.275778,0.45805,7866,589,1.53456,903.855895,mobile,19,0.074879,1,115
4,sunglasses,pixel phone,0.715391,0.758914,8275,796,1.624289,1292.933963,desktop,6,0.096193,1,427


In [3]:
df.describe()

Unnamed: 0,similarity,competition,impressions,clicks,cpc,cost,hour,ctr,has_conversion,conversions
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.499401,0.548108,10042.04323,763.56243,1.545873,1147.029746,11.50184,0.076045,0.99547,347.76208
std,0.259928,0.259939,5767.395724,556.924017,0.556984,942.280951,6.902771,0.029938,0.067153,388.698936
min,0.05002,0.100002,50.0,0.0,0.060418,0.0,0.0,0.0,0.0,0.0
25%,0.273222,0.323529,5028.0,308.0,1.094631,421.735371,6.0,0.053116,1.0,61.0
50%,0.498087,0.546702,10075.0,654.0,1.5422,907.503238,12.0,0.075877,1.0,191.0
75%,0.724805,0.77335,15048.0,1120.0,1.997313,1636.166584,17.0,0.099104,1.0,510.0
max,0.949998,0.999993,19999.0,3061.0,3.097405,6457.487686,23.0,0.272727,1.0,2417.0


## 2) Prepare features

LightGBM handles categorical features natively when they are `pandas.Categorical` dtype.

In [4]:
from sklearn.model_selection import train_test_split

FEATURE_COLS = [
    "given_word", "keyword", "similarity", "competition",
    "impressions", "clicks", "cpc", "cost", "device", "hour"
]
CAT_COLS = ["given_word", "keyword", "device"]

X = df[FEATURE_COLS].copy()
for c in CAT_COLS:
    X[c] = X[c].astype("category")

y_ctr  = df["ctr"].values
y_conv = df["has_conversion"].values

X_train, X_test, y_ctr_train, y_ctr_test = train_test_split(
    X, y_ctr, test_size=0.2, random_state=42
)
# reuse the same split indices for the conversion target
y_conv_train = y_conv[X_train.index]
y_conv_test  = y_conv[X_test.index]

print(f"Train: {X_train.shape}  |  Test: {X_test.shape}")
print(f"Conversion rate (train): {y_conv_train.mean():.3f}")

Train: (80000, 10)  |  Test: (20000, 10)
Conversion rate (train): 0.996


## 3) Model A — CTR prediction (regression)

CTR is continuous and bounded in (0, 1). We weight each sample by `impressions` so high-volume rows have more influence.

In [5]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

reg = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

reg.fit(
    X_train, y_ctr_train,
    sample_weight=X_train["impressions"],
    eval_set=[(X_test, y_ctr_test)],
    eval_sample_weight=[X_test["impressions"]],
    eval_metric="l2",
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False),
               lgb.log_evaluation(period=200)],
)

pred_ctr = reg.predict(X_test)
rmse = mean_squared_error(y_ctr_test, pred_ctr) ** 0.5
print(f"\nCTR RMSE : {rmse:.6f}")
print(f"Best iter: {reg.best_iteration_}")

[200]	valid_0's l2: 1.00769e-05
[400]	valid_0's l2: 4.18689e-06
[600]	valid_0's l2: 2.45286e-06
[800]	valid_0's l2: 1.8003e-06
[1000]	valid_0's l2: 1.52953e-06
[1200]	valid_0's l2: 1.39242e-06
[1400]	valid_0's l2: 1.32024e-06
[1600]	valid_0's l2: 1.27857e-06
[1800]	valid_0's l2: 1.25081e-06
[2000]	valid_0's l2: 1.23382e-06

CTR RMSE : 0.002305
Best iter: 1999


## 4) Model B — Conversion prediction (binary classification)

In [6]:
from sklearn.metrics import roc_auc_score, average_precision_score

clf = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

clf.fit(
    X_train, y_conv_train,
    eval_set=[(X_test, y_conv_test)],
    eval_metric="auc",
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False),
               lgb.log_evaluation(period=200)],
)

proba = clf.predict_proba(X_test)[:, 1]
print(f"\nAUC   : {roc_auc_score(y_conv_test, proba):.4f}")
print(f"PR-AUC: {average_precision_score(y_conv_test, proba):.4f}")


AUC   : 0.9969
PR-AUC: 1.0000


## 5) Rank keywords for a given word

For a new `given_word`, score a list of candidate keywords using:
- `pred_ctr` from the regression model
- `pred_conv_prob` from the classifier
- `score = pred_ctr × pred_conv_prob` (customize to ROAS, profit, etc.)

In [None]:
def rank_keywords_for_given(given_word: str, candidates: dict, base_features: dict) -> pd.DataFrame:
    """Score and rank candidate keywords for a given word.

    Parameters
    ----------
    given_word    : The query / seed word.
    candidates    : Dict of {keyword: similarity_score} pairs.
    base_features : Dict of feature values shared across all candidates
                    (all cols except given_word, keyword, similarity).

    Returns
    -------
    DataFrame sorted by score descending.
    """
    rows = [
        {**base_features, "given_word": given_word, "keyword": kw, "similarity": sim}
        for kw, sim in candidates.items()
    ]
    Xcand = pd.DataFrame(rows)[FEATURE_COLS]
    for c in CAT_COLS:
        Xcand[c] = Xcand[c].astype("category")

    ctr_hat  = reg.predict(Xcand)
    conv_hat = clf.predict_proba(Xcand)[:, 1]

    return pd.DataFrame({
        "given_word":     given_word,
        "keyword":        list(candidates.keys()),
        "similarity":     list(candidates.values()),
        "pred_ctr":       ctr_hat,
        "pred_conv_prob": conv_hat,
        "score":          ctr_hat * conv_hat,
    }).sort_values("score", ascending=False).reset_index(drop=True)


# Per-keyword similarity scores (in practice, compute these from word embeddings)
candidates = {
    "white sneakers":    0.95,
    "running shoes":     0.82,
    "canvas shoes":      0.74,
    "hiking boots":      0.55,
    "yoga mat":          0.20,
    "leather wallet":    0.10,
    "wireless earbuds":  0.08,
    "gaming mouse":      0.05,
}

base = {
    "competition": 0.6,
    "impressions": 5000,
    "clicks":      0,
    "cpc":         2.0,
    "cost":        0.0,
    "device":      "mobile",
    "hour":        21,
}

ranked = rank_keywords_for_given("sneakers", candidates, base)
ranked

## 6) Learning-to-Rank with LambdaMART

A proper LambdaMART setup requires:
1. **Group-based train/test split** — keep all rows for a `given_word` in the same split.
2. **Group sizes array** — number of candidate keywords per query, in order.
3. **Relevance labels** — here we use `ctr`; in production use ROAS or conversions.

In [8]:
from lightgbm import LGBMRanker

# ── 6a) Group-based train/test split ──────────────────────────────────────────
unique_given = df["given_word"].unique()
rng_split    = np.random.default_rng(0)
rng_split.shuffle(unique_given)

split_idx    = int(len(unique_given) * 0.8)
train_words  = set(unique_given[:split_idx])
test_words   = set(unique_given[split_idx:])

df_rank = df.sort_values("given_word").copy()

mask_train = df_rank["given_word"].isin(train_words)
df_r_train = df_rank[mask_train].copy()
df_r_test  = df_rank[~mask_train].copy()

print(f"Ranker train rows: {len(df_r_train)}  |  test rows: {len(df_r_test)}")
print(f"Train given_words: {sorted(train_words)}")
print(f"Test  given_words: {sorted(test_words)}")

Ranker train rows: 80092  |  test rows: 19908
Train given_words: ['boots', 'camera', 'camping', 'candle', 'coffee', 'desk', 'dress', 'fishing', 'gaming', 'gift', 'guitar', 'handbag', 'headphones', 'jacket', 'jewelry', 'keyboard', 'laptop', 'luggage', 'makeup', 'mattress', 'monitor', 'necklace', 'pants', 'perfume', 'pet', 'phone', 'plant', 'printer', 'protein', 'running', 'shoes', 'skincare', 'sneakers', 'sunglasses', 'supplement', 'toy', 'vitamin', 'wallet', 'watch', 'yoga']
Test  given_words: ['baby', 'backpack', 'bicycle', 'blender', 'book', 'ring', 'shirt', 'sofa', 'tent', 'vacuum']


In [9]:
# ── 6b) Build feature matrices and group size arrays ──────────────────────────
def bin_ctr(ctr_values: np.ndarray, n_bins: int = 5) -> np.ndarray:
    """Convert continuous CTR into integer relevance grades (0 to n_bins-1)."""
    bins = np.quantile(ctr_values, np.linspace(0, 1, n_bins + 1))
    bins = np.unique(bins)  # remove duplicates if any
    return np.digitize(ctr_values, bins[1:-1]).astype(int)

def build_rank_arrays(subset: pd.DataFrame):
    Xr = subset[FEATURE_COLS].copy()
    for c in CAT_COLS:
        Xr[c] = Xr[c].astype("category")
    y      = bin_ctr(subset["ctr"].values)   # integer grades required by LambdaMART
    groups = subset.groupby("given_word", sort=True).size().tolist()
    return Xr, y, groups

Xr_train, yr_train, groups_train = build_rank_arrays(df_r_train)
Xr_test,  yr_test,  groups_test  = build_rank_arrays(df_r_test)

print(f"Label range: {yr_train.min()} – {yr_train.max()}  (grades 0–4)")
print(f"Group sizes (train): {groups_train}")
print(f"Group sizes (test) : {groups_test}")

Label range: 0 – 4  (grades 0–4)
Group sizes (train): [2077, 1977, 2026, 1925, 2017, 1916, 1993, 2019, 2022, 1946, 1978, 1976, 2089, 1906, 2006, 1985, 2069, 2056, 1932, 2027, 2093, 2046, 1986, 2097, 2021, 1973, 2031, 2016, 2092, 1976, 1963, 2021, 1938, 1998, 1975, 1984, 1962, 1979, 1967, 2032]
Group sizes (test) : [2002, 1956, 2011, 2028, 1987, 1920, 2081, 1967, 1957, 1999]


In [10]:
# ── 6c) Train LambdaMART ranker ───────────────────────────────────────────────
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)

ranker.fit(
    Xr_train, yr_train,
    group=groups_train,
    eval_set=[(Xr_test, yr_test)],
    eval_group=[groups_test],
    eval_at=[3, 5, 10],
    categorical_feature=CAT_COLS,
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False),
               lgb.log_evaluation(period=200)],
)

print(f"\nBest iteration: {ranker.best_iteration_}")


Best iteration: 2


In [11]:
# ── 6d) Inspect ranker scores for one test given_word ─────────────────────────
sample_word = list(test_words)[0]
df_sample   = df_r_test[df_r_test["given_word"] == sample_word].copy()

Xs = df_sample[FEATURE_COLS].copy()
for c in CAT_COLS:
    Xs[c] = Xs[c].astype("category")

df_sample["ranker_score"] = ranker.predict(Xs)
df_sample[["given_word", "keyword", "ctr", "ranker_score"]] \
    .sort_values("ranker_score", ascending=False) \
    .reset_index(drop=True)

Unnamed: 0,given_word,keyword,ctr,ranker_score
0,ring,backpack patch,0.114436,0.117172
1,ring,yoga mat,0.127631,0.117172
2,ring,bluetooth keyboard,0.115581,0.117172
3,ring,nursing pillow,0.110617,0.117172
4,ring,yoga towel,0.126879,0.117172
...,...,...,...,...
1915,ring,birthstone ring,0.073955,-0.113983
1916,ring,king mattress,0.039341,-0.113983
1917,ring,espadrilles,0.023457,-0.113983
1918,ring,yoga pants,0.056410,-0.113983


## 7) Feature importance

Using **gain** (total reduction in loss attributed to each feature).

In [12]:
def show_importance(model, title: str):
    fi = pd.DataFrame({
        "feature":    model.feature_name_,
        "importance": model.booster_.feature_importance(importance_type="gain"),
    }).sort_values("importance", ascending=False).reset_index(drop=True)
    print(f"\n=== {title} ===")
    print(fi.to_string(index=False))
    return fi

fi_reg    = show_importance(reg,    "CTR Regression")
fi_clf    = show_importance(clf,    "Conversion Classifier")
fi_ranker = show_importance(ranker, "LambdaMART Ranker")


=== CTR Regression ===
    feature   importance
 similarity 6.901257e+06
     clicks 2.753261e+06
impressions 1.182219e+06
     device 7.017221e+05
       cost 1.426487e+05
        cpc 1.163797e+05
       hour 9.276968e+04
competition 3.552020e+04
    keyword 1.582057e+04
 given_word 4.588684e+03

=== Conversion Classifier ===
    feature   importance
     clicks 42370.335656
 similarity 13723.701457
        cpc  5637.854656
       cost  4073.249611
competition  2308.126128
impressions  1812.622571
       hour  1767.731493
    keyword   964.981350
 given_word   267.031436
     device   229.277286

=== LambdaMART Ranker ===
    feature  importance
     clicks  607.021876
 similarity  401.939956
     device   60.238510
        cpc   33.235327
competition   28.903426
       hour   12.042174
    keyword    8.719098
impressions    8.193921
       cost    5.897397
 given_word    0.000000


## Quick-reference: choosing the right setup

| Success metric | Target variable | LightGBM objective | Eval metric |
|---|---|---|---|
| CTR | `ctr` (float) | `regression` | RMSE / MAE |
| Conversion | `has_conversion` (0/1) | `binary` | AUC / PR-AUC |
| ROAS / Profit | continuous value | `regression` or `tweedie` | RMSE |
| Click volume | `clicks` (count) | `poisson` | — |
| Keyword ranking | any relevance label | `lambdarank` | NDCG@k |

## 8) Real similarity with sentence embeddings

Instead of hardcoding similarity values, compute them from a pretrained embedding model.
`all-MiniLM-L6-v2` runs locally, is fast, and understands semantic meaning.

In [None]:
# !pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load model once — downloads ~80MB on first run
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_similarities(given_word: str, keywords: list) -> dict:
    """Compute cosine similarity between given_word and each keyword."""
    texts     = [given_word] + keywords
    vecs      = embed_model.encode(texts, normalize_embeddings=True)
    given_vec = vecs[0:1]
    kw_vecs   = vecs[1:]
    scores    = cosine_similarity(given_vec, kw_vecs)[0]
    return dict(zip(keywords, scores.tolist()))


given_word = "sneakers"
keywords   = [
    "white sneakers", "running shoes", "canvas shoes",
    "hiking boots", "yoga mat", "leather wallet",
    "wireless earbuds", "gaming mouse",
]

candidates = compute_similarities(given_word, keywords)

for kw, sim in sorted(candidates.items(), key=lambda x: -x[1]):
    print(f"{sim:.3f}  {kw}")

In [None]:
# Plug real similarities into the ranker
base = {
    "competition": 0.6,
    "impressions": 5000,
    "clicks":      0,
    "cpc":         2.0,
    "cost":        0.0,
    "device":      "mobile",
    "hour":        21,
}

ranked = rank_keywords_for_given(given_word, candidates, base)
ranked