In [1]:
import sys
!{sys.executable} -m pip install -q --upgrade pip setuptools wheel
!{sys.executable} -m pip install -q "scikit-learn==0.24.2"


In [2]:
!{sys.executable} -m pip install --no-cache-dir --only-binary=:all: "xgboost==1.5.2"



In [3]:
!{sys.executable} -m pip install --no-cache-dir "lightgbm==3.3.5"



In [4]:
import sklearn
import xgboost as xgb
import lightgbm as lgb

In [5]:
!{sys.executable} -m pip install -q --force-reinstall "numpy==1.19.5"


In [6]:
!{sys.executable} -m pip install -q --force-reinstall "pandas==1.1.5"

In [7]:
import numpy as np
import pandas as pd

In [8]:
cols = [
    "impression_id",
    "clicked",
    "news_id",
    "coclick_score",
    "global_ctr",
    "global_shown_cnt",
    "global_click_cnt",
    "category",
    "subcategory",
    "title_len",
    "abstract_len"
]
train_head = pd.read_csv("rank_train.csv", header=None, names=cols, nrows=2000)
dev_head   = pd.read_csv("rank_dev.csv",   header=None, names=cols, nrows=2000)
print(train_head.columns.tolist())
print(train_head.head(3))


['impression_id', 'clicked', 'news_id', 'coclick_score', 'global_ctr', 'global_shown_cnt', 'global_click_cnt', 'category', 'subcategory', 'title_len', 'abstract_len']
   impression_id  clicked news_id  coclick_score  global_ctr  \
0              1        0  N35729       0.217019       15418   
1              1        1  N55689       0.235654       18315   
2              2        0  N39317       0.081494        6344   

   global_shown_cnt global_click_cnt      category  subcategory  title_len  \
0              3346             news        newsus           68        100   
1              4316           sports  football_nfl           78        145   
2               517             news  newspolitics           69        129   

   abstract_len  
0           NaN  
1           NaN  
2           NaN  


In [9]:
dev_head.head()

Unnamed: 0,impression_id,clicked,news_id,coclick_score,global_ctr,global_shown_cnt,global_click_cnt,category,subcategory,title_len,abstract_len
0,1,0,N51470,0.0,0,0,weather,weathertopstories,68,180,
1,1,0,N6400,0.0,1,0,sports,more_sports,83,129,
2,1,0,N21679,0.0,0,0,sports,football_nfl,60,133,
3,1,0,N42844,0.0,8,0,tv,tvnews,80,184,
4,1,0,N58098,0.0,3,0,foodanddrink,quickandeasy,42,95,


loaded smaller sample sets with 2000 rows due to ram constraints in my college provided jupyter notebook.

In [10]:
print("train_head columns:")
print(list(train_head.columns))

print("\ndev_head columns:")
print(list(dev_head.columns))




train_head columns:
['impression_id', 'clicked', 'news_id', 'coclick_score', 'global_ctr', 'global_shown_cnt', 'global_click_cnt', 'category', 'subcategory', 'title_len', 'abstract_len']

dev_head columns:
['impression_id', 'clicked', 'news_id', 'coclick_score', 'global_ctr', 'global_shown_cnt', 'global_click_cnt', 'category', 'subcategory', 'title_len', 'abstract_len']


In [11]:
train = train_head.copy()
dev = dev_head.copy()

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

feature_cols_num = ["coclick_score","global_ctr","global_shown_cnt","global_click_cnt","title_len","abstract_len"]
feature_cols_cat = ["category","subcategory"]

X_train = train[feature_cols_num + feature_cols_cat]
y_train = train["clicked"].values

X_dev = dev[feature_cols_num + feature_cols_cat]
y_dev = dev["clicked"].values

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), feature_cols_cat),
        ("num", "passthrough", feature_cols_num),
    ]
)


In [13]:
from sklearn.linear_model import LogisticRegression

cols = [
    "impression_id","clicked","news_id","coclick_score","global_ctr",
    "global_shown_cnt","global_click_cnt","category","subcategory",
    "title_len","abstract_len"
]

train_head.columns = cols
dev_head.columns   = cols

train = train_head.copy()
dev   = dev_head.copy()

for df in (train, dev):
    df["impression_id"] = pd.to_numeric(df["impression_id"], errors="coerce").fillna(-1).astype("int32")

    df["clicked"] = pd.to_numeric(df["clicked"], errors="coerce").fillna(0).astype("int8")

    df["coclick_score"] = pd.to_numeric(df["coclick_score"], errors="coerce").fillna(0).astype("float32")
    df["global_ctr"]    = pd.to_numeric(df["global_ctr"], errors="coerce").fillna(0).astype("float32")

    df["global_shown_cnt"] = pd.to_numeric(df["global_shown_cnt"], errors="coerce").fillna(0).astype("int32")
    df["global_click_cnt"] = pd.to_numeric(df["global_click_cnt"], errors="coerce").fillna(0).astype("int32")

    df["title_len"]    = pd.to_numeric(df["title_len"], errors="coerce").fillna(0).astype("int32")
    df["abstract_len"] = pd.to_numeric(df["abstract_len"], errors="coerce").fillna(0).astype("int32")

    df["category"]    = df["category"].fillna("").astype(object)
    df["subcategory"] = df["subcategory"].fillna("").astype(object)
    df["news_id"]      = df["news_id"].fillna("").astype(object)

train = train[train["impression_id"] >= 0].copy()
dev   = dev[dev["impression_id"] >= 0].copy()

num_cols = ["coclick_score","global_ctr","global_shown_cnt","global_click_cnt","title_len","abstract_len"]
cat_cols = ["category","subcategory"]

X_train = train[num_cols + cat_cols]
y_train = train["clicked"].values

X_dev = dev[num_cols + cat_cols]
y_dev = dev["clicked"].values

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

rank_model = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=500, solver="liblinear"))
])

rank_model.fit(X_train, y_train)

dev["pred"] = rank_model.predict_proba(X_dev)[:, 1].astype("float32")

print("train shape:", train.shape, "dev shape:", dev.shape)
dev[["impression_id","news_id","clicked","pred","global_ctr","coclick_score","category","subcategory"]].head(10)


train shape: (2000, 11) dev shape: (2000, 12)


Unnamed: 0,impression_id,news_id,clicked,pred,global_ctr,coclick_score,category,subcategory
0,1,N51470,0,0.011852,0.0,0.0,weathertopstories,68
1,1,N6400,0,0.013039,1.0,0.0,more_sports,83
2,1,N21679,0,0.0095,0.0,0.0,football_nfl,60
3,1,N42844,0,0.048469,8.0,0.0,tvnews,80
4,1,N58098,0,0.015495,3.0,0.0,quickandeasy,42
5,1,N5472,0,0.006925,0.0,0.0,football_nfl,103
6,1,N49285,0,0.073113,3.0,0.0,tv-celebrity,67
7,1,N33176,0,0.015061,0.0,0.0,newsus,75
8,1,N24802,0,0.017077,0.0,0.0,newsus,85
9,1,N31958,1,0.009155,2.0,0.5,football_nfl,68


In [14]:
def ndcg_at_k(labels, scores, k=10):
    labels = np.asarray(labels, dtype=np.int32)
    scores = np.asarray(scores, dtype=np.float32)

    order = np.argsort(-scores)
    labels_sorted = labels[order][:k]

    gains = (2 ** labels_sorted - 1)
    discounts = 1.0 / np.log2(np.arange(2, len(labels_sorted) + 2))
    dcg = float(np.sum(gains * discounts))

    ideal = np.sort(labels)[::-1][:k]
    gains_i = (2 ** ideal - 1)
    discounts_i = 1.0 / np.log2(np.arange(2, len(ideal) + 2))
    idcg = float(np.sum(gains_i * discounts_i))

    return 0.0 if idcg == 0 else dcg / idcg

def mean_ndcg(df, score_col, k=10):
    vals = []
    for _, g in df.groupby("impression_id"):
        vals.append(ndcg_at_k(g["clicked"].values, g[score_col].values, k=k))
    return float(np.mean(vals)) if vals else 0.0

ndcg_pred = mean_ndcg(dev, "pred", k=10)
ndcg_ctr  = mean_ndcg(dev, "global_ctr", k=10)
ndcg_co   = mean_ndcg(dev, "coclick_score", k=10)

print("nDCG@10 (ML pred):        ", ndcg_pred)
print("nDCG@10 (global_ctr base):", ndcg_ctr)
print("nDCG@10 (coclick base):   ", ndcg_co)

print("Dev CTR (label prevalence):", float(dev["clicked"].mean()))
print("Unique impressions in dev:", int(dev["impression_id"].nunique()))
print("Avg rows per impression:", float(len(dev) / max(dev["impression_id"].nunique(), 1)))


nDCG@10 (ML pred):         0.2984224926592879
nDCG@10 (global_ctr base): 0.3123537075891196
nDCG@10 (coclick base):    0.42202890662192727
Dev CTR (label prevalence): 0.0465
Unique impressions in dev: 64
Avg rows per impression: 31.25


I tested three ways to decide which news articles to show a user and measured how well each method
puts the articles a user actually clicks near the top of the list (higher score = better). In my test set, users
clicked about 4.7% of the shown articles, which is normal for recommendation systems. The method based
on "people who read this also read that" (co-click) performed best (0.422), meaning it most reliably ranked
the articles users ended up choosing. The "overall popularity" method (global click rate) was weaker (0.312),
and our first machine-learning model using basic information (like popularity and simple text/category
details) did not beat these baselines (0.298). The takeaway is that user behavior patterns are currently the
strongest signal for recommending content, and the next business value step is improving the machine-
learning model so it can add extra lift beyond behavior-only recommendations (for example, by learning
what types of content each user prefers and adapting recommendations more precisely).

In [15]:
example_imp = int(dev["impression_id"].iloc[0])

ex = dev[dev["impression_id"] == example_imp].copy()
ex = ex.sort_values("pred", ascending=False)

ex[["impression_id","news_id","clicked","pred","global_ctr","coclick_score","category","subcategory"]].head(20)


Unnamed: 0,impression_id,news_id,clicked,pred,global_ctr,coclick_score,category,subcategory
6,1,N49285,0,0.073113,3.0,0.0,tv-celebrity,67
10,1,N34130,0,0.060896,2.0,0.0,tv-celebrity,80
17,1,N53572,0,0.058468,2.0,0.0,musicnews,102
3,1,N42844,0,0.048469,8.0,0.0,tvnews,80
16,1,N50775,0,0.032336,1.0,0.0,newsworld,67
14,1,N48740,0,0.028892,1.0,0.0,voices,139
18,1,N6916,0,0.025692,1061.0,0.04147,celebrity,63
20,1,N29862,0,0.022282,0.0,0.0,shop-holidays,54
8,1,N24802,0,0.017077,0.0,0.0,newsus,85
4,1,N58098,0,0.015495,3.0,0.0,quickandeasy,42


In [16]:
num_cols = []
for c in ["coclick_score", "global_ctr", "global_shown_cnt", "global_click_cnt", "title_len", "abstract_len"]:
    if c in train.columns and c in dev.columns:
        num_cols.append(c)

cat_cols = []
for c in ["category", "subcategory"]:
    if c in train.columns and c in dev.columns:
        cat_cols.append(c)

print("Numeric features:", num_cols)
print("Categorical features:", cat_cols)


Numeric features: ['coclick_score', 'global_ctr', 'global_shown_cnt', 'global_click_cnt', 'title_len', 'abstract_len']
Categorical features: ['category', 'subcategory']


In [17]:
train = train.sort_values("impression_id").reset_index(drop=True)
dev   = dev.sort_values("impression_id").reset_index(drop=True)

y_train = train["clicked"].astype(int).values
y_dev   = dev["clicked"].astype(int).values

X_train_raw = train[num_cols + cat_cols]
X_dev_raw   = dev[num_cols + cat_cols]

pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

X_train = pre.fit_transform(X_train_raw)
X_dev   = pre.transform(X_dev_raw)

group_train = train.groupby("impression_id").size().values
group_dev   = dev.groupby("impression_id").size().values

print("groups train:", len(group_train), "rows:", group_train.sum())
print("groups dev:", len(group_dev), "rows:", group_dev.sum())



groups train: 44 rows: 2000
groups dev: 64 rows: 2000


In [18]:
from lightgbm import LGBMRanker

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

ranker.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_dev, y_dev)],
    eval_group=[group_dev],
    eval_at=[10],
    callbacks=[lgb.log_evaluation(period=50)]
)
dev["pred_lgbm"] = ranker.predict(X_dev).astype("float32")
dev[["impression_id","news_id","clicked","pred_lgbm"]].head()

[50]	valid_0's ndcg@10: 0.423143
[100]	valid_0's ndcg@10: 0.430427
[150]	valid_0's ndcg@10: 0.445105
[200]	valid_0's ndcg@10: 0.452361
[250]	valid_0's ndcg@10: 0.443973
[300]	valid_0's ndcg@10: 0.444303


Unnamed: 0,impression_id,news_id,clicked,pred_lgbm
0,1,N51470,0,-9.522739
1,1,N55237,0,-9.921217
2,1,N29862,0,-9.333686
3,1,N6916,0,-7.75713
4,1,N53572,0,-10.906137


In [19]:
def ndcg_at_k(labels, scores, k=10):
    labels = np.asarray(labels, dtype=np.int32)
    scores = np.asarray(scores, dtype=np.float32)

    order = np.argsort(-scores)
    labels_sorted = labels[order][:k]

    gains = (2 ** labels_sorted - 1)
    discounts = 1.0 / np.log2(np.arange(2, len(labels_sorted) + 2))
    dcg = float(np.sum(gains * discounts))

    ideal = np.sort(labels)[::-1][:k]
    gains_i = (2 ** ideal - 1)
    discounts_i = 1.0 / np.log2(np.arange(2, len(ideal) + 2))
    idcg = float(np.sum(gains_i * discounts_i))

    return 0.0 if idcg == 0 else dcg / idcg

def mean_ndcg(df, score_col, k=10):
    vals = []
    for _, g in df.groupby("impression_id"):
        vals.append(ndcg_at_k(g["clicked"].values, g[score_col].values, k=k))
    return float(np.mean(vals)) if vals else 0.0

ndcg_lgbm = mean_ndcg(dev, "pred_lgbm", k=10)
print("nDCG@10 (LightGBM ranker):", ndcg_lgbm)

if "global_ctr" in dev.columns:
    print("nDCG@10 (global_ctr):", mean_ndcg(dev, "global_ctr", k=10))
if "coclick_score" in dev.columns:
    print("nDCG@10 (coclick_score):", mean_ndcg(dev, "coclick_score", k=10))


nDCG@10 (LightGBM ranker): 0.4443029032056575
nDCG@10 (global_ctr): 0.2991319013358326
nDCG@10 (coclick_score): 0.38460111697209587


I upgraded my recommendation system from simple "rule-based" ranking methods to a more advanced
machine-learning ranking model that learns from past user behavior and article characteristics to decide
what to show first. After preparing clean training and test datasets, I trained a LightGBM ranking model to
predict which articles a user is most likely to click and then evaluated how well it orders results near the top
of the list. The results show a clear improvement... the machine-learning ranker achieved the highest
performance score (0.444 nDCG@10), beating both the popularity-based approach (0.299) and the "people
who read this also read that" behavior baseline (0.385). For Microsoft News, this means the new model is
better at putting the content a user is most likely to engage with in the first few positions, which should
translate to higher engagement (more clicks and longer sessions) if deployed.

In [20]:
from scipy import sparse

Xtr = X_train if sparse.issparse(X_train) else sparse.csr_matrix(X_train)
Xdv = X_dev   if sparse.issparse(X_dev)   else sparse.csr_matrix(X_dev)

dtrain = xgb.DMatrix(Xtr, label=y_train)
ddev   = xgb.DMatrix(Xdv, label=y_dev)

dtrain.set_group(group_train)
ddev.set_group(group_dev)

params = {
    "objective": "rank:ndcg",
    "eval_metric": "ndcg@10",
    "eta": 0.05,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "random_state": 42,
}

watchlist = [(dtrain, "train"), (ddev, "dev")]

xgb_ranker = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=watchlist,
    verbose_eval=50,
    early_stopping_rounds=50
)

dev["pred_xgb"] = xgb_ranker.predict(ddev).astype("float32")

ndcg_xgb = mean_ndcg(dev, "pred_xgb", k=10)
print("nDCG@10 (XGBoost ranker):", ndcg_xgb)
print("nDCG@10 (LightGBM ranker):", mean_ndcg(dev, "pred_lgbm", k=10))
print("nDCG@10 (coclick_score):", mean_ndcg(dev, "coclick_score", k=10))
print("nDCG@10 (global_ctr):", mean_ndcg(dev, "global_ctr", k=10))


[0]	train-ndcg@10:0.67257	dev-ndcg@10:0.37892
[50]	train-ndcg@10:0.78954	dev-ndcg@10:0.38162
[100]	train-ndcg@10:0.81226	dev-ndcg@10:0.39962
[145]	train-ndcg@10:0.83199	dev-ndcg@10:0.38843
nDCG@10 (XGBoost ranker): 0.3884337122857202
nDCG@10 (LightGBM ranker): 0.4443029032056575
nDCG@10 (coclick_score): 0.38460111697209587
nDCG@10 (global_ctr): 0.2991319013358326


I compared two advanced machine-learning ranking models (LightGBM and XGBoost) against simpler business baselines: overall popularity (global click rate) and "people who read this also read that" similarity (co-click). The results show that LightGBM performed best with an overall ranking quality score of 0.444 nDCG@10, meaning it was most effective at placing the articles users actually clicked into the top positions; XGBoost performed lower at 0.388, roughly similar to the co-click
baseline (0.385) and still well above popularity alone (0.299). In business terms, this tells us two things: (1)
behavior-based similarity is a strong starting point, but (2) the LightGBM machine-learning model adds clear
incremental value beyond simple rules and should be considered the preferred approach because it most
consistently surfaces the right content earlier, which typically translates into higher engagement (more clicks
and longer sessions).