<a href="https://colab.research.google.com/github/ChintPatel/CMPE255-Decision-trees-and-ensemble/blob/main/gradient_boost_ranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357288 sha256=e9e7f836551463c7da343fe7c3a4a34ccb5832a893c343e524b5a7675bd969eb
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [5]:
!pip install xgboost catboost lightgbm scikit-learn pandas


Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost, lightgbm
Successfully installed lightgbm-4.5.0 nvidia-nccl-cu12-2.23.4 xgboost-2.1.3


In [6]:

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import xgboost as xgb
import catboost as cb
import lightgbm as lgb

# Load the MovieLens dataset
# Install the dataset package if not already installed: pip install surprise
from surprise import Dataset

data = Dataset.load_builtin("ml-100k")  # MovieLens 100k dataset
raw_data = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rating", "timestamp"])

# Preprocess dataset
raw_data["user"] = raw_data["user"].astype(int)
raw_data["item"] = raw_data["item"].astype(int)
raw_data["rating"] = raw_data["rating"].astype(float)

# Create features and target
features = raw_data[["user", "item"]]
target = raw_data["rating"]

# Simulate groups as "users"
groups = raw_data["user"]

# Split into training and testing
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
    features, target, groups, test_size=0.2, random_state=42
)



Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [7]:
# Train data for CatBoost
train_data = pd.concat(
    [X_train.reset_index(drop=True), y_train.reset_index(drop=True), groups_train.reset_index(drop=True)],
    axis=1
)
train_data.columns = ["user", "item", "rating", "group_id"]  # Assign distinct column names
train_data = train_data.sort_values(by="group_id")
X_train_sorted = train_data[["user", "item"]]
y_train_sorted = train_data["rating"]
groups_train_sorted = train_data["group_id"]

# Test data for CatBoost
test_data = pd.concat(
    [X_test.reset_index(drop=True), y_test.reset_index(drop=True), groups_test.reset_index(drop=True)],
    axis=1
)
test_data.columns = ["user", "item", "rating", "group_id"]  # Assign distinct column names
test_data = test_data.sort_values(by="group_id")
X_test_sorted = test_data[["user", "item"]]
y_test_sorted = test_data["rating"]
groups_test_sorted = test_data["group_id"]

# Models for ranking
models = {
    "XGBoost": xgb.XGBRanker(
        objective="rank:pairwise", learning_rate=0.1, n_estimators=300, max_depth=8, random_state=42
    ),
    "CatBoost": cb.CatBoostRanker(
        iterations=300, learning_rate=0.1, depth=8, verbose=0, random_seed=42
    ),
    "LightGBM": lgb.LGBMRanker(
        boosting_type="gbdt", objective="lambdarank", learning_rate=0.1, n_estimators=300, max_depth=8, random_state=42
    ),
}

# Train and evaluate each model
results = []
for model_name, model in models.items():
    if model_name == "XGBoost":
        model.fit(
            X_train, y_train, group=np.bincount(groups_train.astype(int))  # Groups as group sizes
        )
    elif model_name == "CatBoost":
        model.fit(
            X_train_sorted, y_train_sorted, group_id=groups_train_sorted
        )
    elif model_name == "LightGBM":
        model.fit(
            X_train, y_train, group=np.bincount(groups_train.astype(int))  # Groups as group sizes
        )

    # Predict relevance scores
    y_pred = model.predict(X_test)

    # Evaluate using Normalized Discounted Cumulative Gain (NDCG)
    ndcg = ndcg_score([y_test_sorted], [y_pred], k=10)  # NDCG@10
    results.append((model_name, ndcg))

# Convert results to DataFrame for display
results_df = pd.DataFrame(results, columns=["Model", "NDCG@10"])

# Display results
print(results_df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 507
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 2
      Model   NDCG@10
0   XGBoost  0.721659
1  CatBoost  0.702478
2  LightGBM  0.766141
