In [11]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, average_precision_score

In [None]:
url_rating = 'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/refs/heads/master/ratings.csv'
df_rating = pd.read_csv(url_rating)

url_books = 'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/refs/heads/master/books.csv'
df_books = pd.read_csv(url_books)

In [26]:
df = pd.merge(df_rating, df_books, on='book_id')
df

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,258,5,1232,1232,3209783,279,143034901,9.780143e+12,"Carlos Ruiz Zafón, Lucia Graves",...,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
1,11,258,3,1232,1232,3209783,279,143034901,9.780143e+12,"Carlos Ruiz Zafón, Lucia Graves",...,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
2,143,258,4,1232,1232,3209783,279,143034901,9.780143e+12,"Carlos Ruiz Zafón, Lucia Graves",...,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
3,242,258,5,1232,1232,3209783,279,143034901,9.780143e+12,"Carlos Ruiz Zafón, Lucia Graves",...,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
4,325,258,4,1232,1232,3209783,279,143034901,9.780143e+12,"Carlos Ruiz Zafón, Lucia Graves",...,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5976474,36773,9548,5,30839185,30839185,51437887,4,,,L.J. Shen,...,15460,15482,2179,241,504,1952,5193,7592,https://images.gr-assets.com/books/1481945407m...,https://images.gr-assets.com/books/1481945407s...
5976475,50673,9548,4,30839185,30839185,51437887,4,,,L.J. Shen,...,15460,15482,2179,241,504,1952,5193,7592,https://images.gr-assets.com/books/1481945407m...,https://images.gr-assets.com/books/1481945407s...
5976476,45213,9548,3,30839185,30839185,51437887,4,,,L.J. Shen,...,15460,15482,2179,241,504,1952,5193,7592,https://images.gr-assets.com/books/1481945407m...,https://images.gr-assets.com/books/1481945407s...
5976477,12872,9548,4,30839185,30839185,51437887,4,,,L.J. Shen,...,15460,15482,2179,241,504,1952,5193,7592,https://images.gr-assets.com/books/1481945407m...,https://images.gr-assets.com/books/1481945407s...


In [28]:
train_data, test_data = train_test_split(df, test_size=0.3, shuffle=True)

In [29]:
# Сортуємо train по user_id
train_data = train_data.sort_values('user_id').reset_index(drop=True)
X_train = train_data[['ratings_count', 'average_rating']].values
y_train = train_data['rating'].values
group_train = train_data.groupby('user_id').size().tolist()

dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

# Сортуємо test по user_id
test_data = test_data.sort_values('user_id').reset_index(drop=True)
X_test = test_data[['ratings_count', 'average_rating']].values
y_test = test_data['rating'].values
group_test = test_data.groupby('user_id').size().tolist()

dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(group_test)

In [36]:
params = {
    'objective': 'rank:pairwise',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 3,
    'eval_metric': 'ndcg'
}

evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=20, evals=evals)

[0]	train-ndcg:0.75477	test-ndcg:0.86447
[1]	train-ndcg:0.75631	test-ndcg:0.86559
[2]	train-ndcg:0.75725	test-ndcg:0.86610
[3]	train-ndcg:0.75787	test-ndcg:0.86627
[4]	train-ndcg:0.75847	test-ndcg:0.86654
[5]	train-ndcg:0.75855	test-ndcg:0.86656
[6]	train-ndcg:0.75865	test-ndcg:0.86666
[7]	train-ndcg:0.75872	test-ndcg:0.86673
[8]	train-ndcg:0.75878	test-ndcg:0.86674
[9]	train-ndcg:0.75879	test-ndcg:0.86676
[10]	train-ndcg:0.75881	test-ndcg:0.86677
[11]	train-ndcg:0.75881	test-ndcg:0.86677
[12]	train-ndcg:0.75882	test-ndcg:0.86678
[13]	train-ndcg:0.75892	test-ndcg:0.86679
[14]	train-ndcg:0.75893	test-ndcg:0.86682
[15]	train-ndcg:0.75893	test-ndcg:0.86682
[16]	train-ndcg:0.75893	test-ndcg:0.86683
[17]	train-ndcg:0.75897	test-ndcg:0.86687
[18]	train-ndcg:0.75898	test-ndcg:0.86687
[19]	train-ndcg:0.75900	test-ndcg:0.86688


In [37]:
test_data['pred_score'] = model.predict(dtest)

ranked_data = test_data.sort_values(['user_id','pred_score'], ascending=[True, False])
ranked_data = ranked_data[['user_id', 'original_title', 'rating', 'pred_score']]
ranked_data

Unnamed: 0,user_id,original_title,rating,pred_score
12,1,East of Eden,4,0.224817
10,1,Ender's Game,5,0.174068
20,1,Братья Карамазовы,4,0.174068
15,1,The Last Lecture,4,0.157302
5,1,La sombra del viento,5,0.117076
...,...,...,...,...
1792925,53424,Little Lord Fauntleroy,4,-0.363182
1792942,53424,Artemis Fowl,4,-0.363182
1792913,53424,"Sarah, Plain and Tall",4,-0.469412
1792930,53424,Breaking Dawn,3,-0.555981


In [38]:
ranked_data[ranked_data['user_id'] == 4]

Unnamed: 0,user_id,original_title,rating,pred_score
99,4,The Hobbit and The Lord of the Rings,4,0.490567
86,4,Harry Potter and the Prisoner of Azkaban,5,0.430197
98,4,Harry Potter and the Philosopher's Stone,5,0.326971
89,4,Harry Potter and the Chamber of Secrets,5,0.242362
100,4,The Ultimate Hitchhiker's Guide: Five Complete...,4,0.242362
105,4,The Giving Tree,5,0.242362
92,4,Matilda,4,0.174068
77,4,A Tree Grows In Brooklyn,5,0.117076
95,4,The Glass Castle,5,0.117076
110,4,Pride and Prejudice,5,0.117076


In [39]:
groups = test_data.groupby('user_id')
ndcg_list = []
map_list = []

for user, group in groups:
    y_true = group['rating'].values
    y_pred = group['pred_score'].values
    
    # NDCG на всій групі
    try:
        ndcg = ndcg_score([y_true], [y_pred])
        ndcg_list.append(ndcg)
    except ValueError:
        pass

    # MAP (average precision)
    y_true_bin = (y_true > 0).astype(int)
    if y_true_bin.sum() > 0:
        map_score = average_precision_score(y_true_bin, y_pred)
        map_list.append(map_score)

print("Mean NDCG:", np.mean(ndcg_list))
print("Mean MAP:", np.mean(map_list))

Mean NDCG: 0.9593988723190952
Mean MAP: 1.0
