In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [12]:
df = pd.read_csv('/home/adm2/python-scripts/retraining_pipeline/datasets/scorer_dataset_20240829.zip', compression='zip')
df['rank'] = df.groupby('you_oid')['rank'].rank(method='first', ascending=True).astype(int)

In [19]:
from sklearn.model_selection import GroupShuffleSplit

# Features and labels
X = df[['color_score', 'openai_score', 'image_score', 'mobilenet_score']]  # Feature columns
y = df['rank']  # Ranking labels

# Group sizes (5 per group since you have top 5 ranks for each 'you_oid')
groups = df.groupby('you_oid').size().values

# Group-based train-test split
gss = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=df['you_oid']))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train = df.iloc[train_idx].groupby('you_oid').size().values
groups_test = df.iloc[test_idx].groupby('you_oid').size().values

# Create DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

# Set group sizes for the training and test sets
train_dmatrix.set_group(groups_train)
test_dmatrix.set_group(groups_test)

# Train the XGBoost model
params = {
    'objective': 'rank:ndcg',
    'eval_metric': 'ndcg',
    'eta': 0.1,
    'max_depth': 6,
    'seed': 42
}

bst = xgb.train(params, train_dmatrix, num_boost_round=500, evals=[(test_dmatrix, 'test')], early_stopping_rounds=10)

# Predict ranking for the test set
pred = bst.predict(test_dmatrix)

# Display predictions
print("Predicted rankings:", pred)

# Evaluate the model
from sklearn.metrics import ndcg_score

# Calculate NDCG score
ndcg = ndcg_score([y_test], [pred], k=5)
print("NDCG score:", ndcg)

[0]	test-ndcg:0.76927
[1]	test-ndcg:0.77193
[2]	test-ndcg:0.77193
[3]	test-ndcg:0.77758
[4]	test-ndcg:0.77730
[5]	test-ndcg:0.77819
[6]	test-ndcg:0.77861
[7]	test-ndcg:0.78042
[8]	test-ndcg:0.78032
[9]	test-ndcg:0.78031
[10]	test-ndcg:0.78052
[11]	test-ndcg:0.78077
[12]	test-ndcg:0.78077
[13]	test-ndcg:0.78126
[14]	test-ndcg:0.78122
[15]	test-ndcg:0.78124
[16]	test-ndcg:0.78127
[17]	test-ndcg:0.78125
[18]	test-ndcg:0.78161
[19]	test-ndcg:0.78204
[20]	test-ndcg:0.78274
[21]	test-ndcg:0.78275
[22]	test-ndcg:0.78356
[23]	test-ndcg:0.78392
[24]	test-ndcg:0.78394
[25]	test-ndcg:0.78414
[26]	test-ndcg:0.78419
[27]	test-ndcg:0.78443
[28]	test-ndcg:0.78388
[29]	test-ndcg:0.78397
[30]	test-ndcg:0.78379
[31]	test-ndcg:0.78376
[32]	test-ndcg:0.78389
[33]	test-ndcg:0.78389
[34]	test-ndcg:0.78404
[35]	test-ndcg:0.78453
[36]	test-ndcg:0.78425
[37]	test-ndcg:0.78412
[38]	test-ndcg:0.78412
[39]	test-ndcg:0.78404
[40]	test-ndcg:0.78417
[41]	test-ndcg:0.78411
[42]	test-ndcg:0.78377
[43]	test-ndcg:0.7838

In [22]:
import pandas as pd
import numpy as np

# Assuming you already have your X_test and y_pred as in previous steps

# Convert predictions to a DataFrame for easier manipulation
pred_df = pd.DataFrame({
    'you_oid': df['you_oid'].iloc[test_idx],  # Grouping key
    'pred_score': pred                      # Predicted scores from the model
})

# Group by `you_oid` and rank the `pred_score` within each group
pred_df['pred_rank'] = pred_df.groupby('you_oid')['pred_score'].rank(method='first', ascending=False).astype(int)

# Display the predictions with ranks
print(pred_df)


                         you_oid  pred_score  pred_rank
126     5f640b3b051a932d5762b433    0.468921          2
127     5f640b3b051a932d5762b433    0.385344          4
128     5f640b3b051a932d5762b433    0.315848          5
129     5f640b3b051a932d5762b433    0.177788          6
130     5f640b3b051a932d5762b433    0.108562          7
...                          ...         ...        ...
521316  66ce56b2b94a5511fe0be08f    0.810955          1
521317  66ce56b2b94a5511fe0be08f    0.647703          2
521318  66ce56b2b94a5511fe0be08f    0.274513          4
521319  66ce56b2b94a5511fe0be08f    0.515637          3
521320  66ce56b2b94a5511fe0be08f    0.109823          5

[52077 rows x 3 columns]


In [24]:
y_test

126       5
127       6
128       1
129       2
130       3
         ..
521316    4
521317    3
521318    1
521319    5
521320    2
Name: rank, Length: 52077, dtype: int64

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred_df['pred_rank']))

              precision    recall  f1-score   support

           1       0.16      0.16      0.16     10006
           2       0.17      0.17      0.17      9541
           3       0.21      0.21      0.21      9351
           4       0.17      0.17      0.17      9183
           5       0.14      0.14      0.14      9034
           6       0.09      0.09      0.09      2626
           7       0.08      0.08      0.08       989
           8       0.09      0.09      0.09       524
           9       0.04      0.04      0.04       332
          10       0.06      0.06      0.06       258
          11       0.02      0.02      0.02       128
          12       0.00      0.00      0.00        54
          13       0.06      0.06      0.06        32
          14       0.00      0.00      0.00        13
          15       0.00      0.00      0.00         5
          16       0.00      0.00      0.00         1

    accuracy                           0.16     52077
   macro avg       0.08   

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

# Assuming `df` is your original DataFrame with the true ranks
# and `pred_df` contains the new predicted ranks.

# Merge the true ranks with the predicted ranks
evaluation_df = pd.merge(df[['you_oid', 'rank']], pred_df[['you_oid', 'pred_rank']], on='you_oid')

# Display the merged DataFrame
print("Merged DataFrame with True Ranks and Predicted Ranks:")
print(evaluation_df)

# Evaluate Mean NDCG
# Group by `you_oid` to compute NDCG for each group
groups = evaluation_df.groupby('you_oid')
ndcg_scores = []

for _, group in groups:
    true_ranks = group['rank'].values
    pred_ranks = group['pred_rank'].values
    ndcg = ndcg_score([true_ranks], [pred_ranks])
    ndcg_scores.append(ndcg)

mean_ndcg = np.mean(ndcg_scores)
print(f'Mean NDCG Score: {mean_ndcg}')

# Evaluate Mean Reciprocal Rank (MRR)
reciprocal_ranks = []

for _, group in groups:
    # Sort by predicted rank to find the true rank of the top prediction
    sorted_group = group.sort_values(by='pred_rank')
    # Get the rank of the first item in sorted predictions
    true_rank_of_top_pred = sorted_group['rank'].iloc[0]
    reciprocal_rank = 1 / true_rank_of_top_pred
    reciprocal_ranks.append(reciprocal_rank)

mean_mrr = np.mean(reciprocal_ranks)
print(f'Mean Reciprocal Rank (MRR): {mean_mrr}')


Merged DataFrame with True Ranks and Predicted Ranks:
                         you_oid  rank  pred_rank
0       5f640b3b051a932d5762b433     5          2
1       5f640b3b051a932d5762b433     5          4
2       5f640b3b051a932d5762b433     5          5
3       5f640b3b051a932d5762b433     5          6
4       5f640b3b051a932d5762b433     5          7
...                          ...   ...        ...
296372  66ce56b2b94a5511fe0be08f     2          1
296373  66ce56b2b94a5511fe0be08f     2          2
296374  66ce56b2b94a5511fe0be08f     2          4
296375  66ce56b2b94a5511fe0be08f     2          3
296376  66ce56b2b94a5511fe0be08f     2          5

[296377 rows x 3 columns]


ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got binary instead