In [39]:
import numpy as np
import pandas as pd

#!pip install lightgbm
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import ndcg_score,log_loss,make_scorer,label_ranking_average_precision_score

In [13]:
#const
SEED = 42
TARGET_COLUMNS = 'relevance_Rank'
REMOVE_COLUMNS = ['receipt_id', 'company_id', 'matched_transaction_id',
       'feature_transaction_id', 'matched',
       'relevance_score', 'relevance_rank']

In [14]:
train_df = pd.read_pickle('./data/train_df_with_rank.pkl')
test_df = pd.read_pickle('./data/test_df_with_rank.pkl')
train_df.columns

Index(['receipt_id', 'company_id', 'matched_transaction_id',
       'feature_transaction_id', 'DateMappingMatch', 'AmountMappingMatch',
       'DescriptionMatch', 'DifferentPredictedTime', 'TimeMappingMatch',
       'PredictedNameMatch', 'ShortNameMatch', 'DifferentPredictedDate',
       'PredictedAmountMatch', 'PredictedTimeCloseMatch', 'matched',
       'relevance_score', 'relevance_rank'],
      dtype='object')

In [15]:
qids_train = train_df.groupby("receipt_id")["receipt_id"].count().to_numpy()
qids_test = test_df.groupby("receipt_id")["receipt_id"].count().to_numpy()

X_train = train_df.drop(REMOVE_COLUMNS, axis = 1)
y_train = train_df['relevance_rank']

X_test = test_df.drop(REMOVE_COLUMNS, axis = 1)
y_test = test_df['relevance_rank'].astype(int)


In [38]:
ranker = lgb.LGBMRanker(
                    objective="lambdarank",
                    boosting_type = "gbdt",
                    n_estimators = 5,
                    importance_type = "gain",
                    metric= "ndcg",
                    num_leaves = 10,
                    learning_rate = 0.05,
                    max_depth = -1,
                    label_gain =[i for i in range(max(y_train.max(), y_test.max()) + 1)])

# Training the model
ranker.fit(
      X=X_train,
      y=y_train,
      group=qids_train,
      eval_set=[(X_train, y_train),(X_test, y_test)],
      eval_group=[qids_train, qids_test],
      eval_at=[4, 8])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46
[LightGBM] [Info] Number of data points in the train set: 9445, number of used features: 10


38       1
39       1
40       2
41       2
34       3
        ..
11969    1
11970    2
11966    3
11967    4
11968    4
Name: relevance_rank, Length: 2589, dtype: int64

In [56]:
y_pred = ranker.predict(X_test)
ndcg = ndcg_score(np.array(y_test).reshape(1, -1), y_pred.reshape(1, -1))
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.915233335796369


In [47]:
y_pred

array([-0.41464254, -0.41464254, -0.23800353, ..., -0.10708225,
        0.43720942,  0.43720942])

In [53]:
test_df.loc[:,'pred_rs'] = y_pred
test_df.sort_values(by=['receipt_id', 'pred_rs'], ascending=[True, False], inplace=True)
# Use groupby on 'receipt_id' and rank by 'pred_rs', assigning rank to a new 'rank' column
test_df['pred_relevance_rank'] = test_df.groupby('receipt_id')['pred_rs'].rank(method='dense', ascending=True)
test_df['pred_relevance_rank'] = test_df['pred_relevance_rank'].astype(int)
test_df.head()

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,matched,relevance_score,relevance_rank,pred_rs,pred_relevance_rank
40,10003,10000,10412,10414,0.85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2,0.437209,3
41,10003,10000,10412,10415,0.85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2,0.437209,3
38,10003,10000,10412,10412,0.85,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.5558,1,0.437209,3
39,10003,10000,10412,10413,0.85,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.5558,1,0.437209,3
36,10003,10000,10412,10410,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001045,3,-0.238004,2


In [54]:
def calculate_mrr(df):
    mrr_scores = []
    
    for receipt_id in df['receipt_id'].unique():
        # Filter DataFrame by receipt_id
        receipt_df = df[df['receipt_id'] == receipt_id]
        
        # Get the actual rank of the top-predicted transaction
        top_predicted = receipt_df.loc[receipt_df['pred_relevance_rank'].idxmax()]
        actual_rank_of_top_predicted = top_predicted['relevance_rank']
        
        # Calculate the reciprocal rank
        reciprocal_rank = 1.0 / actual_rank_of_top_predicted
        mrr_scores.append(reciprocal_rank)
    
    # Compute the MRR across all receipts
    mrr = sum(mrr_scores) / len(mrr_scores)
    return mrr

# Calculate MRR for the DataFrame
mrr = calculate_mrr(test_df)
print(f"Mean Reciprocal Rank (MRR): {mrr}")

Mean Reciprocal Rank (MRR): 0.6234487734487735


In [55]:
test_df[test_df.receipt_id == '10,043'	]

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,matched,relevance_score,relevance_rank,pred_rs,pred_relevance_rank
488,10043,10000,10622,10620,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2,0.437209,4
489,10043,10000,10622,10621,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2,0.437209,4
490,10043,10000,10622,10622,0.95,0.0,0.6,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1,0.940159,1,0.437209,4
487,10043,10000,10622,10619,0.0,0.0,0.4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001308,3,-0.137051,3
485,10043,10000,10622,10617,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001045,4,-0.238004,2
486,10043,10000,10622,10618,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001045,4,-0.238004,2
484,10043,10000,10622,10614,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001045,4,-0.439198,1
