In [1]:
import numpy as np
import polars as pl

import lightgbm

from recommenders.utils import fix_dtypes

In [2]:
test_path = './data/features/merged_features_test.parquet.gzip'
merged_test = fix_dtypes(pl.read_parquet(test_path))

In [3]:
merged_test = merged_test.to_pandas()
print(f'Test shape: {merged_test.shape}')

Test shape: (146573625, 35)


In [4]:
ranker = lightgbm.Booster(model_file='./models/lgb/ranker_32feat_2902it.lgb')

In [8]:
SUPPP_COLS = ['user_id', 'item_id', 'source_id']

chank_size = 20_000_000
predictions_list = []

for i, chank in enumerate(range(0, merged_test.shape[0], chank_size)):
    print(f"Started chank: {i}, Chank idxs: {chank:_} : {(chank + chank_size):_}...")
    test_chank = merged_test.iloc[chank : chank + chank_size].drop(columns=SUPPP_COLS)
    print(f"Chank size: {test_chank.shape}")
    chank_preds = ranker.predict(test_chank.to_numpy())
    predictions_list.append(chank_preds)

Started chank: 0, Chank idxs: 0 : 20_000_000...
Chank size: (20000000, 32)
Started chank: 1, Chank idxs: 20_000_000 : 40_000_000...
Chank size: (20000000, 32)
Started chank: 2, Chank idxs: 40_000_000 : 60_000_000...
Chank size: (20000000, 32)
Started chank: 3, Chank idxs: 60_000_000 : 80_000_000...
Chank size: (20000000, 32)
Started chank: 4, Chank idxs: 80_000_000 : 100_000_000...
Chank size: (20000000, 32)
Started chank: 5, Chank idxs: 100_000_000 : 120_000_000...
Chank size: (20000000, 32)
Started chank: 6, Chank idxs: 120_000_000 : 140_000_000...
Chank size: (20000000, 32)
Started chank: 7, Chank idxs: 140_000_000 : 160_000_000...
Chank size: (6573625, 32)


In [9]:
concated_preds = np.concatenate(predictions_list, axis=0)
print(concated_preds.shape)

(146573625,)


In [10]:
prediction_df = merged_test[['user_id', 'item_id']]
prediction_df['predictions'] = concated_preds

prediction_df = prediction_df.sort_values(['user_id', 'predictions'], ascending=[True, False])
prediction_df['rank'] = prediction_df.groupby('user_id').cumcount()

prf = prediction_df[prediction_df['rank'] < 20].groupby('user_id', as_index=False)['item_id'].agg(list)
prf = prf.rename(columns={'item_id': 'predictions'})
prf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['predictions'] = concated_preds


Unnamed: 0,user_id,predictions
0,7,"[115127, 221001, 77577, 162251, 35482, 12697, ..."
1,8,"[97249, 142183, 163702, 44222, 172180, 134460,..."
2,9,"[29054, 105130, 227299, 32474, 53603, 149513, ..."
3,11,"[191744, 168651, 83212, 187465, 140293, 97505,..."
4,18,"[117495, 120767, 189621, 91967, 206293, 41635,..."
...,...,...
199995,1000160,"[220549, 103409, 173500, 7765, 22512, 190438, ..."
199996,1000165,"[76464, 3800, 36759, 74367, 225171, 186660, 11..."
199997,1000166,"[35482, 23837, 14760, 66499, 73237, 101474, 16..."
199998,1000168,"[150028, 186660, 179166, 141730, 200730, 84951..."


In [12]:
prf.to_parquet('./data/submissions/sub_final.parquet.gzip', compression='gzip')