In [1]:
import random
import implicit
import pickle 

import pandas as pd
import numpy as np
import polars as pl

from tqdm import tqdm
from scipy import sparse

import lightgbm

In [2]:
def cast_to_float32(pl_frame: pl.DataFrame):
    float_columns = [
        col for col in pl_frame.columns
        if pl_frame[col].dtype == pl.Float64 or (pl_frame[col].dtype == pl.Int64 and pl_frame[col].null_count() > 0)
    ]
    return pl_frame.with_columns([pl.col(col).cast(pl.Float32) for col in float_columns])

def read_parquet(path: str) -> pl.DataFrame:
    df = pl.read_parquet(path)
    for col in ['user_id', 'item_id', 'source_id']:
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Int32, strict=False))

    if '__index_level_0__' in df.columns:
        df = df.drop(['__index_level_0__'])
    return df

test_path = '../data/final_df/test/merged_features_test_rs.parquet.gzip'
merged_test = cast_to_float32(read_parquet(test_path))

merged_test = merged_test.to_pandas()
print(f'Test shape: {merged_test.shape}')

Test shape: (86284083, 49)


In [5]:
ranker = lightgbm.Booster(model_file='../models/lgb/als250_bm150_cont150_40feat_w_rank_n_score_2711it_modif_tg.lgb')

In [6]:
preds_final_test = ranker.predict(merged_test[ranker.feature_name()])

In [7]:
prediction_df = merged_test[['user_id', 'item_id']]
prediction_df['predictions'] = preds_final_test

prediction_df_sorted = prediction_df.sort_values(['user_id', 'predictions'], ascending=[True, False])
prediction_df_sorted['rnk'] = prediction_df_sorted.groupby('user_id').cumcount()

prf = prediction_df_sorted[prediction_df_sorted['rnk'] < 20].groupby('user_id', as_index=False)['item_id'].agg(list)
prf = prf.rename(columns={'item_id': 'predictions'})
prf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['predictions'] = preds_final_test


Unnamed: 0,user_id,predictions
0,7,"[115127, 221001, 77577, 2216, 12697, 162251, 6..."
1,8,"[142183, 44222, 163702, 97249, 49912, 46440, 2..."
2,9,"[32474, 53603, 4804, 227299, 194428, 29054, 14..."
3,11,"[83212, 533, 149738, 116476, 56056, 215901, 14..."
4,18,"[7871, 225311, 120767, 155973, 129830, 131220,..."
...,...,...
199995,1000160,"[70456, 125478, 220549, 170326, 41241, 86886, ..."
199996,1000165,"[120027, 161636, 87174, 210739, 74367, 194017,..."
199997,1000166,"[217651, 181415, 128307, 82546, 162492, 160944..."
199998,1000168,"[186660, 150028, 98002, 101773, 141730, 179166..."


In [8]:
prf.to_parquet('../data/submissions/pred_final_attm_5_40f_modif_tg.parquet.gzip', compression='gzip')

In [9]:
prf[prf['predictions'].apply(len) == 20]

Unnamed: 0,user_id,predictions
0,7,"[115127, 221001, 77577, 2216, 12697, 162251, 6..."
1,8,"[142183, 44222, 163702, 97249, 49912, 46440, 2..."
2,9,"[32474, 53603, 4804, 227299, 194428, 29054, 14..."
3,11,"[83212, 533, 149738, 116476, 56056, 215901, 14..."
4,18,"[7871, 225311, 120767, 155973, 129830, 131220,..."
...,...,...
199995,1000160,"[70456, 125478, 220549, 170326, 41241, 86886, ..."
199996,1000165,"[120027, 161636, 87174, 210739, 74367, 194017,..."
199997,1000166,"[217651, 181415, 128307, 82546, 162492, 160944..."
199998,1000168,"[186660, 150028, 98002, 101773, 141730, 179166..."
