In [2]:
import pandas as pd
from pathlib import Path
import os
import random
import numpy as np
import json
from datetime import timedelta
from collections import Counter
from tqdm.notebook import tqdm
from heapq import nlargest

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder


In [3]:
### Paths ###

# Root data directory, relative to the notebook's working directory.
DATA_PATH = Path("../data")

# Raw input files located under DATA_PATH.
TRAIN_RAW_PATH = Path(DATA_PATH, "raw/train.jsonl")
TEST_RAW_PATH = Path(DATA_PATH, "raw/test.jsonl")
SAMPLE_SUBMISSION_RAW_PATH = Path(DATA_PATH, "raw/sample_submission.csv")

# Directory for processed files under DATA_PATH.
DATA_PROCESSED_PATH = Path(DATA_PATH, "processed")

# Directory of the prepared OTTO files under ../models/session-rec.
DATA_PATH_RN5L = Path("../models/session-rec/data/OTTO/prepared")

In [12]:
# Read the processed test and train event tables.
# The files are located through DATA_PROCESSED_PATH (defined in the paths cell)
# instead of a user-specific absolute path, so the notebook is not tied to one
# machine. The path is relative to the notebook's working directory.
test_df = pd.read_parquet(DATA_PROCESSED_PATH / "test_processed.parquet")
train_df = pd.read_parquet(DATA_PROCESSED_PATH / "train_processed.parquet")

## 1.Data preparation

In [8]:
# Keep one row per (session_real_id_encode, aid, type): repeated actions of the
# same type on the same aid within one real session are dropped.
# Row counts noted by the author: train 216.7M -> 167.6M, test 6.9M -> 5.6M.
test_df_filtered, train_df_filtered = (
    events.drop_duplicates(subset=["session_real_id_encode", "aid", "type"])
    for events in (test_df, train_df)
)

KeyboardInterrupt: 

In [None]:
# Split the de-duplicated test and train frames into cold and warm subsets.
# Row counts noted by the author:
#   train_df_filtered_cold 0.8M
#   test_df_filtered_cold  0.88M
#   test_df_filtered_warm  4.5M
#   train_df_filtered_warm 147.9M

# Cold rows: num_session_real == 0 AND num_unique_aid == 1.
# Both conditions are combined into a single mask with `&`; chaining two
# `[...]` selections would apply a mask built on the unfiltered frame to an
# already filtered frame and rely on pandas silently re-aligning it.
test_df_filtered_cold = test_df_filtered[
    (test_df_filtered.num_session_real == 0) & (test_df_filtered.num_unique_aid == 1)
]
train_df_filtered_cold = train_df_filtered[
    (train_df_filtered.num_session_real == 0) & (train_df_filtered.num_unique_aid == 1)
]

# Warm rows: every row whose num_unique_aid is not 1 (num_session_real is not checked).
test_df_filtered_warm = test_df_filtered[test_df_filtered.num_unique_aid != 1]
train_df_filtered_warm = train_df_filtered[train_df_filtered.num_unique_aid != 1]

In [None]:
# Display the warm test subset for inspection.
test_df_filtered_warm

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
1,12899780,1142000,1661724000378,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
2,12899780,582732,1661724058352,0,0,0,12899780_0,371,333.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
3,12899780,973453,1661724109199,0,0,0,12899780_0,94,87.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
4,12899780,736515,1661724136868,0,0,0,12899780_0,563,488.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6928072,14571534,1152891,1662328727586,0,0,0,14571534_0,183,149.0,0.0,...,4,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927162
6928086,14571547,1546409,1662328727386,0,0,0,14571547_0,363,352.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927175
6928087,14571547,1117925,1662328764818,0,0,0,14571547_0,144,137.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927175
6928088,14571548,1453906,1662328728006,0,0,0,14571548_0,183,180.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927176


In [None]:
# Keep only the session, aid, ts and session_real_id columns of the warm frames.
test_df_rn5l, train_df_rn5l = (
    warm_events[["session", "aid", "ts", "session_real_id"]]
    for warm_events in (test_df_filtered_warm, train_df_filtered_warm)
)

In [None]:
# Rename columns: aid -> ItemId, ts -> Time, session -> UserId.
test_df_rn5l = test_df_rn5l.rename(
    columns=dict(aid="ItemId", ts="Time", session="UserId")
)
train_df_rn5l = train_df_rn5l.rename(
    columns=dict(aid="ItemId", ts="Time", session="UserId")
)

In [None]:
# Divide Time by 1000 (presumably epoch milliseconds -> seconds; confirm
# against the data source). The division is vectorised over the whole column
# instead of calling a Python lambda per row.
# NOTE: running this cell twice divides twice; re-run the preceding cells first.
test_df_rn5l["Time"] = test_df_rn5l["Time"] / 1000
train_df_rn5l["Time"] = train_df_rn5l["Time"] / 1000

In [None]:
# Ordinal-encode session_real_id into an integer SessionId column with
# scikit-learn's OrdinalEncoder.
# The same encoder object is fitted again for each frame, so test and train
# codes are assigned independently of each other, and `enc` ends up fitted on
# the train frame.
enc = OrdinalEncoder()

test_df_rn5l["SessionId"] = enc.fit_transform(
    test_df_rn5l[["session_real_id"]]
).astype(int)

train_df_rn5l["SessionId"] = enc.fit_transform(
    train_df_rn5l[["session_real_id"]]
).astype(int)

In [None]:
# Preview the first rows of the renamed and encoded test frame.
test_df_rn5l.head()

Unnamed: 0,UserId,ItemId,Time,session_real_id,SessionId
1,12899780,1142000,1661724000.0,12899780_0,0
2,12899780,582732,1661724000.0,12899780_0,0
3,12899780,973453,1661724000.0,12899780_0,0
4,12899780,736515,1661724000.0,12899780_0,0
6,12899781,141736,1661724000.0,12899781_0,1


## 2.Write data to start training

In [44]:
# Write the train frame, without the string session_real_id column, as a
# tab-separated file with a header row and no index column.
# The destination is built from DATA_PATH_RN5L (defined in the paths cell)
# instead of a user-specific absolute path.
train_df_rn5l.drop(columns=["session_real_id"]).to_csv(
    DATA_PATH_RN5L / "OTTO_train_full.txt",
    sep="\t",
    header=True,
    index=False,
)

In [45]:
# Write the test frame, without the string session_real_id column, as a
# tab-separated file with a header row and no index column.
# (No concatenation with train data happens here; only the test frame is written.)
# The destination is built from DATA_PATH_RN5L (defined in the paths cell)
# instead of a user-specific absolute path.
test_df_rn5l.drop(columns=["session_real_id"]).to_csv(
    DATA_PATH_RN5L / "OTTO_test.txt",
    sep="\t",
    header=True,
    index=False,
)

## 3.Prepare data for submission

In [6]:
# Reload the tab-separated test file written in section 2.
# The location is built from DATA_PATH_RN5L (defined in the paths cell)
# instead of a user-specific absolute path.
test_df_rn5l = pd.read_csv(DATA_PATH_RN5L / "OTTO_test.txt", sep="\t")

In [7]:
# Preview the first rows of the reloaded test file.
test_df_rn5l.head()

Unnamed: 0,UserId,ItemId,Time,SessionId
0,12899780,1142000,1661724000.0,0
1,12899780,582732,1661724000.0,0
2,12899780,973453,1661724000.0,0
3,12899780,736515,1661724000.0,0
4,12899781,141736,1661724000.0,1


In [26]:
# Load the predictions CSV from the session-rec results directory.
# A path relative to the notebook's working directory (same convention as the
# paths cell) replaces the user-specific absolute path.
results = pd.read_csv(
    Path("../models/session-rec/results/next/OTTO/test_OTTO_next_predictions.csv")
)

In [27]:
# Preview the first rows of the loaded predictions.
results.head()

Unnamed: 0,SessionId,Time,preds,preds_scores
0,0,1661724000.0,[ 736515 582732 1142000 1712906 973453 2090...,[784.52272037 132.43695238 45.9507786 37.25...
1,1,1661724000.0,[ 57315 1460571 194067 811371 959548 6205...,[606.54657613 45.6568043 38.56261552 33.55...
2,2,1662060000.0,[ 918667 199008 1681537 1628918 1422724 3740...,[320.39168046 75.54339211 61.5427205 26.97...
3,3,1661724000.0,[ 45034 1596098 603159 602722 817520 1206...,[245.94542946 28.64936242 20.63376626 11.94...
4,4,1661766000.0,[ 562753 476063 1037537 691809 1675581 3878...,[196.63537362 142.12531318 104.30972898 50.91...


In [28]:
# Show the raw `preds` value stored under index label 1.
results.loc[1, "preds"]

'[  57315 1460571  194067  811371  959548  620545  884993 1674500 1516952\n 1611581 1204405  551645  824944  102345 1148071 1837490  640599  944778\n  447645 1125095  399992  419287  724035 1215540 1299194 1066416 1359243\n  426980 1783610  109499  743168  141736  987059  517762 1853288    3378\n  331708  247240 1469891  518425  303808 1611455  150294 1675318 1370091\n 1066725 1302852 1065944  779056 1724971  580740 1848140  674590 1061776\n  428924  893268  880734  258651  675766 1669465 1019736 1777876  275288\n 1719446  886789  287154 1619579  959208  978558  150586 1652784 1796103\n 1647563  660451  249809  453029 1416497 1172231 1705470  974208 1192169\n 1340329 1206598  199409  624327  242852 1197632 1008624  127864 1658034\n 1611141 1820163  301629 1603001  124383 1621009 1116864  311255 1426624\n 1554664]'

In [29]:
# Column dtypes, non-null counts and memory usage of the predictions frame.
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874828 entries, 0 to 874827
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   SessionId     874828 non-null  int64  
 1   Time          874828 non-null  float64
 2   preds         874828 non-null  object 
 3   preds_scores  874828 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 26.7+ MB


In [30]:
# Attach UserId to the prediction rows with a left join on SessionId.
# NOTE(review): test_df_rn5l repeats each SessionId on several rows, so every
# prediction row is repeated once per matching row after this join.
results = results.merge(
    test_df_rn5l[["SessionId", "UserId"]],
    on="SessionId",
    how="left",
)
results.head()

Unnamed: 0,SessionId,Time,preds,preds_scores,UserId
0,0,1661724000.0,[ 736515 582732 1142000 1712906 973453 2090...,[784.52272037 132.43695238 45.9507786 37.25...,12899780
1,0,1661724000.0,[ 736515 582732 1142000 1712906 973453 2090...,[784.52272037 132.43695238 45.9507786 37.25...,12899780
2,0,1661724000.0,[ 736515 582732 1142000 1712906 973453 2090...,[784.52272037 132.43695238 45.9507786 37.25...,12899780
3,0,1661724000.0,[ 736515 582732 1142000 1712906 973453 2090...,[784.52272037 132.43695238 45.9507786 37.25...,12899780
4,1,1661724000.0,[ 57315 1460571 194067 811371 959548 6205...,[606.54657613 45.6568043 38.56261552 33.55...,12899781


In [31]:
# For every UserId keep a single row, the one with the largest Time:
# sort descending by (UserId, Time), then take the first row of each UserId group.
results_filtr = (
    results
    .sort_values(by=["UserId", "Time"], ascending=False)
    .groupby("UserId")
    .head(1)
)
results_filtr.head()

Unnamed: 0,SessionId,Time,preds,preds_scores,UserId
4516618,874827,1662329000.0,[1798580 1278671 1453906 1266807 1671725 4999...,[997.28726303 149.37172559 75.63969314 21.26...,14571548
4516616,874826,1662329000.0,[1117925 1546409 925950 1305137 1405405 1583...,[547.70462094 388.48679399 97.25091739 23.89...,14571547
4516613,874825,1662329000.0,[1152891 272221 279269 1116621 1684543 15544...,[587.23464376 385.25268329 61.93960383 15.39...,14571534
4516611,874824,1662329000.0,[ 893173 1255319 780305 584023 1449637 12042...,[40.58457468 6.91722999 5.85150206 4.891471...,14571533
4516609,874823,1662329000.0,[1515948 1132404 1179109 1673783 1735054 5309...,[259.01905752 227.60175684 34.56127565 30.56...,14571525


In [34]:
# Show the raw `preds` string of the first row of results_filtr.
# Positional access replaces the hard-coded index label 4516618, which only
# exists for one particular row layout of the merged `results` frame.
results_filtr["preds"].iloc[0]

'[1798580 1278671 1453906 1266807 1671725  499921  199126 1226391 1126038\n  650139  152563  134368 1833470  821746  576317  916266  434137  832087\n  359333 1107036 1775887 1720963 1475121 1554719  579118  668655  651346\n 1237607 1681316 1547060  131186 1799476  758877 1843528 1841138 1282828\n 1135356 1439896   70610  587956 1271649  300957  602491   10398   96165\n 1458977   54145 1022943 1204550 1830179  942323  975891 1037672 1086474\n  948581 1030939  705343 1029427  488216 1001375  886583 1678286 1107893\n 1751274 1542180  596780 1632700  927087 1384219 1754984 1302845  132616\n  154478 1203155  878948 1712751  337471 1724455 1760147 1361985  318629\n  325854   13552    9971 1036074  613512  517781  380216  153368  986275\n  559418  130704 1793171 1256425  597742 1852924 1493144  929742 1255099\n 1138662]'

In [35]:
# Column dtypes, non-null counts and memory usage of the per-user predictions.
results_filtr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 782816 entries, 4516618 to 0
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   SessionId     782816 non-null  int64  
 1   Time          782816 non-null  float64
 2   preds         782816 non-null  object 
 3   preds_scores  782816 non-null  object 
 4   UserId        782816 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 52.0+ MB


In [36]:
# Preview the first rows of the per-user predictions.
results_filtr.head()

Unnamed: 0,SessionId,Time,preds,preds_scores,UserId
4516618,874827,1662329000.0,[1798580 1278671 1453906 1266807 1671725 4999...,[997.28726303 149.37172559 75.63969314 21.26...,14571548
4516616,874826,1662329000.0,[1117925 1546409 925950 1305137 1405405 1583...,[547.70462094 388.48679399 97.25091739 23.89...,14571547
4516613,874825,1662329000.0,[1152891 272221 279269 1116621 1684543 15544...,[587.23464376 385.25268329 61.93960383 15.39...,14571534
4516611,874824,1662329000.0,[ 893173 1255319 780305 584023 1449637 12042...,[40.58457468 6.91722999 5.85150206 4.891471...,14571533
4516609,874823,1662329000.0,[1515948 1132404 1179109 1673783 1735054 5309...,[259.01905752 227.60175684 34.56127565 30.56...,14571525


In [37]:
# Build a one-column frame holding each distinct `session` value of test_df
# once, in order of first appearance, with a fresh 0..n-1 index.
submmission = (
    test_df["session"]
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
submmission.head()

Unnamed: 0,session
0,12899779
1,12899780
2,12899781
3,12899782
4,12899783


In [38]:
# Left-join the per-user predictions onto the session list (session == UserId),
# then drop the rows that contain NaN after the join.
# NOTE: the result is stored as `submmision`, a different spelling from the
# `submmission` frame it is built from; later cells read `submmision`.
submmision = submmission.merge(
    results_filtr[["UserId", "preds"]],
    how="left",
    left_on="session",
    right_on="UserId",
).dropna()
submmision.head()

Unnamed: 0,session,UserId,preds
1,12899780,12899780.0,[ 736515 582732 1142000 1712906 973453 2090...
2,12899781,12899781.0,[ 918667 199008 1681537 1628918 1422724 3740...
3,12899782,12899782.0,[1007613 1033148 383474 595994 479970 11124...
4,12899783,12899783.0,[ 607638 1754419 1216820 294573 351665 8871...
5,12899784,12899784.0,[1190477 1579935 1546830 22981 476216 6555...


In [47]:
# Reuse the same predictions for the three event types: clicks, carts, orders.
# Each frame is a new copy of `submmision` whose `session` value gets the
# suffix "_<type>". The suffix is appended with a vectorised string
# concatenation instead of a per-row lambda on a manual copy.
submission_clicks = submmision.assign(
    session=submmision["session"].astype(str) + "_clicks"
)
submission_carts = submmision.assign(
    session=submmision["session"].astype(str) + "_carts"
)
submission_orders = submmision.assign(
    session=submmision["session"].astype(str) + "_orders"
)

# Stack the three frames, keep the two submission columns and rename them.
submission_full = pd.concat(
    [submission_clicks, submission_carts, submission_orders], axis=0, ignore_index=True
)[["session", "preds"]].rename(
    columns={"session": "session_type", "preds": "labels"}
)
submission_full

Unnamed: 0,session_type,labels
0,12899780_clicks,[ 736515 582732 1142000 1712906 973453 2090...
1,12899781_clicks,[ 918667 199008 1681537 1628918 1422724 3740...
2,12899782_clicks,[1007613 1033148 383474 595994 479970 11124...
3,12899783_clicks,[ 607638 1754419 1216820 294573 351665 8871...
4,12899784_clicks,[1190477 1579935 1546830 22981 476216 6555...
...,...,...
2348443,14571525_orders,[1515948 1132404 1179109 1673783 1735054 5309...
2348444,14571533_orders,[ 893173 1255319 780305 584023 1449637 12042...
2348445,14571534_orders,[1152891 272221 279269 1116621 1684543 15544...
2348446,14571547_orders,[1117925 1546409 925950 1305137 1405405 1583...


In [48]:
# Column dtypes and memory usage of the stacked submission frame.
submission_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2348448 entries, 0 to 2348447
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   session_type  object
 1   labels        object
dtypes: object(2)
memory usage: 35.8+ MB


In [49]:
# Normalise each stringified prediction array to ids separated by one space.
# Brackets and commas are removed, then the text is split on any run of
# whitespace (spaces and "\n" alike) and re-joined. A single
# `replace('  ', ' ')` pass leaves runs of three or more spaces behind, and
# deleting "\n" outright only works while a pad space follows the newline.
submission_full["labels"] = (
    submission_full["labels"]
    .str.replace(r"[\[\],]", "", regex=True)
    .str.split()
    .str.join(" ")
)

submission_full.head()

Unnamed: 0,session_type,labels
0,12899780_clicks,736515 582732 1142000 1712906 973453 209046 17...
1,12899781_clicks,918667 199008 1681537 1628918 1422724 374037 5...
2,12899782_clicks,1007613 1033148 383474 595994 479970 1112446 1...
3,12899783_clicks,607638 1754419 1216820 294573 351665 887179 14...
4,12899784_clicks,1190477 1579935 1546830 22981 476216 655566 9...


In [52]:
# Turn each whitespace-separated label string into a list of integers.
submission_full["labels"] = submission_full["labels"].apply(
    lambda text: [int(token) for token in text.split()]
)

submission_full.head()


Unnamed: 0,session_type,labels
0,12899780_clicks,"[736515, 582732, 1142000, 1712906, 973453, 209..."
1,12899781_clicks,"[918667, 199008, 1681537, 1628918, 1422724, 37..."
2,12899782_clicks,"[1007613, 1033148, 383474, 595994, 479970, 111..."
3,12899783_clicks,"[607638, 1754419, 1216820, 294573, 351665, 887..."
4,12899784_clicks,"[1190477, 1579935, 1546830, 22981, 476216, 655..."


## 4.Write data to push to Kaggle

In [None]:
# Save the submission frame as a parquet file in the processed-data directory.
# The destination is built from DATA_PROCESSED_PATH (defined in the paths cell)
# instead of a user-specific absolute path.
submission_full.to_parquet(DATA_PROCESSED_PATH / "predictions_warm_vstan.parquet")