In [2]:
import pandas as pd
from pathlib import Path
import os
import random
import numpy as np
import json
from datetime import timedelta
from collections import Counter
from tqdm.notebook import tqdm
from heapq import nlargest

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder


In [3]:
### Paths ###

DATA_PATH = Path("../data")
TRAIN_RAW_PATH = DATA_PATH / "raw/train.jsonl"
TEST_RAW_PATH = DATA_PATH / "raw/test.jsonl"
SAMPLE_SUBMISSION_RAW_PATH = DATA_PATH / "raw/sample_submission.csv"

DATA_PROCESSED_PATH = DATA_PATH / "processed"

DATA_PATH_RN5L = Path("../models/session-rec/data/OTTO/prepared")

In [4]:
# read data
test_df = pd.read_parquet(
    "/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/data/processed/test_processed.parquet"
)
train_df = pd.read_parquet(
    "/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/data/processed/train_processed.parquet"
)

## 1.Data preparation

In [5]:
#drop duplicates in actions for an aid in session_real
#train from 216.7M to 167.6M
#test from 6.9M to 5.6M
test_df_filtered = test_df.drop_duplicates(subset=['session_real_id_encode', 'aid', 'type'])
train_df_filtered = train_df.drop_duplicates(subset=['session_real_id_encode', 'aid', 'type'])

In [21]:
#split test and train dataframes to cold and warm users
#train_df_filtered_cold 0.8M
#test_df_filtered_cold 0.88M
#test_df_filtered_warm 4.5M
#train_df_filtered_warm 147.9M


test_df_filtered_cold = test_df_filtered[test_df_filtered.num_session_real == 0][test_df_filtered.num_unique_aid == 1]
train_df_filtered_cold = train_df_filtered[train_df_filtered.num_session_real == 0][train_df_filtered.num_unique_aid == 1]

test_df_filtered_warm = test_df_filtered[test_df_filtered.num_unique_aid != 1]
train_df_filtered_warm = train_df_filtered[train_df_filtered.num_unique_aid != 1]

In [36]:
test_df_filtered_warm

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
1,12899780,1142000,1661724000378,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
2,12899780,582732,1661724058352,0,0,0,12899780_0,371,333.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
3,12899780,973453,1661724109199,0,0,0,12899780_0,94,87.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
4,12899780,736515,1661724136868,0,0,0,12899780_0,563,488.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6928072,14571534,1152891,1662328727586,0,0,0,14571534_0,183,149.0,0.0,...,4,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927162
6928086,14571547,1546409,1662328727386,0,0,0,14571547_0,363,352.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927175
6928087,14571547,1117925,1662328764818,0,0,0,14571547_0,144,137.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927175
6928088,14571548,1453906,1662328728006,0,0,0,14571548_0,183,180.0,0.0,...,2,2,2.0,0.0,0.0,2,2.0,0.0,0.0,1927176


In [38]:
test_df_rn5l = test_df_filtered_warm[['session', 'aid', 'ts', 'session_real_id']]
train_df_rn5l = train_df_filtered_warm[['session', 'aid', 'ts', 'session_real_id']]

In [40]:
# rename columns
test_df_rn5l = test_df_rn5l.rename(
    columns={
        "aid": "ItemId",
        "ts": "Time",
        "session": "UserId",
    }
)
train_df_rn5l = train_df_rn5l.rename(
    columns={
        "aid": "ItemId",
        "ts": "Time",
        "session": "UserId",
    }
)

In [41]:
test_df_rn5l["Time"] = test_df_rn5l["Time"].apply(lambda x: x / 1000)
train_df_rn5l["Time"] = train_df_rn5l["Time"].apply(lambda x: x / 1000)

In [42]:
# // ordinary encoding SessionId in tess_df_rn5l dataframe with scikit learn OrdinalEncoder
enc = OrdinalEncoder()

test_df_rn5l["SessionId"] = enc.fit_transform(test_df_rn5l[["session_real_id"]])
test_df_rn5l["SessionId"] = test_df_rn5l["SessionId"].astype(int)

train_df_rn5l['SessionId'] = enc.fit_transform(train_df_rn5l[['session_real_id']])
train_df_rn5l['SessionId'] = train_df_rn5l['SessionId'].astype(int)

In [43]:
test_df_rn5l.head()

Unnamed: 0,UserId,ItemId,Time,session_real_id,SessionId
1,12899780,1142000,1661724000.0,12899780_0,0
2,12899780,582732,1661724000.0,12899780_0,0
3,12899780,973453,1661724000.0,12899780_0,0
4,12899780,736515,1661724000.0,12899780_0,0
6,12899781,141736,1661724000.0,12899781_0,1


## 2.Write data to start a training

In [44]:
train_df_rn5l.drop(columns=["session_real_id"]).to_csv(
    "/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/models/session-rec/data/OTTO/prepared/OTTO_train_full.txt",
    sep="\t",
    header=True,
    index=False,
)

In [45]:
#// concat test and part of train dataframe with big session
test_df_rn5l.drop(columns=["session_real_id"]).to_csv(
    "/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/models/session-rec/data/OTTO/prepared/OTTO_test.txt",
    sep="\t",
    header=True,
    index=False,
)

In [33]:
train_df_rn5l.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147919992 entries, 0 to 216716095
Data columns (total 22 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   UserId                  int64  
 1   ItemId                  int64  
 2   Time                    float64
 3   type                    uint8  
 4   session_real            int8   
 5   num_session_real        int8   
 6   session_real_id         object 
 7   aid_count               int64  
 8   aid_count_type_0        float64
 9   aid_count_type_1        float64
 10  aid_count_type_2        float64
 11  num_session_real_id     int64  
 12  num_unique_aid          int64  
 13  num_unique_aid_0        float64
 14  num_unique_aid_1        float64
 15  num_unique_aid_2        float64
 16  num_unique_aid_user     int64  
 17  num_unique_aid_user_0   float64
 18  num_unique_aid_user_1   float64
 19  num_unique_aid_user_2   float64
 20  session_real_id_encode  int64  
 21  SessionId               int

In [27]:
train_df_rn5l['SessionId'].nunique()

43565109

## 3.Prepare data for submission

In [282]:
results = pd.read_csv('/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/models/session-rec/results/next/OTTO/test_OTTO_next_predictions.csv')

In [283]:
# join results with test_df_rn5l dataframe by SessionId
results = results.merge(test_df_rn5l[['SessionId', 'UserId']], left_on='SessionId', right_on='SessionId', how='left')
results.head()

Unnamed: 0,SessionId,Time,preds,preds_scores,UserId
0,1,1661724000.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],[10312.84275685 720.54101725 413.70612591 408.07558038\n 332.69212398 328.1202455 311.05663407 298.74976015\n 227.68516727 210.40743773 186.56495209 178.89037239\n 163.23830148 161.49399638 154.63440854 139.135942\n 138.14546423 119.8499242 115.57700855 115.20198206],12899780
1,1,1661724000.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],[10312.84275685 720.54101725 413.70612591 408.07558038\n 332.69212398 328.1202455 311.05663407 298.74976015\n 227.68516727 210.40743773 186.56495209 178.89037239\n 163.23830148 161.49399638 154.63440854 139.135942\n 138.14546423 119.8499242 115.57700855 115.20198206],12899780
2,1,1661724000.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],[10312.84275685 720.54101725 413.70612591 408.07558038\n 332.69212398 328.1202455 311.05663407 298.74976015\n 227.68516727 210.40743773 186.56495209 178.89037239\n 163.23830148 161.49399638 154.63440854 139.135942\n 138.14546423 119.8499242 115.57700855 115.20198206],12899780
3,1,1661724000.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],[10312.84275685 720.54101725 413.70612591 408.07558038\n 332.69212398 328.1202455 311.05663407 298.74976015\n 227.68516727 210.40743773 186.56495209 178.89037239\n 163.23830148 161.49399638 154.63440854 139.135942\n 138.14546423 119.8499242 115.57700855 115.20198206],12899780
4,1,1661724000.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],[10312.84275685 720.54101725 413.70612591 408.07558038\n 332.69212398 328.1202455 311.05663407 298.74976015\n 227.68516727 210.40743773 186.56495209 178.89037239\n 163.23830148 161.49399638 154.63440854 139.135942\n 138.14546423 119.8499242 115.57700855 115.20198206],12899780


In [None]:
#// create new enviroment in terminal
#// conda create -n session-rec python=3.7

In [284]:
# get rows for eache SessionId with max Time in results dataframe
results_filtr = results.sort_values(by=['UserId', 'Time'], ascending=False).groupby('UserId').head(1)
results_filtr.head()

Unnamed: 0,SessionId,Time,preds,preds_scores,UserId
5999585,1927176,1662329000.0,[1798580 1278671 1453906 821746 650139 199126 359333 916266 1001375\n 1036074 1646706 10398 1475121 1554719 1487580 1843528 674870 1138662\n 134368 668655],[3174.35659831 254.91364738 179.54204572 89.54329588 49.27946026\n 44.3150408 36.7905988 34.01144605 34.00111894 33.7859234\n 30.62213136 29.60096212 28.29012303 27.74089981 23.44425565\n 23.36157821 23.13242134 22.08099143 21.74477722 21.34935465],14571548
5999583,1927175,1662329000.0,[1117925 1546409 158321 925950 1405405 363573 137221 447692 1389725\n 1031206 1617606 719278 1010685 431309 1341519 1334193 336930 1067609\n 137728 65873],[1418.83787141 467.74095911 81.46170669 59.0186441 58.71764195\n 43.13524847 40.06795548 36.30589207 35.51755538 34.40894928\n 33.92177612 32.13343134 31.38476146 28.50240212 27.04450418\n 23.81526112 23.78649905 22.65805109 19.20361348 19.05212597],14571547
5999581,1927167,1662329000.0,[ 317311 171073 1504878 275687 1788265 1564473 610416 1661593 1068897\n 150188 839219 820406 24391 1158826 1818515 1029279 843075 205969\n 1243259 1329403],[3501.93248055 34.30037762 27.30760769 27.22771973 26.54141772\n 19.46079644 18.56690488 17.74518412 17.10937671 16.48921108\n 16.25761242 15.68857719 13.90329155 13.84867802 13.51804807\n 13.10060599 13.07464798 12.80415768 12.61029205 12.56025767],14571539
5999577,1927162,1662329000.0,[1152891 272221 279269 1421968 560418 1554434 1492293 1182750 1059463\n 566655 1662236 1695484 1530971 999871 1402783 1684543 1267464 709970\n 1763549 1132001],[1200.0681144 457.46329526 65.19551136 19.84639268 19.67390842\n 17.59356387 17.42070921 17.12675526 16.59986939 13.70117017\n 13.44012471 13.33219682 13.30148189 13.28707103 12.56048407\n 12.41337067 12.35552557 12.06094033 11.90813379 11.5695276 ],14571534
5999575,1927161,1662329000.0,[ 893173 229094 564523 1255319 445516 944041 1656492 471812 1513950\n 665705 1496885 1218350 81849 215934 528034 191782 332511 1789046\n 1841305 1673953],[111.25978846 41.0097419 6.4898089 5.71260012 4.42323406\n 3.624011 2.75572041 2.44641412 1.96760791 1.51795944\n 1.3911662 1.06056544 1.03452024 0.79959333 0.7220693\n 0.66297645 0.66042337 0.5518731 0.53273621 0.42036139],14571533


In [285]:
results_filtr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 886157 entries, 5999585 to 0
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   SessionId     886157 non-null  int64  
 1   Time          886157 non-null  float64
 2   preds         886157 non-null  object 
 3   preds_scores  886157 non-null  object 
 4   UserId        886157 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 40.6+ MB


In [286]:
#get list of uniq sessions
submmission = test_df[['session']].drop_duplicates().reset_index(drop=True)
submmission.head()

Unnamed: 0,session
0,12899779
1,12899780
2,12899781
3,12899782
4,12899783


In [287]:
submmision = submmission.merge(results_filtr[['UserId', 'preds']], left_on='session', right_on='UserId', how='left').dropna()
submmision.head()

Unnamed: 0,session,UserId,preds
1,12899780,12899780.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529]
2,12899781,12899781.0,[ 918667 199008 1836671 1628918 1422724 515373 528496 1119163 1681537\n 1121265 744051 1495531 428697 600579 824574 1102089 1446845 374360\n 1722991 1373992]
3,12899782,12899782.0,[1007613 1112446 595994 1344564 409367 413453 1727935 383474 335186\n 670423 1694535 1779730 1465743 606309 910942 365272 1007094 862294\n 892387 1302179]
4,12899783,12899783.0,[ 607638 1754419 486811 289363 1216820 850888 1731918 300127 1763662\n 963395 230028 37798 1088900 1451994 1396148 564341 1085748 511416\n 936734 1236114]
5,12899784,12899784.0,[1190477 22981 1546830 624476 29187 229155 1719401 476216 401010\n 1758193 345189 655566 988806 1187202 1579935 37695 346959 1130634\n 1747991 1335839]


In [288]:
#drop some simbols
submmision['preds_trunk'] = submmision['preds'].apply(lambda x: x.replace('[', ''))
submmision['preds_trunk'] = submmision['preds_trunk'].apply(lambda x: x.replace(']', ''))
submmision['preds_trunk'] = submmision['preds_trunk'].apply(lambda x: x.replace(',', ''))
submmision['preds_trunk'] = submmision['preds_trunk'].apply(lambda x: x.replace("\n", ''))
submmision['preds_trunk'] = submmision['preds_trunk'].apply(lambda x: x.lstrip())
submmision['preds_trunk'] = submmision['preds_trunk'].apply(lambda x: x.replace('  ', ' '))
submmision.head()

Unnamed: 0,session,UserId,preds,preds_trunk
1,12899780,12899780.0,[1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101\n 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323\n 1032776 1383529],1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323 1032776 1383529
2,12899781,12899781.0,[ 918667 199008 1836671 1628918 1422724 515373 528496 1119163 1681537\n 1121265 744051 1495531 428697 600579 824574 1102089 1446845 374360\n 1722991 1373992],918667 199008 1836671 1628918 1422724 515373 528496 1119163 1681537 1121265 744051 1495531 428697 600579 824574 1102089 1446845 374360 1722991 1373992
3,12899782,12899782.0,[1007613 1112446 595994 1344564 409367 413453 1727935 383474 335186\n 670423 1694535 1779730 1465743 606309 910942 365272 1007094 862294\n 892387 1302179],1007613 1112446 595994 1344564 409367 413453 1727935 383474 335186 670423 1694535 1779730 1465743 606309 910942 365272 1007094 862294 892387 1302179
4,12899783,12899783.0,[ 607638 1754419 486811 289363 1216820 850888 1731918 300127 1763662\n 963395 230028 37798 1088900 1451994 1396148 564341 1085748 511416\n 936734 1236114],607638 1754419 486811 289363 1216820 850888 1731918 300127 1763662 963395 230028 37798 1088900 1451994 1396148 564341 1085748 511416 936734 1236114
5,12899784,12899784.0,[1190477 22981 1546830 624476 29187 229155 1719401 476216 401010\n 1758193 345189 655566 988806 1187202 1579935 37695 346959 1130634\n 1747991 1335839],1190477 22981 1546830 624476 29187 229155 1719401 476216 401010 1758193 345189 655566 988806 1187202 1579935 37695 346959 1130634 1747991 1335839


In [289]:
#make the same predictions for clicks, carts and orders
submission_clicks = submmision.copy()
submission_carts = submmision.copy()
submission_orders = submmision.copy()

submission_clicks['session'] = submission_clicks['session'].apply(lambda x: str(x) + '_clicks')
submission_carts['session'] = submission_carts['session'].apply(lambda x: str(x) + '_carts')
submission_orders['session'] = submission_orders['session'].apply(lambda x: str(x) + '_orders')

submission_full = pd.concat([submission_clicks, submission_carts, submission_orders], axis=0)[['session', 'preds_trunk']]

submission_full = submission_full.rename(columns={'session': 'session_type', 'preds_trunk': 'labels'})
submission_full

Unnamed: 0,session_type,labels
1,12899780_clicks,1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323 1032776 1383529
2,12899781_clicks,918667 199008 1836671 1628918 1422724 515373 528496 1119163 1681537 1121265 744051 1495531 428697 600579 824574 1102089 1446845 374360 1722991 1373992
3,12899782_clicks,1007613 1112446 595994 1344564 409367 413453 1727935 383474 335186 670423 1694535 1779730 1465743 606309 910942 365272 1007094 862294 892387 1302179
4,12899783_clicks,607638 1754419 486811 289363 1216820 850888 1731918 300127 1763662 963395 230028 37798 1088900 1451994 1396148 564341 1085748 511416 936734 1236114
5,12899784_clicks,1190477 22981 1546830 624476 29187 229155 1719401 476216 401010 1758193 345189 655566 988806 1187202 1579935 37695 346959 1130634 1747991 1335839
...,...,...
1671754,14571533_orders,893173 229094 564523 1255319 445516 944041 1656492 471812 1513950 665705 1496885 1218350 81849 215934 528034 191782 332511 1789046 1841305 1673953
1671755,14571534_orders,1152891 272221 279269 1421968 560418 1554434 1492293 1182750 1059463 566655 1662236 1695484 1530971 999871 1402783 1684543 1267464 709970 1763549 1132001
1671760,14571539_orders,317311 171073 1504878 275687 1788265 1564473 610416 1661593 1068897 150188 839219 820406 24391 1158826 1818515 1029279 843075 205969 1243259 1329403
1671768,14571547_orders,1117925 1546409 158321 925950 1405405 363573 137221 447692 1389725 1031206 1617606 719278 1010685 431309 1341519 1334193 336930 1067609 137728 65873


In [290]:
submission_last = pd.read_csv('/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/notebooks/submission_mix_open_0578.csv.icloud')
submission_last.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/notebooks/submission_mix_open_0578.csv'

In [291]:
submission_new = pd.concat([submission_last, submission_full], ignore_index=True)
submission_new = submission_new.drop_duplicates(subset=['session_type'], keep='last', inplace=False)
submission_new.head()


Unnamed: 0,session_type,labels
1,14304314_orders,1739996 891179 946204 134121 1097507 143990 318328 1150072 378454 616986 64529 57540 6168 1262712 335261 106609 1181182 1436160 1520570 10123
4,12974731_carts,803928 1080683 1605824 524190 1253913 1101601 104998 1338099 1185084 159789 554660 538483 42628 765745 1116095 1610239 124887 558573 523198 146407
7,14122408_orders,1814759 1080017 1356826 708891 378327 658543 479111 1023766 924068 99462 1063846 80683 131858 1144289 458537 1202276 766615 447444 1576140 1183877
8,13346006_orders,364101 1009427 92614 1242375 521430 728774 1441471 33884 1730447 326687 1622133 1285872 918066 1306128 427007 1855113 1319645 1643164 1042854 1591828
12,12923143_carts,347271 132096 377679 359947 52261 1354917 372990 3746 8936 144514 141558 1739465 1636411 1634263 1459034 1383748 1246677 895967 698209 646961


In [292]:
submission_full

Unnamed: 0,session_type,labels
1,12899780_clicks,1142000 736515 1344758 889686 582732 77422 1419849 1263108 636101 1586171 1758603 1125638 1182614 1029150 1796048 19703 1502122 768323 1032776 1383529
2,12899781_clicks,918667 199008 1836671 1628918 1422724 515373 528496 1119163 1681537 1121265 744051 1495531 428697 600579 824574 1102089 1446845 374360 1722991 1373992
3,12899782_clicks,1007613 1112446 595994 1344564 409367 413453 1727935 383474 335186 670423 1694535 1779730 1465743 606309 910942 365272 1007094 862294 892387 1302179
4,12899783_clicks,607638 1754419 486811 289363 1216820 850888 1731918 300127 1763662 963395 230028 37798 1088900 1451994 1396148 564341 1085748 511416 936734 1236114
5,12899784_clicks,1190477 22981 1546830 624476 29187 229155 1719401 476216 401010 1758193 345189 655566 988806 1187202 1579935 37695 346959 1130634 1747991 1335839
...,...,...
1671754,14571533_orders,893173 229094 564523 1255319 445516 944041 1656492 471812 1513950 665705 1496885 1218350 81849 215934 528034 191782 332511 1789046 1841305 1673953
1671755,14571534_orders,1152891 272221 279269 1421968 560418 1554434 1492293 1182750 1059463 566655 1662236 1695484 1530971 999871 1402783 1684543 1267464 709970 1763549 1132001
1671760,14571539_orders,317311 171073 1504878 275687 1788265 1564473 610416 1661593 1068897 150188 839219 820406 24391 1158826 1818515 1029279 843075 205969 1243259 1329403
1671768,14571547_orders,1117925 1546409 158321 925950 1405405 363573 137221 447692 1389725 1031206 1617606 719278 1010685 431309 1341519 1334193 336930 1067609 137728 65873


In [293]:
submission_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5015409 entries, 1 to 7673879
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   session_type  object
 1   labels        object
dtypes: object(2)
memory usage: 114.8+ MB


## 4.Write data to push to Kaggle

In [294]:
submission_new.to_csv('/Users/Artem_Boltaev/Documents/EPAM Projects/7. RecSys_OTTO_Kaggle/source_code/otto_recsys_kaggle/models/session-rec/results/next/OTTO/submission_OTTO.csv.gz', index=False, compression="gzip", sep=",")

In [260]:
submission_new

Unnamed: 0,session_type,labels
0,14061476_carts,743431 84110 1267119 536718 1236804 1745348 765755 1639042 899855 835881 1738202 35357 872182 110731 942532 508758 206735 1694453 460141 1497238
1,14304314_orders,1739996 891179 946204 134121 1097507 143990 318328 1150072 378454 616986 64529 57540 6168 1262712 335261 106609 1181182 1436160 1520570 10123
3,13478260_orders,1438170 1410359 341459 1233775 92555 1272539 513463 407061 184463 333998 938013 910960 1800610 63477 663613 375018 41025 840744 1617917 1529045
4,12974731_carts,803928 1080683 1605824 524190 1253913 1101601 104998 1338099 1185084 159789 554660 538483 42628 765745 1116095 1610239 124887 558573 523198 146407
5,14421699_carts,648355 1651046 1245519 156122 432623 1037052 924454 1687395 981867 138357 393665 1605858 428215 433032 1682515 411098 1775021 519884 1698747 1087452
...,...,...
5407057,13143296_orders,1648004 681642 118673 1159768 1323825 481770 867325 573705 612148 962041 219532 607897 577729 976511 1599254 149791 672432 68553 759584 1779419
5407058,13143297_orders,1224175 1008133 321936 159327 580712 1159497 1345349 136239 1313761 1240837 247359 942149 1785135 1441150 920651 1076322 1762270 686683 1016140 1255685
5407059,13143299_orders,1814640 792515 419048 381764 1841712 967345 184358 175595 232327 132623 1167574 159044 1814945 846437 501634 57008 1757830 1291428 993077 1206103
5407060,13143300_orders,494411 578839 827664 1554559 1014597 1655163 742893 809835 1778341 1042816 880549 1349192 1004390 1288637 182034 192078 1009327 1305636 1594509 1226224


In [231]:
submission_last[submission_last['session_type'].str.contains('14061476')]


Unnamed: 0,session_type,labels
0,14061476_carts,743431 84110 1267119 536718 1236804 1745348 765755 1639042 899855 835881 1738202 35357 872182 110731 942532 508758 206735 1694453 460141 1497238
391657,14061476_clicks,1150361 926383 508758 1193644 846501 1098082 995910 35357 1015461 1267119 872182 84110 536718 1745348 1236804 765755 1639042 899855 835881 1738202
1678615,14061476_orders,692282 942532 232370 508758 995910 1015461 35357 1694453 872182 84110 1267119 743431 536718 1745348 1236804 765755 1639042 899855 835881 1738202


In [242]:
submission_new['labels'][5659613]

#// delete two space in the string if it exists


'846501 1098082 1193644 1613543 1328112 1812969 239937 1527132 926383 1150361 1002160 616656 123010 1337262 171073 1339928 1739184 525087 1027669 1267119'

In [227]:
#// get rows from submission_new dataframe with session_type contain 1244
submission_last[submission_last['session_type'].str.contains('14061476')]




Unnamed: 0,session_type,labels
0,14061476_carts,743431 84110 1267119 536718 1236804 1745348 765755 1639042 899855 835881 1738202 35357 872182 110731 942532 508758 206735 1694453 460141 1497238
391657,14061476_clicks,1150361 926383 508758 1193644 846501 1098082 995910 35357 1015461 1267119 872182 84110 536718 1745348 1236804 765755 1639042 899855 835881 1738202
1678615,14061476_orders,692282 942532 232370 508758 995910 1015461 35357 1694453 872182 84110 1267119 743431 536718 1745348 1236804 765755 1639042 899855 835881 1738202


In [243]:
submission_new_new = submission_new.copy()
submission_last_new = submission_last.copy()

In [244]:
#// delete rows from submission_last_new dataframe with session_type contain _carts or _clicks
submission_last_new = submission_last_new[~submission_last_new['session_type'].str.contains('_clicks')]

#// delete rows from submission_new_new dataframe with session_type contain _orders
submission_new_new = submission_new_new[~submission_new_new['session_type'].str.contains('_orders')]
submission_new_new = submission_new_new[~submission_new_new['session_type'].str.contains('_carts')]

#// concat submission_last_new and submission_new_new
submission_new_new = pd.concat([submission_last_new, submission_new_new], ignore_index=True)