## 0.Import modules

In [52]:
import pandas as pd

from pathlib import Path
import os
import random
import numpy as np
import json
from datetime import timedelta
from collections import Counter
from heapq import nlargest
from tqdm import tqdm
import gc

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OrdinalEncoder

In [53]:
#// increase widht of the columns
pd.set_option('display.max_colwidth', 2000)

## 1.Load data

In [54]:
### Paths ###
DATA_PATH = Path('../data')
TRAIN_RAW_PATH = DATA_PATH/'raw/train.jsonl'
TEST_RAW_PATH = DATA_PATH/'raw/test.jsonl'
SAMPLE_SUBMISSION_RAW_PATH = DATA_PATH/'raw/sample_submission.csv'

DATA_PROCESSED_PATH = DATA_PATH/'processed'

In [55]:
#// load data pandas
test_df = pd.read_parquet(DATA_PROCESSED_PATH/'test_processed.parquet')

In [56]:
test_df.head(10)

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
0,12899779,59625,1661724000278,0,0,0,12899779_0,1,1.0,0.0,...,1,1,1.0,0.0,0.0,1,1.0,0.0,0.0,0
1,12899780,1142000,1661724000378,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
2,12899780,582732,1661724058352,0,0,0,12899780_0,371,333.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
3,12899780,973453,1661724109199,0,0,0,12899780_0,94,87.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
4,12899780,736515,1661724136868,0,0,0,12899780_0,563,488.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
5,12899780,1142000,1661724155248,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
7,12899781,199008,1661724022851,0,0,3,12899781_0,437,400.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
8,12899781,57315,1661724170835,0,0,3,12899781_0,1105,1047.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
9,12899781,194067,1661724246188,0,0,3,12899781_0,40,38.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2


In [57]:
#Features in test dataset

#num_unique_aid_user - number inique aids in the histort for each user
#num_unique_aid_user_0 - number of unique aids in the history for user, action type 0
#num_unique_aid_user_1 - number of unique aids in the history for user, action type 1
#num_unique_aid_user_2 - number of unique aids in the history for user, action type 2

#num_unique_aid - number of unique aids in the history for each unique session. We have a number of session for each user
#num_unique_aid_0 - number of unique aids in the history for each unique session, action type 0
#num_unique_aid_1 - number of unique aids in the history for each unique session, action type 1
#num_unique_aid_2 - number of unique aids in the history for each unique session, action type 2

#num_session_real_id - number of actions in real session for each user

#aid_count - number of times each aid appears in the history for each user
#aid_count_0 - number of times each aid appears in the history for each user, action type 0
#aid_count_1 - number of times each aid appears in the history for each user, action type 1
#aid_count_2 - number of times each aid appears in the history for each user, action type 2

#session_real - 

#num_session_real - num real sessions for each user if 0 it is 1 session

In [58]:
test_df[test_df.session == 12899781]

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
7,12899781,199008,1661724022851,0,0,3,12899781_0,437,400.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
8,12899781,57315,1661724170835,0,0,3,12899781_0,1105,1047.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
9,12899781,194067,1661724246188,0,0,3,12899781_0,40,38.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
10,12899781,199008,1661780623778,0,1,3,12899781_1,437,400.0,0.0,...,3,1,1.0,0.0,0.0,5,5.0,0.0,0.0,3
11,12899781,199008,1661781274081,0,1,3,12899781_1,437,400.0,0.0,...,3,1,1.0,0.0,0.0,5,5.0,0.0,0.0,3
12,12899781,199008,1661781409993,1,1,3,12899781_1,437,0.0,33.0,...,3,1,0.0,1.0,0.0,5,0.0,1.0,0.0,3
13,12899781,199008,1661804151788,0,2,3,12899781_2,437,400.0,0.0,...,1,1,1.0,0.0,0.0,5,5.0,0.0,0.0,4
14,12899781,199008,1662060028567,0,3,3,12899781_3,437,400.0,0.0,...,3,2,2.0,0.0,0.0,5,5.0,0.0,0.0,5
15,12899781,199008,1662060064706,0,3,3,12899781_3,437,400.0,0.0,...,3,2,2.0,0.0,0.0,5,5.0,0.0,0.0,5


In [59]:
last_user_session = test_df[test_df.session == 12899781]['session_real_id_encode'].max()

In [60]:
list_unique_sessions_test = test_df['session'].unique()
list_unique_sessions_test

array([12899779, 12899780, 12899781, ..., 14571579, 14571580, 14571581])

In [61]:
# a bunch of functions to get the predictions from the history

def get_last_session_predictions(session_id, test):

    '''Parameters
        --------
        

        Returns
        --------
        list_prediction_last_session: list
            list of aids which appears in the last real session of user, aids sort by type and number of times they appear in the history
    '''
    test_df_session = test_df[test_df.session == session_id]
    test_df_session = test_df_session.sort_values(by=['ts'], ascending=False)

    last_user_session = test_df_session['session_real_id_encode'].max()
    test_df_session = test_df_session[test_df_session.session_real_id_encode == last_user_session]

    list_0 = test_df_session[test_df_session.type == 0]["aid"].to_list()
    list_1 = test_df_session[test_df_session.type == 1]["aid"].to_list()
    list_2 = test_df_session[test_df_session.type == 2]["aid"].to_list()

    list_0 = list(dict.fromkeys(list_0))
    list_1 = list(dict.fromkeys(list_1))
    list_2 = list(dict.fromkeys(list_2))

    list_prediction_last_session = [list_0,  list_1, list_2]
    
    return list_prediction_last_session


def get_all_session_predictions(session_id, test):

    '''Parameters
        --------
        

        Returns
        --------
        list_prediction_all_session: list
            list of aids which appears in the all real session of user, aids sort by type and number of times they appear in the history
    '''
    test_df_session = test_df[test_df.session == session_id]
    test_df_session = test_df_session.sort_values(by=['ts'], ascending=False)

    list_0 = test_df_session[test_df_session.type == 0]["aid"].to_list()
    list_1 = test_df_session[test_df_session.type == 1]["aid"].to_list()
    list_2 = test_df_session[test_df_session.type == 2]["aid"].to_list()

    list_0 = list(dict.fromkeys(list_0))
    list_1 = list(dict.fromkeys(list_1))
    list_2 = list(dict.fromkeys(list_2))

    list_prediction_all_session = [list_0,  list_1, list_2]
    
    return list_prediction_all_session


In [62]:
# get the predictions for last session
session_id_list = []
prediction_last_session_0 = []
prediction_last_session_1 = []
prediction_last_session_2 = []

for session_id in tqdm(list_unique_sessions_test):
    list_prediction_last_session = get_last_session_predictions(session_id, test_df)

    session_id_list.append(session_id)
    prediction_last_session_0.append(list_prediction_last_session[0])
    prediction_last_session_1.append(list_prediction_last_session[1])
    prediction_last_session_2.append(list_prediction_last_session[2])

100%|██████████| 1671803/1671803 [2:57:52<00:00, 156.64it/s]  


In [63]:
prediction_last_session_df = pd.DataFrame(
    {
        "session": session_id_list,
        "prediction_last_session_0": prediction_last_session_0,
        "prediction_last_session_1": prediction_last_session_1,
        "prediction_last_session_2": prediction_last_session_2,
    }
)
prediction_last_session_df.to_parquet(DATA_PROCESSED_PATH/'prediction_last_session_df.parquet')
prediction_last_session_df.head()

Unnamed: 0,session,prediction_last_session_0,prediction_last_session_1,prediction_last_session_2
0,12899779,[59625],[],[]
1,12899780,"[1142000, 736515, 973453, 582732]",[],[]
2,12899781,"[918667, 199008]",[],[]
3,12899782,"[740494, 229748, 530899, 1072927, 638410, 987399, 1099390, 889671, 834354, 406001, 975116, 476063, 723956, 654809, 829180, 363336, 794259, 1344773, 1352725]","[834354, 740494, 987399, 889671, 127404, 1711180, 1344773]","[1007613, 595994, 1033148, 834354, 479970, 1696036, 829180, 1669402]"
4,12899783,[1817895],[],[]
5,12899784,"[1190477, 22981, 1546830, 1579935, 476216, 1269952]",[],[]
6,12899785,"[1497876, 775584, 1179870, 453905, 258458, 383003, 160666, 41655, 1433061, 614626, 804799, 253080, 874493]",[],[]
7,12899786,[955252],[955252],[]
8,12899787,"[1024433, 1682750]",[1682750],[]
9,12899788,"[1663048, 1259911, 39846, 245131]",[],[]


In [67]:
#get predictions for all sessions
session_id_list = []
prediction_all_session_0 = []
prediction_all_session_1 = []
prediction_all_session_2 = []

for session_id in tqdm(list_unique_sessions_test):
    list_prediction_all_session = get_all_session_predictions(session_id, test_df)

    session_id_list.append(session_id)
    prediction_all_session_0.append(list_prediction_all_session[0])
    prediction_all_session_1.append(list_prediction_all_session[1])
    prediction_all_session_2.append(list_prediction_all_session[2])


100%|██████████| 1671803/1671803 [2:38:35<00:00, 175.70it/s]  


In [68]:
prediction_all_session_df = pd.DataFrame(
    {
        "session": session_id_list,
        "prediction_all_session_0": prediction_all_session_0,
        "prediction_all_session_1": prediction_all_session_1,
        "prediction_all_session_2": prediction_all_session_2,
    }
)
prediction_all_session_df.to_parquet(
    DATA_PROCESSED_PATH / "prediction_all_session_df.parquet"
)
prediction_all_session_df.head(10)

Unnamed: 0,session,prediction_all_session_0,prediction_all_session_1,prediction_all_session_2
0,12899779,[59625],[],[]
1,12899780,"[1142000, 736515, 973453, 582732]",[],[]
2,12899781,"[918667, 199008, 194067, 57315, 141736]",[199008],[]
3,12899782,"[740494, 229748, 530899, 1072927, 638410, 987399, 1099390, 889671, 834354, 406001, 975116, 476063, 723956, 654809, 829180, 363336, 794259, 1344773, 1352725, 1299062, 595994, 779477, 562753, 1037537, 413962, 603159, 45034, 1596098, 602722, 1674681, 1494780, 1669402]","[834354, 740494, 987399, 889671, 127404, 1711180, 1344773, 595994, 975116, 779477, 476063, 562753, 413962, 1494780]","[1007613, 595994, 1033148, 834354, 479970, 1696036, 829180, 1669402]"
4,12899783,"[1817895, 607638, 1754419, 1216820, 1729553, 300127, 198385, 255297, 1114789]",[],[]
5,12899784,"[1190477, 22981, 1546830, 1579935, 476216, 1269952, 1036375]",[],[]
6,12899785,"[1497876, 775584, 1179870, 453905, 258458, 383003, 160666, 41655, 1433061, 614626, 804799, 253080, 874493, 1397811, 219032, 1008570, 1784451, 1308865, 656708, 1169631, 10851, 1553332, 543919, 1071379, 552698, 1259418, 1154962, 587969, 51772, 903397, 544238, 1254866, 1444536, 179800, 95259, 608210, 970225]",[],[]
7,12899786,[955252],[955252],[]
8,12899787,"[1024433, 1682750]",[1682750],[]
9,12899788,"[1663048, 1259911, 39846, 245131]",[],[]
