## 0.Import modules

In [1]:
import pandas as pd

from pathlib import Path
import os
import random
import numpy as np
import json
from datetime import timedelta
from collections import Counter
from heapq import nlargest
from tqdm import tqdm
import gc

import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder

import multiprocess
from functools import partial

In [2]:
# Raise pandas' maximum displayed column width to 2000 characters.
pd.set_option("display.max_colwidth", 2000)

## 1.Load data

In [3]:
### Paths ###
# Root data directory, given relative to the notebook's working directory.
DATA_PATH = Path("../data")
# Files located under the "raw" sub-directory.
TRAIN_RAW_PATH = DATA_PATH / "raw/train.jsonl"
TEST_RAW_PATH = DATA_PATH / "raw/test.jsonl"
SAMPLE_SUBMISSION_RAW_PATH = DATA_PATH / "raw/sample_submission.csv"

# Directory the processed parquet files are read from and written to.
DATA_PROCESSED_PATH = DATA_PATH / "processed"

In [4]:
# Load the pre-processed train and test frames from parquet.
train_df = pd.read_parquet(DATA_PROCESSED_PATH / "train_processed.parquet")
test_df = pd.read_parquet(DATA_PROCESSED_PATH / "test_processed.parquet")

In [5]:
# Keep a single row per (real session, aid, action type) combination.
# Row counts noted by the author: train 216.7M -> 167.6M, test 6.9M -> 5.6M.
dedup_keys = ["session_real_id_encode", "aid", "type"]
train_df_filtered = train_df.drop_duplicates(subset=dedup_keys)
test_df_filtered = test_df.drop_duplicates(subset=dedup_keys)

## 2.Get recommendations for cold start users with 1 unique item

In [6]:
# Preview the first 10 rows of the processed test frame.
test_df.head(10)

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
0,12899779,59625,1661724000278,0,0,0,12899779_0,1,1.0,0.0,...,1,1,1.0,0.0,0.0,1,1.0,0.0,0.0,0
1,12899780,1142000,1661724000378,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
2,12899780,582732,1661724058352,0,0,0,12899780_0,371,333.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
3,12899780,973453,1661724109199,0,0,0,12899780_0,94,87.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
4,12899780,736515,1661724136868,0,0,0,12899780_0,563,488.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
5,12899780,1142000,1661724155248,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
7,12899781,199008,1661724022851,0,0,3,12899781_0,437,400.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
8,12899781,57315,1661724170835,0,0,3,12899781_0,1105,1047.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
9,12899781,194067,1661724246188,0,0,3,12899781_0,40,38.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2


In [7]:
# Features in test dataset

# num_unique_aid_user - number of unique aids in the history for each user
# num_unique_aid_user_0 - number of unique aids in the history for user, action type 0
# num_unique_aid_user_1 - number of unique aids in the history for user, action type 1
# num_unique_aid_user_2 - number of unique aids in the history for user, action type 2

# num_unique_aid - number of unique aids in the history for each unique session (each user can have several sessions)
# num_unique_aid_0 - number of unique aids in the history for each unique session, action type 0
# num_unique_aid_1 - number of unique aids in the history for each unique session, action type 1
# num_unique_aid_2 - number of unique aids in the history for each unique session, action type 2

# num_session_real_id - number of actions in real session for each user

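# aid_count - number of times each aid appears in the history for each user
# aid_count_type_0 - number of times each aid appears in the history for each user, action type 0
# aid_count_type_1 - number of times each aid appears in the history for each user, action type 1
# aid_count_type_2 - number of times each aid appears in the history for each user, action type 2
# NOTE(review): the preview above shows aid_count values such as 511 for a user with only a handful of
# actions, so these look like dataset-wide counts rather than per-user counts - confirm.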

# session_real - TODO: description missing in the original notes

# num_session_real - number of real sessions for each user; a value of 0 means the user has exactly 1 session

In [8]:
# Keep only the real sessions in which at least one "cold start" aid occurs.
# Row counts noted by the author:
#   train_test_df_trunc_sessions      172.4M
#   train_test_df_trunc_sessions_real 168.9M

# Cold start test users: a single real session (num_session_real == 0) that
# contains exactly one unique aid. Both conditions are combined into one mask
# instead of chaining two boolean selections, which would silently reindex the
# second mask and copy the intermediate frame.
single_item_mask = (test_df_filtered["num_session_real"] == 0) & (
    test_df_filtered["num_unique_aid"] == 1
)
unique_aids_for_one_session_test_users = test_df_filtered.loc[
    single_item_mask, "aid"
].unique()

# Select the four needed columns *before* concatenating so the wide feature
# columns of both frames are never copied.
# NOTE(review): the test preview shows session_real_id_encode starting at 0;
# confirm the encoding was fitted jointly on train and test, otherwise ids from
# the two frames collide after this concat.
trunc_columns = ["session", "aid", "type", "session_real_id_encode"]
train_test_df_trunc = pd.concat(
    [train_df_filtered[trunc_columns], test_df_filtered[trunc_columns]],
    ignore_index=True,
)

# Real sessions that contain at least one of the cold start aids.
unique_sessions_real_for_one_session_test_users = train_test_df_trunc.loc[
    train_test_df_trunc["aid"].isin(unique_aids_for_one_session_test_users),
    "session_real_id_encode",
].unique()

# All actions belonging to those sessions.
train_test_df_trunc_sessions_real = train_test_df_trunc[
    train_test_df_trunc["session_real_id_encode"].isin(
        unique_sessions_real_for_one_session_test_users
    )
]

In [9]:
# Number of distinct aids belonging to single-session, single-item test users.
len(unique_aids_for_one_session_test_users)

255247

In [10]:
# Number of distinct real sessions that contain at least one of those aids.
len(unique_sessions_real_for_one_session_test_users)

34839652

In [11]:
# Column dtypes, row count and memory usage of the session-filtered frame.
train_test_df_trunc_sessions_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162212245 entries, 0 to 173230747
Data columns (total 4 columns):
 #   Column                  Dtype
---  ------                  -----
 0   session                 int64
 1   aid                     int64
 2   type                    uint8
 3   session_real_id_encode  int64
dtypes: int64(3), uint8(1)
memory usage: 5.0 GB


In [12]:
# A bunch of useful functions


def get_cold_start_predictions(
    train_df,
    session_type,
    unique_aid_one_session_users,
):
    """Build the co-occurrence recommendations for every given cold start aid.

    Parameters
    ----------
    train_df : pd.DataFrame
        Interaction log passed through unchanged to ``get_aids_for_cold_aid``.
    session_type : str
        Name of the session column to group by
        ('session' or 'session_real_id_encode').
    unique_aid_one_session_users : iterable
        Aids to compute recommendations for.

    Returns
    -------
    pd.DataFrame
        The per-aid frames returned by ``get_aids_for_cold_aid`` stacked in
        input order with a fresh index; an empty DataFrame when no aids are
        given.
    """
    # Collect the small per-aid frames first and concatenate once: calling
    # pd.concat inside the loop re-copies the accumulated frame on every
    # iteration, which is quadratic in the number of aids.
    per_aid_frames = [
        get_aids_for_cold_aid(train_df, aid, session_type)
        for aid in tqdm(unique_aid_one_session_users)
    ]

    # pd.concat raises on an empty list, so keep the empty-frame result explicit.
    if not per_aid_frames:
        return pd.DataFrame()

    return pd.concat(per_aid_frames, ignore_index=True)


def get_aids_for_cold_aid(train_df, aid, session_type, top_n=100):
    """Rank the aids that co-occur with ``aid``, separately per action type.

    All sessions (column ``session_type``) containing ``aid`` are collected;
    within those sessions the aids are counted per action type (0, 1, 2) and
    the most frequent ones are returned. ``aid`` itself is not excluded.

    Parameters
    ----------
    train_df : pd.DataFrame
        Interaction log with at least the columns 'aid', 'type' and
        ``session_type``.
    aid : int
        The aid to find co-occurring aids for.
    session_type : str
        Session column to use: 'session' or 'session_real_id_encode'.
    top_n : int, default 100
        Maximum number of aids kept per action type.

    Returns
    -------
    pd.DataFrame
        Three rows (one per action type) with the columns 'aid', 'type' and
        'aids_for_cold_aid'; the latter holds a numpy array of at most
        ``top_n`` aids ordered by descending number of appearances.
    """
    sessions_for_aid = train_df.loc[train_df["aid"] == aid, session_type].unique()

    # Only the two columns needed for counting are kept from the matching rows.
    co_visited = train_df.loc[
        train_df[session_type].isin(sessions_for_aid), ["aid", "type"]
    ]

    action_types = [0, 1, 2]
    top_aids_per_type = []
    for action_type in action_types:
        # Count rows per aid directly instead of attaching a count column to a
        # filtered slice (SettingWithCopy) and sorting every row.
        appearances = (
            co_visited[co_visited["type"] == action_type]
            .groupby("aid", sort=False)
            .size()
            # A stable sort keeps the order of equally frequent aids deterministic.
            .sort_values(ascending=False, kind="stable")
        )
        top_aids_per_type.append(appearances.index.to_numpy()[:top_n])

    result = pd.DataFrame(
        {
            "aid": [aid] * len(action_types),
            "type": action_types,
            "aids_for_cold_aid": top_aids_per_type,
        }
    )

    return result


def add_elements_to_numpy_array(array1, array2):
    """Append to ``array1`` every element of ``array2`` it does not yet contain.

    Elements are appended in the order they occur in ``array2``; an element
    that has already been appended is not added a second time. ``array1`` is
    returned as-is when nothing needs to be added.
    """
    merged = array1
    for candidate in array2:
        if candidate in merged:
            continue
        merged = np.append(merged, candidate)
    return merged

def extract_co_ocurrences(
    train_test_df_trunc,
    unique_aids_for_one_session_test_users,
    session_type="session_real_id_encode",
    max_aids=10,
):
    """Collect, for each aid, all aids seen in the sessions that contain it.

    Parameters
    ----------
    train_test_df_trunc : pd.DataFrame
        Interaction log with the columns 'aid' and ``session_type``.
    unique_aids_for_one_session_test_users : sequence
        Aids to process.
    session_type : str, default 'session_real_id_encode'
        Session column used both to look up the sessions of an aid and to
        select the rows of those sessions. The previous implementation looked
        the ids up in 'session' but matched them against
        'session_real_id_encode', mixing two different id spaces.
    max_aids : int or None, default 10
        Only the first ``max_aids`` aids are processed (the previous
        hard-coded limit); pass ``None`` to process all of them.

    Returns
    -------
    dict
        Maps each processed aid to a numpy array with the 'aid' value of every
        row belonging to a session that contains it (duplicates are kept).
    """
    if max_aids is None:
        aids_to_process = unique_aids_for_one_session_test_users
    else:
        aids_to_process = unique_aids_for_one_session_test_users[:max_aids]

    co_ocurence_dict = {}

    for aid in tqdm(aids_to_process):
        sessions_for_aid = train_test_df_trunc.loc[
            train_test_df_trunc["aid"] == aid, session_type
        ].unique()

        in_sessions = train_test_df_trunc[session_type].isin(sessions_for_aid)

        co_ocurence_dict[aid] = train_test_df_trunc.loc[in_sessions, "aid"].values

    return co_ocurence_dict

In [13]:
# Get predictions for cold start users, one batch of aids per worker process.
num_of_processes = 4

# Only the aids from this offset onward are processed in this run.
part_start = 150000
batches = np.array_split(
    unique_aids_for_one_session_test_users[part_start:], num_of_processes
)

# Bind the frame and the session column; each worker then only receives its batch of aids.
predict_batch = partial(
    get_cold_start_predictions,
    train_test_df_trunc_sessions_real,
    "session_real_id_encode",
)

# The context manager releases the worker processes even when a batch raises;
# a bare pool.close() without join() left them to be cleaned up implicitly.
with multiprocess.Pool(processes=num_of_processes) as pool:
    result_list = pool.map(predict_batch, batches)

# Sequential alternative (no multiprocessing):
# cold_start_df_session_real = get_cold_start_predictions(
#     train_test_df_trunc_sessions_real,
#     "session_real_id_encode",
#     unique_aids_for_one_session_test_users,
# )

100%|██████████| 37500/37500 [19:44:37<00:00,  1.90s/it]     
100%|██████████| 37500/37500 [20:02:57<00:00,  1.92s/it]  
100%|██████████| 37500/37500 [20:10:30<00:00,  1.94s/it]
100%|██████████| 37500/37500 [20:23:16<00:00,  1.96s/it]


'cold_start_df_session_real = get_cold_start_predictions(\n    train_test_df_trunc_sessions_real,\n    unique_aids_for_one_session_test_users,\n    num_unique_aid,\n    "session_real_id_encode",\n    test_df,\n)'

In [14]:
# Every worker returned one DataFrame; stack them with a single concat instead
# of growing a frame inside a loop.
cold_start_df_session_real = pd.concat(result_list, ignore_index=True)

cold_start_df_session_real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   aid                450000 non-null  int64 
 1   type               450000 non-null  int64 
 2   aids_for_cold_aid  450000 non-null  object
dtypes: int64(2), object(1)
memory usage: 10.3+ MB




In [15]:
# Persist this run's predictions in the processed-data directory.
# NOTE(review): presumably "part2" refers to the aids from index 150000 onward
# processed above - confirm, and make sure the other part is saved separately.
cold_start_df_session_real.to_parquet(
    DATA_PROCESSED_PATH / "cold_start_df_session_real_part2.parquet"
)



In [None]:
# Work on a renamed copy so the frame that was just written to parquet keeps
# its original column names and this cell can be re-run safely.
# (The frame has no "session_type" column, so only the recommendations column
# needs renaming.)
cold_start = cold_start_df_session_real.rename(
    columns={"aids_for_cold_aid": "aids_for_cold_aid_real"}
)

cold_start.head()

Unnamed: 0,aid,type,aids_for_cold_aid_real
0,59625,0,"[59625, 737445, 438191, 1340695, 731692, 1253524, 941596, 1790770, 448688, 1667087, 475447, 1265085, 707616, 94230, 620510, 1138236, 1817286, 1415576, 967363, 273918, 707499, 1700255, 1081608, 1103941, 894169, 199422, 339846, 1422133, 1660529, 758199, 1202618, 697340, 1695265, 89997, 658421, 1491271, 435875, 370533, 1163588, 1410719, 1631559, 1014952, 920960, 425824, 1589312, 1582154, 1774847, 381972, 1852616, 535538, 1159379, 1630830, 1224704, 1464625, 1854220, 1072236, 1718550, 1545861, 581388, 1336097, 1034093, 1288826, 146176, 1031677, 973193, 1349921, 378851, 1149867, 387026, 1517085, 1204217, 225209, 984047, 830629, 499621, 1019683, 45290, 164098, 523135, 811662, 3295, 1505323, 1854910, 1606540, 985764, 637538, 535414, 361861, 679602, 790748, 849637, 687865, 718440, 1481388, 286535, 1804863, 302827, 1153899, 729915, 578742]"
1,59625,1,"[469285, 251302, 1130911, 1657590, 397451, 1493965, 689970, 1491271, 435875, 920960]"
2,59625,2,[]
3,955252,0,"[955252, 1632910, 554230, 98930, 1853605, 1792771, 1186873, 1218809, 962766, 1207659, 302745, 15983, 1250405, 1840048, 90173, 1736098, 543690, 940465, 777554, 28674, 213691, 152132, 908492, 1082228, 1593281, 1610349, 1320155, 1458791, 1687558, 679967, 1699291, 691417, 1644889, 285480, 28104, 1560710, 1442051, 1369591, 981872, 1348058, 81857, 853993, 491200, 1203425, 150533, 1149933, 453267, 1599889, 833245, 1555393, 570054, 1299839, 1014923, 794192, 59455, 1580498, 656475, 26525, 1263666, 606395, 518311, 1488154, 275725, 561063, 350891, 522206, 1563147, 136152, 815935, 997720, 1766301, 1797516, 1749694, 1679772, 678882, 1470301, 441090, 814450, 1460239, 1362711, 191012, 336903, 1395262, 1558673, 691777, 146256, 1600810, 1635094, 110579, 674494, 728700, 1036074, 438233, 1215103, 852817, 1255670, 318358, 908007, 1073616, 1855219]"
4,955252,1,"[955252, 1792771, 1593281, 1309503, 1348058, 28674, 997720, 940465, 1563147, 453267, 1149933, 1203425, 491200, 110579, 1840048, 678882, 543690, 852817, 554230, 1186873, 715146, 1245357, 1743402, 1615354, 108357, 1598882, 1855219, 908007, 1073616, 1218809]"


In [None]:
# Test sessions of cold start users: a single real session (num_session_real == 0)
# with exactly one unique aid. One combined mask replaces the chained boolean
# selection, which silently reindexed the second mask.
cold_start_mask = (test_df["num_session_real"] == 0) & (
    test_df["num_unique_aid"] == 1
)
cold_start_sessions = test_df.loc[cold_start_mask, ["session", "aid"]].drop_duplicates(
    subset=["session"]
)

cold_start_sessions

Unnamed: 0,session,aid
0,12899779,59625
207,12899786,955252
256,12899796,4503
291,12899800,609871
330,12899804,1558691
...,...,...
6928118,14571577,1141710
6928119,14571578,519105
6928120,14571579,739876
6928121,14571580,202353


In [None]:
def _sessions_with_recommendations(action_type, suffix):
    """Attach the recommendations of one action type to every cold start session.

    Left-merges ``cold_start_sessions`` with the rows of ``cold_start`` whose
    'type' equals ``action_type`` (sessions whose aid has no recommendations
    keep NaN) and adds a 'session_type' label '<session><suffix>'.
    """
    merged = cold_start_sessions.merge(
        cold_start[cold_start["type"] == action_type], on="aid", how="left"
    )
    # Vectorised string concatenation instead of a row-wise apply.
    merged["session_type"] = merged["session"].astype(str) + suffix
    return merged


cold_start_sessions_0 = _sessions_with_recommendations(0, "_clicks")
cold_start_sessions_1 = _sessions_with_recommendations(1, "_carts")
cold_start_sessions_2 = _sessions_with_recommendations(2, "_orders")

cold_start_sessions_full = pd.concat(
    [cold_start_sessions_0, cold_start_sessions_1, cold_start_sessions_2],
    ignore_index=True,
).sort_values(by=["session_type"])
cold_start_sessions_full

Unnamed: 0,session,aid,type,aids_for_cold_aid_real,session_type
839607,12899779,59625,1.0,"[469285, 251302, 1130911, 1657590, 397451, 1493965, 689970, 1491271, 435875, 920960]",12899779_carts
0,12899779,59625,0.0,"[59625, 737445, 438191, 1340695, 731692, 1253524, 941596, 1790770, 448688, 1667087, 475447, 1265085, 707616, 94230, 620510, 1138236, 1817286, 1415576, 967363, 273918, 707499, 1700255, 1081608, 1103941, 894169, 199422, 339846, 1422133, 1660529, 758199, 1202618, 697340, 1695265, 89997, 658421, 1491271, 435875, 370533, 1163588, 1410719, 1631559, 1014952, 920960, 425824, 1589312, 1582154, 1774847, 381972, 1852616, 535538, 1159379, 1630830, 1224704, 1464625, 1854220, 1072236, 1718550, 1545861, 581388, 1336097, 1034093, 1288826, 146176, 1031677, 973193, 1349921, 378851, 1149867, 387026, 1517085, 1204217, 225209, 984047, 830629, 499621, 1019683, 45290, 164098, 523135, 811662, 3295, 1505323, 1854910, 1606540, 985764, 637538, 535414, 361861, 679602, 790748, 849637, 687865, 718440, 1481388, 286535, 1804863, 302827, 1153899, 729915, 578742]",12899779_clicks
1679214,12899779,59625,2.0,[],12899779_orders
839608,12899786,955252,1.0,"[955252, 1792771, 1593281, 1309503, 1348058, 28674, 997720, 940465, 1563147, 453267, 1149933, 1203425, 491200, 110579, 1840048, 678882, 543690, 852817, 554230, 1186873, 715146, 1245357, 1743402, 1615354, 108357, 1598882, 1855219, 908007, 1073616, 1218809]",12899786_carts
1,12899786,955252,0.0,"[955252, 1632910, 554230, 98930, 1853605, 1792771, 1186873, 1218809, 962766, 1207659, 302745, 15983, 1250405, 1840048, 90173, 1736098, 543690, 940465, 777554, 28674, 213691, 152132, 908492, 1082228, 1593281, 1610349, 1320155, 1458791, 1687558, 679967, 1699291, 691417, 1644889, 285480, 28104, 1560710, 1442051, 1369591, 981872, 1348058, 81857, 853993, 491200, 1203425, 150533, 1149933, 453267, 1599889, 833245, 1555393, 570054, 1299839, 1014923, 794192, 59455, 1580498, 656475, 26525, 1263666, 606395, 518311, 1488154, 275725, 561063, 350891, 522206, 1563147, 136152, 815935, 997720, 1766301, 1797516, 1749694, 1679772, 678882, 1470301, 441090, 814450, 1460239, 1362711, 191012, 336903, 1395262, 1558673, 691777, 146256, 1600810, 1635094, 110579, 674494, 728700, 1036074, 438233, 1215103, 852817, 1255670, 318358, 908007, 1073616, 1855219]",12899786_clicks
...,...,...,...,...,...
839605,14571580,202353,,,14571580_clicks
2518819,14571580,202353,,,14571580_orders
1679213,14571581,1100210,,,14571581_carts
839606,14571581,1100210,,,14571581_clicks


In [None]:
# Flag the rows that have fewer than 20 recommended aids. Rows without any
# recommendations hold NaN after the left merge; NaN has no length, so those
# rows are flagged as well instead of raising a TypeError in len().
min_recommendations = 20
cold_start_sessions_full["flag"] = cold_start_sessions_full[
    "aids_for_cold_aid_real"
].map(
    lambda aids: int(
        not hasattr(aids, "__len__") or len(aids) < min_recommendations
    )
)
cold_start_sessions_full[cold_start_sessions_full.flag == 1]

NameError: name 'cold_start_sessions_full' is not defined

In [None]:
# Write the per-session cold start recommendations to the processed-data directory.
cold_start_sessions_full.to_parquet(
    DATA_PROCESSED_PATH / "prediction_for_cold_start_users.parquet"
)