## 0.Import modules

In [1]:
import pandas as pd

from pathlib import Path
import os
import random
import numpy as np
import json
from datetime import timedelta
from collections import Counter
from heapq import nlargest
from tqdm import tqdm
import gc

import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder

import multiprocess
from functools import partial

In [2]:
# Widen the column display so long list-valued cells are shown instead of being truncated.
pd.options.display.max_colwidth = 2000

## 1.Load data

In [3]:
### Paths ###
# Everything is resolved relative to the shared data directory.
DATA_PATH = Path("../data")

# Raw competition files.
TRAIN_RAW_PATH = DATA_PATH.joinpath("raw/train.jsonl")
TEST_RAW_PATH = DATA_PATH.joinpath("raw/test.jsonl")
SAMPLE_SUBMISSION_RAW_PATH = DATA_PATH.joinpath("raw/sample_submission.csv")

# Directory holding the pre-processed parquet files read and written below.
DATA_PROCESSED_PATH = DATA_PATH.joinpath("processed")

In [4]:
# Read the pre-processed train and test frames from parquet.
train_df = pd.read_parquet(DATA_PROCESSED_PATH.joinpath("train_processed.parquet"))
test_df = pd.read_parquet(DATA_PROCESSED_PATH.joinpath("test_processed.parquet"))

In [5]:
# Keep a single row per (real session, aid, action type) combination.
# Row counts recorded by the author:
#   train: 216.7M -> 167.6M
#   test:    6.9M ->   5.6M
train_df_filtered, test_df_filtered = (
    frame.drop_duplicates(subset=["session_real_id_encode", "aid", "type"])
    for frame in (train_df, test_df)
)

## 2.Get recommendations for cold start users with 1 unique item

In [6]:
# Preview the first ten rows of the test frame.
test_df.head(n=10)

Unnamed: 0,session,aid,ts,type,session_real,num_session_real,session_real_id,aid_count,aid_count_type_0,aid_count_type_1,...,num_session_real_id,num_unique_aid,num_unique_aid_0,num_unique_aid_1,num_unique_aid_2,num_unique_aid_user,num_unique_aid_user_0,num_unique_aid_user_1,num_unique_aid_user_2,session_real_id_encode
0,12899779,59625,1661724000278,0,0,0,12899779_0,1,1.0,0.0,...,1,1,1.0,0.0,0.0,1,1.0,0.0,0.0,0
1,12899780,1142000,1661724000378,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
2,12899780,582732,1661724058352,0,0,0,12899780_0,371,333.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
3,12899780,973453,1661724109199,0,0,0,12899780_0,94,87.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
4,12899780,736515,1661724136868,0,0,0,12899780_0,563,488.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
5,12899780,1142000,1661724155248,0,0,0,12899780_0,511,473.0,0.0,...,5,4,4.0,0.0,0.0,4,4.0,0.0,0.0,1
6,12899781,141736,1661724000559,0,0,3,12899781_0,253,231.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
7,12899781,199008,1661724022851,0,0,3,12899781_0,437,400.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
8,12899781,57315,1661724170835,0,0,3,12899781_0,1105,1047.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2
9,12899781,194067,1661724246188,0,0,3,12899781_0,40,38.0,0.0,...,4,4,4.0,0.0,0.0,5,5.0,0.0,0.0,2


In [7]:
# Features in test dataset

# num_unique_aid_user - number of unique aids in the history for each user
# num_unique_aid_user_0 - number of unique aids in the history for user, action type 0
# num_unique_aid_user_1 - number of unique aids in the history for user, action type 1
# num_unique_aid_user_2 - number of unique aids in the history for user, action type 2

# num_unique_aid - number of unique aids in the history for each unique session (each user can have several sessions)
# num_unique_aid_0 - number of unique aids in the history for each unique session, action type 0
# num_unique_aid_1 - number of unique aids in the history for each unique session, action type 1
# num_unique_aid_2 - number of unique aids in the history for each unique session, action type 2

# num_session_real_id - number of actions in real session for each user

# aid_count - number of times each aid appears in the history for each user
# aid_count_type_0 - number of times each aid appears in the history for each user, action type 0
# aid_count_type_1 - number of times each aid appears in the history for each user, action type 1
# aid_count_type_2 - number of times each aid appears in the history for each user, action type 2

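# session_real - index of the real session inside `session` (session_real_id looks like "<session>_<session_real>"; TODO confirm)

# num_session_real - number of real sessions for each user; a value of 0 means the user has a single real session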

In [8]:
# Collect the aids that belong to test users with exactly one unique aid in
# their history (num_unique_aid_user == 1) and a non-zero num_session_real.
# Row counts recorded by the author for the frames derived from this cell:
# train_test_df_trunc_sessions 172.4M
# train_test_df_trunc_sessions_real 168.9M

# Both conditions are combined into one mask built on the frame being indexed.
# The previous chained form `df[mask_a][mask_b]` applied a mask computed on the
# unfiltered frame to the already filtered one and relied on pandas silently
# re-aligning it (the resulting warning is hidden by warnings.filterwarnings).
one_aid_user_mask = (test_df_filtered["num_session_real"] != 0) & (
    test_df_filtered["num_unique_aid_user"] == 1
)
unique_aids_for_one_aid_test_users = test_df_filtered.loc[
    one_aid_user_mask, "aid"
].unique()

# Select the four needed columns before concatenating so the remaining feature
# columns of the two large frames are never copied.
trunc_columns = ["session", "aid", "type", "session_real_id_encode"]
train_test_df_trunc = pd.concat(
    [train_df_filtered[trunc_columns], test_df_filtered[trunc_columns]],
    ignore_index=True,
)



'unique_sessions_real_for_one_session_test_users = train_test_df_trunc[\n    train_test_df_trunc.aid.isin(unique_aids_for_one_session_test_users)\n]["session_real_id_encode"].unique()\n\n\ntrain_test_df_trunc_sessions_real = train_test_df_trunc[\n    train_test_df_trunc.session_real_id_encode.isin(\n        unique_sessions_real_for_one_session_test_users\n    )\n]'

In [10]:
# Keep only the real sessions that contain at least one aid from
# unique_aids_for_one_aid_test_users; every other session is dropped.
unique_sessions_real_for_one_session_test_users = train_test_df_trunc.loc[
    train_test_df_trunc["aid"].isin(unique_aids_for_one_aid_test_users),
    "session_real_id_encode",
].unique()

train_test_df_trunc_sessions_real = train_test_df_trunc.loc[
    train_test_df_trunc["session_real_id_encode"].isin(
        unique_sessions_real_for_one_session_test_users
    )
]

In [13]:
# A bunch of useful functions


def get_cold_start_predictions(
    train_df,
    session_type,
    unique_aid_one_session_users,
):
    """Build the cold-start candidate table for a collection of aids.

    Calls ``get_aids_for_cold_aid`` once per aid and stacks the per-aid frames.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Interaction log passed unchanged to ``get_aids_for_cold_aid``.
    session_type : str
        Name of the session column to group by
        ('session' or 'session_real_id_encode').
    unique_aid_one_session_users : iterable
        Aids to compute candidates for; iterated with a tqdm progress bar.

    Returns
    -------
    pandas.DataFrame
        Concatenation (with a fresh RangeIndex) of the per-aid frames, or an
        empty DataFrame when no aids are given.
    """
    # Collect the per-aid frames and concatenate once: calling pd.concat inside
    # the loop re-copies the accumulated frame on every iteration (quadratic).
    per_aid_frames = [
        get_aids_for_cold_aid(train_df, aid, session_type)
        for aid in tqdm(unique_aid_one_session_users)
    ]

    # pd.concat raises on an empty list; keep the empty-frame result instead.
    if not per_aid_frames:
        return pd.DataFrame()

    return pd.concat(per_aid_frames, ignore_index=True)


def _top_aids_by_frequency(actions, top_n):
    """Return up to ``top_n`` aids of ``actions`` ordered by descending row count."""
    # sort=False keeps the groups in order of first appearance and the stable
    # sort preserves that order among equal counts, so ties are deterministic.
    counts = actions.groupby("aid", sort=False).size()
    ranked = counts.sort_values(ascending=False, kind="stable")
    return ranked.index[:top_n].to_numpy()


def get_aids_for_cold_aid(train_df, aid, session_type, top_n=100):
    """Find the aids that most often share a session with ``aid``.

    All rows of the sessions that contain ``aid`` are selected; then, separately
    for each action type 0, 1 and 2, the aids are ranked by how many of those
    rows they appear in.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Interaction log with at least the columns 'aid', 'type' and
        ``session_type``. It is not modified.
    aid : scalar
        The aid to find companions for.
    session_type : str
        Session column to use: 'session' or 'session_real_id_encode'.
    top_n : int, default 100
        Maximum number of aids kept per action type.

    Returns
    -------
    pandas.DataFrame
        Three rows (one per action type) with columns 'aid', 'type' and
        'aids_for_cold_aid'; the last column holds a numpy array of at most
        ``top_n`` aids, most frequent first (empty when the selected sessions
        have no rows of that type).
    """
    sessions_for_aid = train_df.loc[train_df["aid"] == aid, session_type].unique()
    co_visited = train_df[train_df[session_type].isin(sessions_for_aid)]

    action_types = [0, 1, 2]
    # The counts are computed on a grouped view, so no helper column is written
    # onto a filtered slice of the caller's frame.
    top_aids_per_type = [
        _top_aids_by_frequency(co_visited[co_visited["type"] == action_type], top_n)
        for action_type in action_types
    ]

    result = pd.DataFrame(
        {
            "aid": [aid] * len(action_types),
            "type": action_types,
            "aids_for_cold_aid": top_aids_per_type,
        }
    )

    return result


def add_elements_to_numpy_array(array1, array2):
    """Append to ``array1`` each element of ``array2`` that it does not yet contain.

    Elements are examined in the order they occur in ``array2``; an element is
    appended (via ``np.append``) only if it is absent from the array built so
    far, so repeats inside ``array2`` are added once. ``array1`` itself is
    returned when nothing needs to be added.
    """
    merged = array1
    for candidate in array2:
        if candidate in merged:
            continue
        merged = np.append(merged, candidate)
    return merged

def extract_co_ocurrences(
    train_test_df_trunc,
    unique_aids_for_one_session_test_users,
    session_type="session_real_id_encode",
    max_aids=10,
):
    """Map each aid to the aids found in the sessions that contain it.

    Parameters
    ----------
    train_test_df_trunc : pandas.DataFrame
        Interaction log with an 'aid' column and the ``session_type`` column.
    unique_aids_for_one_session_test_users : sequence
        Aids to process; must support slicing.
    session_type : str, default 'session_real_id_encode'
        Session column used both to look up the sessions of an aid and to
        select the rows of those sessions.
    max_aids : int or None, default 10
        Only the first ``max_aids`` aids are processed (the original hard-coded
        limit); pass ``None`` to process all of them.

    Returns
    -------
    dict
        ``{aid: numpy array of the 'aid' values of every row in its sessions}``.
    """
    co_ocurence_dict = {}

    # Slicing with None keeps the whole sequence.
    for aid in tqdm(unique_aids_for_one_session_test_users[:max_aids]):

        # The session ids must come from the same column they are matched
        # against. Previously they were read from 'session' but compared with
        # 'session_real_id_encode', which is a different key space.
        sessions_for_aid = train_test_df_trunc.loc[
            train_test_df_trunc["aid"] == aid, session_type
        ].unique()

        in_sessions = train_test_df_trunc[session_type].isin(sessions_for_aid)

        co_ocurence_dict[aid] = train_test_df_trunc.loc[in_sessions, "aid"].values

    return co_ocurence_dict

In [14]:
# Compute the cold-start predictions in parallel: the aids are split into one
# batch per worker process and each worker runs get_cold_start_predictions on
# its batch against the real-session frame.
num_of_processes = 4
batches = np.array_split(unique_aids_for_one_aid_test_users, num_of_processes)

# The context manager shuts the worker processes down when the block exits,
# including when map() raises, so no workers are left behind. pool.map blocks
# until every batch is finished, so all results are collected before that.
with multiprocess.Pool(processes=num_of_processes) as pool:
    result_list = pool.map(
        partial(
            get_cold_start_predictions,
            train_test_df_trunc_sessions_real,
            "session_real_id_encode",
        ),
        batches,
    )

100%|██████████| 2802/2802 [43:07<00:00,  1.08it/s] 
100%|██████████| 2801/2801 [42:59<00:00,  1.09it/s]
100%|██████████| 2801/2801 [42:38<00:00,  1.09it/s]
100%|██████████| 2801/2801 [42:19<00:00,  1.10it/s]


In [15]:
# Stack the per-batch frames returned by the workers in a single concat
# (growing a frame inside a loop re-copies the accumulated rows each time).
cold_start_df_session_real = pd.concat(result_list, ignore_index=True)

cold_start_df_session_real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33615 entries, 0 to 33614
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   aid                33615 non-null  int64 
 1   type               33615 non-null  int64 
 2   aids_for_cold_aid  33615 non-null  object
dtypes: int64(2), object(1)
memory usage: 788.0+ KB




In [16]:
# Mark the candidate column as coming from the "real session" grouping.
# The frame only has the columns aid / type / aids_for_cold_aid, so the former
# "session_type" rename entry never matched anything and is dropped; the unused
# empty-frame initialisation of cold_start is dropped as well.
cold_start_df_session_real = cold_start_df_session_real.rename(
    columns={"aids_for_cold_aid": "aids_for_cold_aid_real"}
)

# Work on an independent copy so later edits do not touch the source frame.
cold_start = cold_start_df_session_real.copy()

cold_start.head()



Unnamed: 0,aid,type,aids_for_cold_aid_real
0,986164,0,"[986164, 584027, 274783, 508883, 881286, 647522, 688602, 634452, 1734305, 785712, 390163, 1811984, 1043508, 1776419, 1679224, 615771, 727928, 148534, 584064, 159361, 423037, 147526, 726978, 304579, 206418, 1010579, 1354002, 1811963, 1723679, 1167765, 982938, 1533737, 1217083, 752756, 383364, 713187, 1596300, 837248, 1342155, 1580544, 303600, 1779147, 703792, 431720, 406829, 1792271, 569494, 1263928, 1156699, 474743, 1384129, 1634917, 352756, 1460571, 510488, 577290, 1367804, 1842170, 536830, 1068136, 1015737, 130020, 102861, 397795, 783412, 214063, 1135650, 683401, 913754, 309445, 1553959, 1394787, 87573, 1723428, 289026, 1292042, 108125, 917461, 1263355, 102664, 1339717, 333066, 829797, 151698, 673435, 824580, 1464121, 1079588, 129170, 707225, 29735, 1531187, 378227, 1259472, 919744, 1279878, 857899, 799679, 880872, 332654]"
1,986164,1,"[986164, 508883, 274783, 647522, 634452, 688602, 1811984, 584027, 390163, 881286, 615771, 785712, 1734305, 1776419, 1043508, 1679224, 727928, 423037, 1723679, 159361, 148534, 383364, 752756, 1010579, 1779147, 206418, 147526, 726978, 673435, 214063, 1842170, 1533737, 1342155, 431720, 584064, 1811963, 352756, 1460571, 1015737, 703792, 406829, 1580544, 102861, 1367804, 151698, 304579, 1292042, 1247513, 1167765, 982938, 397795, 29735, 1394787, 569494, 683401, 913754, 1156699, 1596300, 1068136, 309445, 332654, 713187, 1354002, 1263928, 832192, 536830, 577290, 303600, 1217083, 378227, 87573, 474743, 1263355, 1553959, 1422497, 670006, 447645, 1802053, 857899, 783412, 1531187, 249300, 917461, 130020, 1733943, 289026, 298371, 880872, 1722991, 510488, 1469891, 1853288, 1384129, 1780794, 333066, 331708, 707225, 837248, 1785754, 1723428]"
2,986164,2,"[986164, 508883, 1811984, 634452, 688602, 274783, 584027, 881286, 785712, 615771, 673435, 1043508, 1723679, 647522, 159361, 383364, 423037, 1734305, 752756, 727928, 148534, 1811963, 1580544, 206418, 431720, 1247513, 1842170, 397795, 214063, 332654, 1460571, 1342155, 1015737, 1010579, 1292042, 1779147, 1367804, 447645, 331708, 1167765, 726978, 147526, 1533737, 982938, 703792, 1156699, 352756, 29735, 1776419, 406829, 304579, 913754, 584064, 670006, 309445, 1722991, 530377, 151698, 1263355, 289026, 917461, 857899, 577290, 1802053, 329725, 1853288, 1354002, 1553959, 1217083, 1681537, 150294, 683401, 1679224, 536830, 832192, 1264313, 707225, 1263928, 1102089, 1083665, 510488, 130020, 303600, 1469891, 474743, 1436280, 1384129, 249300, 224345, 1684387, 1785754, 824580, 518425, 1596300, 102861, 1225780, 111057, 811371, 1611581, 1336175]"
3,732299,0,"[732299, 3542, 1008624, 331708, 480443, 518425, 1109824, 759436, 1006790, 754412, 1242608, 1469891, 980990, 1853288, 1460571, 240796, 397477, 1536792, 295859, 782161, 491766, 921455, 867018, 1107961, 618888, 1239475, 701489, 1798272, 1790094, 218668, 613059, 1448097, 1776244, 1472402, 379160, 950718, 1043508, 1623921, 1307159, 606307, 1611581, 858072, 447, 620545, 811084, 1513669, 295531, 901817, 96064, 836707, 734026, 35615, 43313, 713763, 1833745, 1399384, 961277, 811371, 1750274, 751510, 1621009, 102345, 1019736, 139752, 925086, 1201859, 1608431, 1674500, 1065944, 453029, 651938, 983816, 282982, 1767530, 1178629, 1310977, 150294, 1473990, 1444829, 944778, 199008, 670006, 1783610, 447645, 1849432, 1244863, 119874, 1248748, 634452, 1645651, 1148071, 21517, 1798581, 1287659, 824944, 608965, 1272792, 1197632, 1839414, 724035]"
4,732299,1,"[732299, 1008624, 3542, 331708, 759436, 1043508, 1109824, 1006790, 518425, 1469891, 491766, 1853288, 811084, 1460571, 480443, 754412, 701489, 867018, 282982, 606307, 613059, 1242608, 634452, 980990, 1536792, 1513669, 950718, 1798581, 379160, 295859, 782161, 1473990, 618888, 858072, 1790094, 1248748, 1444829, 199008, 1192169, 1776244, 843000, 1839414, 1611581, 700102, 1640219, 1767530, 986164, 1399384, 670006, 831165, 1310977, 1533519, 1068136, 1336175, 713763, 982423, 1514371, 897197, 150294, 925086, 1201859, 1623921, 493104, 1448097, 811371, 29650, 1558724, 119874, 1406660, 921455, 1239475, 1608431, 836707, 754963, 1750274, 570922, 1664927, 249809, 333908, 1747983, 1053786, 651938, 981831, 1206294, 218668, 589670, 1417655, 1401342, 1645651, 378404, 1307159, 77846, 1069146, 901817, 561772, 1596292, 373050, 43313, 1370091, 845181]"


In [18]:
# One (session, aid) row per test session whose user has exactly one unique aid
# (num_unique_aid_user == 1) and a non-zero num_session_real.
# The two conditions are combined into a single mask on test_df instead of
# chaining df[mask_a][mask_b], which relied on pandas silently re-aligning a
# mask built on the unfiltered frame.
cold_start_user_mask = (test_df["num_session_real"] != 0) & (
    test_df["num_unique_aid_user"] == 1
)
cold_start_sessions = test_df.loc[cold_start_user_mask].drop_duplicates(
    subset=['session']
)[['session', 'aid']]

cold_start_sessions

Unnamed: 0,session,aid
477,12899820,986164
3040,12899992,732299
4652,12900145,1368314
4676,12900154,406905
5775,12900368,565065
...,...,...
6761372,14521841,298320
6761635,14521925,291494
6763520,14522507,870735
6773892,14525282,1104957


In [19]:
def _attach_predictions_for_type(sessions, predictions, action_type, suffix):
    """Left-join the predictions of one action type and add the ``session_type`` key.

    ``session_type`` is the session id rendered as text followed by ``suffix``.
    """
    merged = sessions.merge(
        predictions[predictions.type == action_type], on='aid', how='left'
    )
    # Vectorised string concatenation instead of a row-wise apply().
    merged['session_type'] = merged['session'].astype(str) + suffix
    return merged


# One frame per action type: 0 -> clicks, 1 -> carts, 2 -> orders.
cold_start_sessions_0 = _attach_predictions_for_type(cold_start_sessions, cold_start, 0, '_clicks')
cold_start_sessions_1 = _attach_predictions_for_type(cold_start_sessions, cold_start, 1, '_carts')
cold_start_sessions_2 = _attach_predictions_for_type(cold_start_sessions, cold_start, 2, '_orders')

# Stack the three frames and order the rows by the "<session>_<action>" key.
cold_start_sessions_full = pd.concat([cold_start_sessions_0, cold_start_sessions_1, cold_start_sessions_2], ignore_index=True)
cold_start_sessions_full = cold_start_sessions_full.sort_values(by=['session_type'])
cold_start_sessions_full

Unnamed: 0,session,aid,type,aids_for_cold_aid_real,session_type
15456,12899820,986164,1,"[986164, 508883, 274783, 647522, 634452, 688602, 1811984, 584027, 390163, 881286, 615771, 785712, 1734305, 1776419, 1043508, 1679224, 727928, 423037, 1723679, 159361, 148534, 383364, 752756, 1010579, 1779147, 206418, 147526, 726978, 673435, 214063, 1842170, 1533737, 1342155, 431720, 584064, 1811963, 352756, 1460571, 1015737, 703792, 406829, 1580544, 102861, 1367804, 151698, 304579, 1292042, 1247513, 1167765, 982938, 397795, 29735, 1394787, 569494, 683401, 913754, 1156699, 1596300, 1068136, 309445, 332654, 713187, 1354002, 1263928, 832192, 536830, 577290, 303600, 1217083, 378227, 87573, 474743, 1263355, 1553959, 1422497, 670006, 447645, 1802053, 857899, 783412, 1531187, 249300, 917461, 130020, 1733943, 289026, 298371, 880872, 1722991, 510488, 1469891, 1853288, 1384129, 1780794, 333066, 331708, 707225, 837248, 1785754, 1723428]",12899820_carts
0,12899820,986164,0,"[986164, 584027, 274783, 508883, 881286, 647522, 688602, 634452, 1734305, 785712, 390163, 1811984, 1043508, 1776419, 1679224, 615771, 727928, 148534, 584064, 159361, 423037, 147526, 726978, 304579, 206418, 1010579, 1354002, 1811963, 1723679, 1167765, 982938, 1533737, 1217083, 752756, 383364, 713187, 1596300, 837248, 1342155, 1580544, 303600, 1779147, 703792, 431720, 406829, 1792271, 569494, 1263928, 1156699, 474743, 1384129, 1634917, 352756, 1460571, 510488, 577290, 1367804, 1842170, 536830, 1068136, 1015737, 130020, 102861, 397795, 783412, 214063, 1135650, 683401, 913754, 309445, 1553959, 1394787, 87573, 1723428, 289026, 1292042, 108125, 917461, 1263355, 102664, 1339717, 333066, 829797, 151698, 673435, 824580, 1464121, 1079588, 129170, 707225, 29735, 1531187, 378227, 1259472, 919744, 1279878, 857899, 799679, 880872, 332654]",12899820_clicks
30912,12899820,986164,2,"[986164, 508883, 1811984, 634452, 688602, 274783, 584027, 881286, 785712, 615771, 673435, 1043508, 1723679, 647522, 159361, 383364, 423037, 1734305, 752756, 727928, 148534, 1811963, 1580544, 206418, 431720, 1247513, 1842170, 397795, 214063, 332654, 1460571, 1342155, 1015737, 1010579, 1292042, 1779147, 1367804, 447645, 331708, 1167765, 726978, 147526, 1533737, 982938, 703792, 1156699, 352756, 29735, 1776419, 406829, 304579, 913754, 584064, 670006, 309445, 1722991, 530377, 151698, 1263355, 289026, 917461, 857899, 577290, 1802053, 329725, 1853288, 1354002, 1553959, 1217083, 1681537, 150294, 683401, 1679224, 536830, 832192, 1264313, 707225, 1263928, 1102089, 1083665, 510488, 130020, 303600, 1469891, 474743, 1436280, 1384129, 249300, 224345, 1684387, 1785754, 824580, 518425, 1596300, 102861, 1225780, 111057, 811371, 1611581, 1336175]",12899820_orders
15457,12899992,732299,1,"[732299, 1008624, 3542, 331708, 759436, 1043508, 1109824, 1006790, 518425, 1469891, 491766, 1853288, 811084, 1460571, 480443, 754412, 701489, 867018, 282982, 606307, 613059, 1242608, 634452, 980990, 1536792, 1513669, 950718, 1798581, 379160, 295859, 782161, 1473990, 618888, 858072, 1790094, 1248748, 1444829, 199008, 1192169, 1776244, 843000, 1839414, 1611581, 700102, 1640219, 1767530, 986164, 1399384, 670006, 831165, 1310977, 1533519, 1068136, 1336175, 713763, 982423, 1514371, 897197, 150294, 925086, 1201859, 1623921, 493104, 1448097, 811371, 29650, 1558724, 119874, 1406660, 921455, 1239475, 1608431, 836707, 754963, 1750274, 570922, 1664927, 249809, 333908, 1747983, 1053786, 651938, 981831, 1206294, 218668, 589670, 1417655, 1401342, 1645651, 378404, 1307159, 77846, 1069146, 901817, 561772, 1596292, 373050, 43313, 1370091, 845181]",12899992_carts
1,12899992,732299,0,"[732299, 3542, 1008624, 331708, 480443, 518425, 1109824, 759436, 1006790, 754412, 1242608, 1469891, 980990, 1853288, 1460571, 240796, 397477, 1536792, 295859, 782161, 491766, 921455, 867018, 1107961, 618888, 1239475, 701489, 1798272, 1790094, 218668, 613059, 1448097, 1776244, 1472402, 379160, 950718, 1043508, 1623921, 1307159, 606307, 1611581, 858072, 447, 620545, 811084, 1513669, 295531, 901817, 96064, 836707, 734026, 35615, 43313, 713763, 1833745, 1399384, 961277, 811371, 1750274, 751510, 1621009, 102345, 1019736, 139752, 925086, 1201859, 1608431, 1674500, 1065944, 453029, 651938, 983816, 282982, 1767530, 1178629, 1310977, 150294, 1473990, 1444829, 944778, 199008, 670006, 1783610, 447645, 1849432, 1244863, 119874, 1248748, 634452, 1645651, 1148071, 21517, 1798581, 1287659, 824944, 608965, 1272792, 1197632, 1839414, 724035]",12899992_clicks
...,...,...,...,...,...
15454,14525282,1104957,0,"[1104957, 347739, 565155, 1089769, 252277, 1569772, 770856, 1531151, 38033, 1489129, 1063803, 1790272, 1016163, 505007, 838161, 282175, 874481, 1571375, 924856, 1598376, 1063737, 1159240, 1300394, 955534, 1469989, 1772570, 1064970, 31748, 948957, 43813, 1205251, 1670259, 449938, 603392, 1096467, 1393635, 829026, 1460428, 1398136, 735114, 1431714, 938256, 554220, 1222448, 119992, 1250072, 1752011, 269435, 1043561, 1209127, 315265, 18774, 1462619, 376305, 588380, 1362590, 23831, 884509, 1850348, 462651, 765639, 1782412, 125405, 742294, 604004, 551866, 913732, 1393100, 1038985, 238218, 512828, 1108684, 1039051, 632899, 1626909, 486180, 1600883, 284042, 1415731, 245188, 1664622, 976746, 449053, 887006, 1380529, 1796234, 1575795, 1371576, 443040, 1771174, 1240328, 353066, 255978, 1767167, 1550570, 1144893, 958496, 755533, 1429286, 922482]",14525282_clicks
46366,14525282,1104957,2,"[938256, 1754075]",14525282_orders
30911,14527480,443659,1,"[443659, 507514, 1105983, 1059983, 1320998, 1511697, 1206175, 1520871, 1103158, 1157982, 1829584, 18232, 881858, 318610, 358911, 1307056, 167005, 923948, 1700357, 166037, 789186, 210208, 559216, 477595, 1185039, 512474, 904917, 1458904, 1350171, 56279, 279167, 1102222, 934663, 99429, 1107572, 50609, 385064, 145332, 669252, 839956, 1022566, 544144, 191178, 60528, 258353, 777212, 444084, 1125519, 1804558, 495275, 1691940, 68160, 537466, 164075, 29735, 182350, 1166587, 1796489, 1379013, 1817367, 1196256, 1006198, 825958, 124343, 51010, 1757467, 1407803, 1006218, 495025, 443425, 938646, 1116864, 1512879, 1836774, 893912, 77422, 1512038, 1127565, 563229, 1733943, 500609, 1088552, 992513, 781512, 1804788, 1190361, 1000951, 1825743, 805848, 352192, 1608085, 1390082, 311808, 1587347, 1164969, 637429, 1030727, 1607187, 502010, 439860]",14527480_carts
15455,14527480,443659,0,"[443659, 507514, 1105983, 1059983, 1520871, 1511697, 99429, 1103158, 1206175, 1320998, 881858, 167005, 318610, 385064, 1829584, 18232, 934663, 1102222, 1157982, 1307056, 279167, 1185039, 1350171, 512474, 559216, 166037, 789186, 904917, 1700357, 145332, 182350, 669252, 50609, 839956, 923948, 191178, 637429, 1691940, 1102821, 495275, 1022566, 1390082, 1121939, 56279, 210208, 1458904, 502010, 1261935, 537466, 242276, 477595, 1804558, 68160, 544144, 358911, 401955, 899213, 1515198, 1107572, 781512, 1607187, 1612828, 1125519, 777212, 825958, 124343, 447619, 1000951, 439860, 74892, 1205070, 547856, 60528, 483455, 1609228, 744861, 1321198, 448043, 1166587, 824978, 1472383, 361806, 1739471, 363250, 662455, 622619, 1817367, 1313446, 1831569, 640702, 1310883, 753507, 560112, 1262120, 10533, 1060012, 352564, 1033629, 965499, 1757467]",14527480_clicks


In [20]:
# Flag the rows that have fewer than MIN_CANDIDATES candidate aids.
# NOTE(review): 20 is presumably the number of recommendations required per
# session/type - confirm.
MIN_CANDIDATES = 20

# Count the candidates column-wise (no row-wise apply). Rows without a matched
# prediction hold NaN after the left merge; they count as 0 candidates and are
# flagged instead of raising TypeError in len().
candidate_counts = cold_start_sessions_full["aids_for_cold_aid_real"].map(
    lambda aids: len(aids) if hasattr(aids, "__len__") else 0
)
cold_start_sessions_full["flag"] = (candidate_counts < MIN_CANDIDATES).astype("int64")
cold_start_sessions_full[cold_start_sessions_full.flag == 1]

Unnamed: 0,session,aid,type,aids_for_cold_aid_real,session_type,flag
15459,12900154,406905,1,"[406905, 822736, 742592, 1052922, 712347, 477541, 1004179]",12900154_carts,1
30915,12900154,406905,2,"[712347, 406905]",12900154_orders,1
15467,12900837,1673233,1,"[1673233, 584853, 1403568, 1814519, 1408369, 1050029, 1148606, 185306]",12900837_carts,1
30923,12900837,1673233,2,[1408369],12900837_orders,1
30925,12901014,766301,2,"[766301, 640937, 1673387, 1242486, 359341, 1339785, 228514, 1782145, 1332062, 213785, 1594027, 8876, 1522864, 1358973, 31437, 604416]",12901014_orders,1
...,...,...,...,...,...,...
46350,14503063,1124877,2,[],14503063_orders,1
46351,14505087,96095,2,"[96095, 722249, 1114632, 487902, 966643, 314210, 1156699]",14505087_orders,1
46356,14506956,1253968,2,"[1253968, 973112, 1635668, 525699, 535575, 1127979, 964784, 1111775, 982364]",14506956_orders,1
46359,14518378,743634,2,"[945877, 1456049, 1686734, 713923, 895763, 811179, 868327, 126548, 588946, 219798, 1760550, 195230, 1007550, 189774, 1035396, 1383779, 304383, 986335, 145806]",14518378_orders,1


In [21]:
# Persist the cold-start predictions next to the other processed files.
cold_start_sessions_full.to_parquet(
    path=DATA_PROCESSED_PATH.joinpath("prediction_for_cold_start_users_ad.parquet")
)