In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_type_feature(type_str):
    """
    input : type field of df_train_tracking
    output : 2 one-hot-encoded vectors (page, event)
    
    """
    page_type = ['PA', 'LP', 'LR', 'CAROUSEL', 'SHOW_CASE']
    event_type = ['ADD_TO_BASKET', 'PURCHASE_PRODUCT', 'PRODUCT']
    
    page_vec = [0]*len(page_type)
    event_vec = [0]*len(event_type)
    
    indeces_page = [i for i,elem in enumerate(page_type) if elem in type_str]
    indeces_event = [i for i,elem in enumerate(event_type) if elem in type_str]
    
    if indeces_page:
        page_vec[indeces_page[0]] = 1
    
    if indeces_event:
        event_vec[indeces_event[0]] = 1
    
    # return 2 hot encoded vectors
    return page_vec, event_vec

In [3]:
def matr_to_list(l, op = np.add):
    
    res_page = np.zeros(5)
    res_event = np.zeros(3)
    
    for oh_page, oh_event in l:
        res_page = op(oh_page, res_page)
        res_event = op(oh_event, res_event)
        
    return np.append(res_page, res_event)

In [4]:
def process_string (s):
    s.replace("SEARCH", "LP")
    s.replace("LIST_PRODUCT", "LP")
    return s

In [5]:
tra = pd.read_csv("data/train_tracking.csv")
sess = pd.read_csv("data/train_session.csv")

In [6]:
sess.head()

Unnamed: 0,sid,target
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,False
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,False
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,False
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,False
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,False


In [7]:
sess.describe()

Unnamed: 0,sid,target
count,133123,133123
unique,133123,2
top,BLArHPgxLx3Vc/zKb8hoVC0lrpD5WTr9e2DjD1UYUGI3MC...,False
freq,1,120643


In [8]:
tra.head()

Unnamed: 0,sid,type,query,nb_query_terms,rcount,pn,facets,products,dproducts,rh,...,ocarproducts,oquery,orcount,ofacets,opn,odproducts,oproducts,siteid,duration,type_simplified
0,xvmQh3WmJGKaplwGgHf4o1MNKg2/6IpkbKNh4nnwZibi3f...,CAROUSEL,,,,,,,,1094,...,,,,,,,,cF8tnO1rK7fIBxVIs+AW4w==,0 days 00:00:00.000000000,CAROUSEL
1,xvmQh3WmJGKaplwGgHf4o1MNKg2/6IpkbKNh4nnwZibi3f...,CAROUSEL,,,,,,,,1094,...,,,,,,,,cF8tnO1rK7fIBxVIs+AW4w==,0 days 00:00:11.179637600,CAROUSEL
2,xvmQh3WmJGKaplwGgHf4o1MNKg2/6IpkbKNh4nnwZibi3f...,CAROUSEL,,,,,,,,1094,...,,,,,,,,cF8tnO1rK7fIBxVIs+AW4w==,0 days 00:00:13.132755800,CAROUSEL
3,xvmQh3WmJGKaplwGgHf4o1MNKg2/6IpkbKNh4nnwZibi3f...,CAROUSEL,,,,,,,,1094,...,,,,,,,,cF8tnO1rK7fIBxVIs+AW4w==,0 days 00:02:17.397333300,CAROUSEL
4,KeKjpi6re4QRYxl76E8sLPJHxRCeapb4sb69s5hmPCV+Jn...,PA,,,,,,,,875,...,,,,,,,,cF8tnO1rK7fIBxVIs+AW4w==,0 days 00:00:00.000000000,PA


In [9]:
tra["type"] = tra["type"].apply(lambda x : process_string(x))
tra["type"].unique()

array(['CAROUSEL', 'PA', 'SEARCH', 'ADD_TO_BASKET_LR', 'PRODUCT_LR',
       'SHOW_CASE', 'PURCHASE_PRODUCT_UNKNOW_ORIGIN',
       'PURCHASE_PRODUCT_LR', 'PRODUCT_PA', 'PRODUCT_CAROUSEL',
       'ADD_TO_BASKET_CAROUSEL', 'LIST_PRODUCT', 'PRODUCT_LP',
       'PRODUCT_SHOW_CASE', 'ADD_TO_BASKET_LP', 'ADD_TO_BASKET_PA',
       'ADD_TO_BASKET_SHOW_CASE', 'PURCHASE_PRODUCT_CAROUSEL',
       'PURCHASE_PRODUCT_PA', 'PURCHASE_PRODUCT_LP',
       'PURCHASE_PRODUCT_SHOW_CASE'], dtype=object)

In [10]:
tra_one_list = tra.groupby('sid').agg({'type':lambda x: list(x)}).reset_index()

In [11]:
tra_one_list.head()


Unnamed: 0,sid,type
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,"[PA, SEARCH, SEARCH, SEARCH, SEARCH, PRODUCT_L..."
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,"[CAROUSEL, CAROUSEL, CAROUSEL, CAROUSEL, CAROU..."
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,"[SEARCH, PRODUCT_LR, PRODUCT_LR, SEARCH]"
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,"[SEARCH, SEARCH, PRODUCT_LR, ADD_TO_BASKET_LR,..."
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,"[SEARCH, SEARCH, PRODUCT_LR, CAROUSEL, CAROUSE..."


In [12]:
tra_one_list['one_hot'] = tra_one_list["type"].apply(lambda x: [get_type_feature(s) for s in x])

In [13]:
tra_one_list.head()

Unnamed: 0,sid,type,one_hot
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,"[PA, SEARCH, SEARCH, SEARCH, SEARCH, PRODUCT_L...","[([1, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0..."
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,"[CAROUSEL, CAROUSEL, CAROUSEL, CAROUSEL, CAROU...","[([0, 0, 0, 1, 0], [0, 0, 0]), ([0, 0, 0, 1, 0..."
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,"[SEARCH, PRODUCT_LR, PRODUCT_LR, SEARCH]","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 1, 0, 0..."
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,"[SEARCH, SEARCH, PRODUCT_LR, ADD_TO_BASKET_LR,...","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0..."
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,"[SEARCH, SEARCH, PRODUCT_LR, CAROUSEL, CAROUSE...","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0..."


In [14]:
a = np.zeros(3, dtype=np.int)
op = np.add
b = np.array([1,0,0])

np.append(a,b
         )

array([0, 0, 0, 1, 0, 0])

In [15]:
tra_one_list["feature"] = tra_one_list["one_hot"].apply(matr_to_list)

In [16]:
tra_one_list.head()

Unnamed: 0,sid,type,one_hot,feature
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,"[PA, SEARCH, SEARCH, SEARCH, SEARCH, PRODUCT_L...","[([1, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0...","[1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0]"
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,"[CAROUSEL, CAROUSEL, CAROUSEL, CAROUSEL, CAROU...","[([0, 0, 0, 1, 0], [0, 0, 0]), ([0, 0, 0, 1, 0...","[0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0]"
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,"[SEARCH, PRODUCT_LR, PRODUCT_LR, SEARCH]","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 1, 0, 0...","[0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0]"
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,"[SEARCH, SEARCH, PRODUCT_LR, ADD_TO_BASKET_LR,...","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0...","[0.0, 0.0, 4.0, 2.0, 0.0, 2.0, 0.0, 2.0]"
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,"[SEARCH, SEARCH, PRODUCT_LR, CAROUSEL, CAROUSE...","[([0, 0, 0, 0, 0], [0, 0, 0]), ([0, 0, 0, 0, 0...","[0.0, 0.0, 1.0, 4.0, 0.0, 0.0, 0.0, 2.0]"


In [17]:
page_type = ['PA', 'LP', 'LR', 'CAROUSEL', 'SHOW_CASE']
event_type = ['ADD_TO_BASKET', 'PURCHASE_PRODUCT', 'PRODUCT']
tra_features = pd.DataFrame(tra_one_list["feature"].values.tolist(), columns=page_type+event_type)
tra_features["sid"] = tra_one_list["sid"]

In [20]:
tra_features.describe()
#tra_features.head()


Unnamed: 0,PA,LP,LR,CAROUSEL,SHOW_CASE,ADD_TO_BASKET,PURCHASE_PRODUCT,PRODUCT
count,133123.0,133123.0,133123.0,133123.0,133123.0,133123.0,133123.0,133123.0
mean,0.838668,0.182583,1.213524,3.003275,0.649039,0.3141,0.035433,2.432788
std,1.506739,1.107427,2.678119,5.818224,1.752899,1.023639,0.313648,5.646377
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,1.0,0.0,1.0,4.0,0.0,0.0,0.0,3.0
max,196.0,87.0,217.0,423.0,73.0,60.0,14.0,577.0
