# W2vec features for carts (part_1 of test)

In this notebook, the w2vec features for the carts model are built using the w2vec model from the "W2vec model for carts and orders" notebook. There are 4 w2vec features in total for the carts model: two of them are calculated from the last 5 session aids and the candidate, and the other two from the last 20 session aids and the candidate. Each pair of features is calculated in a single pass, parallelised with the pandarallel library, but the calculation still takes a long time. To reduce the wall-clock time further, the work is split across two notebooks that run in parallel: this one, and another one with very similar code that calculates the same features for another chunk of the test dataset and also for the cross-validation dataset.
## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
from humanize import naturalsize
from gensim.models import Word2Vec
from pandarallel import pandarallel

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
# Min and max w2vec similarity between the candidate aid and the last session aids.
def agg_min_max(x, w2v_model):
    """Return ``[min, max]`` of the candidate-vs-session-aid similarities.

    ``x`` is one row exposing ``aid`` (an iterable of session aids) and
    ``cart_predictions`` (the candidate aid). Each session aid is scored
    against the candidate with ``w2v_model.wv.similarity``.
    """
    scores = np.array([
        w2v_model.wv.similarity(session_aid, x.cart_predictions)
        for session_aid in x.aid
    ])
    return [scores.min(), scores.max()]

In [3]:
# Mean and min w2vec similarity between the candidate aid and the last session aids.
def agg_mean_min(x, w2v_model):
    """Return ``[mean, min]`` of the candidate-vs-session-aid similarities.

    ``x`` is one row exposing ``aid`` (an iterable of session aids) and
    ``cart_predictions`` (the candidate aid). Each session aid is scored
    against the candidate with ``w2v_model.wv.similarity``.
    """
    scores = np.array([
        w2v_model.wv.similarity(session_aid, x.cart_predictions)
        for session_aid in x.aid
    ])
    return [scores.mean(), scores.min()]

In [4]:
# Import candidates and last aids for each session, then calculate the w2vec features for them in chunks.
def calculate_w2vec_features(train_path, sessions_path, w2v_model,
                            time_delta, n_max, feature_type, feature_name1, feature_name2,
                            chunk_size=10000000, nb_workers=4):
    """Compute one pair of w2vec similarity features for every candidate row.

    Parameters
    ----------
    train_path : str
        Parquet file with the candidates; ``session`` and ``cart_predictions``
        are the fields used here.
    sessions_path : str
        Parquet file with the session events; ``session`` and ``aid`` are the
        fields used here.
    w2v_model
        Model handed to the row aggregators (they call ``w2v_model.wv.similarity``).
    time_delta, n_max
        Forwarded unchanged to ``otto_common.filter_by_time_and_n_max``;
        their exact semantics are defined by that helper.
    feature_type : {'mean_min', 'min_max'}
        Selects ``agg_mean_min`` or ``agg_min_max`` as the row aggregator.
    feature_name1, feature_name2 : str
        Column names for the first and second value returned by the aggregator.
    chunk_size : int, default 10000000
        Number of rows handed to one ``parallel_apply`` call.
    nb_workers : int, default 4
        Number of pandarallel worker processes.

    Returns
    -------
    pandas.DataFrame
        Two float32 columns (``feature_name1``, ``feature_name2``), one row per
        candidate row, carrying the index of the merged candidate frame.

    Raises
    ------
    ValueError
        If ``feature_type`` is not recognised or ``chunk_size`` is not positive.
    """
    # Fail before any expensive I/O instead of printing a message and then
    # crashing later on a missing 'features' column.
    if feature_type == 'mean_min':
        row_func = lambda x: agg_mean_min(x, w2v_model)
    elif feature_type == 'min_max':
        row_func = lambda x: agg_min_max(x, w2v_model)
    else:
        raise ValueError('feature_type_unknown: {!r}'.format(feature_type))
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive, got {!r}'.format(chunk_size))

    # Import all the required data and merge it together.
    print(feature_type)
    df_sessions = pd.read_parquet(sessions_path)
    df_sessions = otto_common.filter_by_time_and_n_max(df_sessions, time_delta, n_max)
    # One row per session holding the list of its remaining aids.
    df_sessions = df_sessions.groupby('session').agg({'aid': lambda x: x.tolist()})
    df = pd.read_parquet(train_path)
    # Left merge: every candidate row is kept and receives its session's aid list.
    df = pd.merge(df, df_sessions, how='left', on='session')

    # Remove all the columns not relevant to this calculation.
    df = df[['cart_predictions', 'aid']]
    del df_sessions
    gc.collect()

    feature_names = [feature_name1, feature_name2]

    # The worker pool configuration does not change between chunks, so set it up once.
    pandarallel.initialize(nb_workers=nb_workers)

    # Walk the frame in fixed-size slices. range() stops exactly at the last
    # row, so an exact multiple of chunk_size never yields an empty chunk and
    # there is no upper limit on the number of rows.
    feature_chunks = []
    for i in range(0, len(df), chunk_size):
        df_chunk = df.iloc[i:i + chunk_size]

        # Calculate the features: each row yields a two-element list.
        features = df_chunk.parallel_apply(row_func, axis=1)

        # Format the calculated features as two float32 columns.
        feature_chunks.append(
            pd.DataFrame(features.tolist(), index=df_chunk.index,
                         columns=feature_names).astype(np.float32))
        print(i)
        del df_chunk, features
        gc.collect()

    # No candidate rows at all: return an empty, correctly typed frame.
    if not feature_chunks:
        return pd.DataFrame(index=df.index, columns=feature_names, dtype=np.float32)

    # Merge the chunks together in a single concat rather than re-copying the
    # accumulated result on every iteration.
    return pd.concat(feature_chunks)

In [5]:
# Define all the parameters for the w2vec features, except for paths, that are different for cross-validation and test datasets.
def add_w2vec_data(train_path, sessions_path, w2vec_path):
    """Load the candidates and append the four w2vec feature columns.

    Parameters
    ----------
    train_path : str
        Parquet file with the candidate rows.
    sessions_path : str
        Parquet file with the session events.
    w2vec_path : str
        Path passed to ``Word2Vec.load``.

    Returns
    -------
    pandas.DataFrame
        The candidate frame from ``train_path`` with the columns
        ``w2v_20_mean``, ``w2v_20_min``, ``w2v_5_min`` and ``w2v_5_max`` appended.

    Raises
    ------
    ValueError
        If a feature frame does not have exactly one row per candidate row.
    """
    w2v_model = Word2Vec.load(w2vec_path)
    # Mean/min similarity: time_delta = 3 * 60 * 60, n_max = 20
    # (presumably seconds, i.e. 3 hours -- confirm against otto_common).
    df_w2v_20 = calculate_w2vec_features(train_path, sessions_path, w2v_model, 3 * 60 * 60, 20, 'mean_min', 'w2v_20_mean', 'w2v_20_min')
    gc.collect()
    # Min/max similarity: time_delta = 5 * 24 * 60 * 60, n_max = 5
    # (presumably seconds, i.e. 5 days -- confirm against otto_common).
    df_w2v_5 = calculate_w2vec_features(train_path, sessions_path, w2v_model, 5 * 24 * 60 * 60, 5, 'min_max', 'w2v_5_min', 'w2v_5_max')
    gc.collect()
    df_all = pd.read_parquet(train_path)

    # pd.concat(axis=1) aligns on index labels. The feature frames carry the
    # fresh index produced by the merge inside calculate_w2vec_features, which
    # need not match the index stored in the parquet file. Rows are in the
    # same order (left merge on the same file), so align them by position.
    for df_features in (df_w2v_20, df_w2v_5):
        if len(df_features) != len(df_all):
            raise ValueError(
                'w2vec features have {} rows but {} has {} rows'.format(
                    len(df_features), train_path, len(df_all)))
        df_features.index = df_all.index

    df_all = pd.concat([df_all, df_w2v_20, df_w2v_5], axis=1)
    return df_all

## W2vec features for Part_1 of test dataset

In [6]:
# Input locations for part_1 of the test candidates, the test sessions and the
# trained w2vec model, followed by the feature calculation itself.
test_path_part1 = '/kaggle/input/otto-feature-engineering-carts/test_features_cart_part_1.parquet'
sessions_path_test = '/kaggle/input/otto-prepare-cv/test.parquet'
w2vec_path_test = '/kaggle/input/otto-word2vec-exp/word2vec_test_exp.wordvectors'

df_test1 = add_w2vec_data(
    train_path=test_path_part1,
    sessions_path=sessions_path_test,
    w2vec_path=w2vec_path_test,
)

mean_min
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
10000000
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
20000000
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
30000000
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
40000000
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
50000000
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to tr

In [7]:
# Check the in-memory size of the result and export it to a parquet file.
# `deep` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy (so would 'False').
size = df_test1.memory_usage(deep=True).sum()
print(naturalsize(size))
df_test1.to_parquet('test_features_with_w2v_cart_part_1.parquet')

6.5 GB
