# In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import os
import gc
pd.set_option('display.max_columns', 100)
from scipy import stats
import gensim
from sklearn.decomposition import PCA
def get_prod_W2V(df_products,df_prior_final,df_train_final,data):
    '''
        Learns a Word2Vec embedding for every product (each order is treated as a
        "sentence" of product ids), projects the embeddings to 40 dimensions with
        PCA and left-merges the resulting ``pca_0`` .. ``pca_39`` columns into ``data``.

        Args:
            df_products : products info. Not used by the computation; kept so that
                          existing callers do not have to change.
            df_prior_final : users' prior order history with product info; must
                             contain ``order_id`` and ``product_id`` columns.
            df_train_final : users' current orders with product info; must contain
                             ``order_id`` and ``product_id`` columns.
            data : featurized DF excluding the product2vec features; must contain
                   an integer-compatible ``product_id`` column to merge on.
        Returns:
            ``data`` left-merged with the PCA-reduced product vectors. Products that
            occur fewer than 2 times (Word2Vec ``min_count=2``) are not in the
            vocabulary and therefore get NaN in the ``pca_*`` columns.

        Notes:
            The input frames are NOT modified: product ids are converted to strings
            on a private copy only.
            NOTE(review): PCA(40) presumably requires at least 40 products in the
            learned vocabulary - confirm against the scikit-learn version in use.
    '''
    def _order_sentences(df_orders):
        # One list of product-id strings per order. Working on a two-column copy
        # keeps the caller's DataFrame (and its product_id dtype) untouched.
        cart = df_orders[['order_id', 'product_id']].astype({'product_id': str})
        return cart.groupby('order_id')['product_id'].apply(list)

    # Prior orders first, then train orders (same order as before).
    # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
    sentences = pd.concat([_order_sentences(df_prior_final),
                           _order_sentences(df_train_final)])
    longest = int(sentences.map(len).max())
    # Gensim expects an iterable of token lists (list of lists).
    sentences = sentences.tolist()

    # window=longest: with a smaller window, products that sit far apart in the
    # same order would not be treated as sharing a context even though they were
    # added to the cart for the same order. The window is therefore made large
    # enough to span every product of the largest order.
    #
    # Gensim 4 renamed ``size`` to ``vector_size`` and replaced ``wv.vocab`` with
    # ``wv.index_to_key``; pick the spelling that matches the installed version.
    gensim_major = int(gensim.__version__.split('.')[0])
    size_kwarg = 'vector_size' if gensim_major >= 4 else 'size'
    model = gensim.models.Word2Vec(sentences, window=longest, min_count=2,
                                   workers=4, **{size_kwarg: 100})
    if gensim_major >= 4:
        vocab = list(model.wv.index_to_key)
    else:
        vocab = list(model.wv.vocab.keys())

    # Look the embedding matrix up once and reuse it for both fit and transform.
    vectors = model.wv[vocab]
    pca = PCA(40)
    product2vec = pd.DataFrame(pca.fit(vectors).transform(vectors))
    product2vec.columns = ["pca_" + str(s) for s in product2vec.columns.tolist()]
    # Vocabulary tokens are the stringified product ids; convert them back to
    # integers so the merge key matches the dtype used in ``data``.
    product2vec["product_id"] = vocab
    product2vec["product_id"] = product2vec["product_id"].astype(int)

    # Merging with our data DF
    data = data.merge(product2vec,on='product_id',how='left')
    del product2vec
    gc.collect()
    return data