# Preprocessing steps

## 0) Read in data
## 1) Basic conversions and drops
## 2) Joining search queries to make single-query interaction vectors, and user interaction vectors
## 3) Create Context vector
## 4) Finally, an example on how to access

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lenskit
from sklearn.preprocessing import OneHotEncoder
import sys
print(sys.executable)

## 0) Read in data


In [None]:
data_dir = '/home/awd275/Search_and_Discovery/sad_final_project/data/raw/small_100_results'

columns_to_read = ['search_result_id','search_request_id', 'hotel_id', 'user_id','label', 'check_in', 'check_out',
       'reward_program_hash', 'advance_purchase_days',
       'number_of_nights', 'number_of_rooms', 'number_of_adults',
       'srq_latitude', 'srq_longitude', 'check_in_weekday',
       'check_out_weekday', 'srq_weekhour', 'weekday_travel',
       'weekend_travel']

In [None]:
df = pd.read_parquet(data_dir,columns=columns_to_read)

In [None]:
df.head()

## 1) Basic conversions and drops


In [None]:
def df_conversions(df):
    '''
    IMPORTANT!!
    1) Adds a hotel_index column, this is assigns each hotel_id a number from 0 to len(hotel_id)-1. This index
    is used for our interaction vector, which is a len(hotel_id)-length vector with the interaction label
    as the entries.
    
    2) Drops users with no user_id (aka, anonymous/first time users)
    3) subtracts 1e^10 from user_id (ask eric why)
    4) converts date time strings into pandas datetime objects
    '''
    hotel_id_to_hotel_index = dict((hotel_id, i) for (i, hotel_id) in enumerate(df['hotel_id'].unique()))
    df['hotel_index']= df['hotel_id'].map(hotel_id_to_hotel_index)
    df.drop(df.loc[df['user_id'].isna()].index, inplace=True)
    df['user_id'] = df['user_id'] - 1e10
    df['check_in'] = pd.to_datetime(df['check_in'],yearfirst=True)
    df['check_out'] = pd.to_datetime(df['check_out'],yearfirst=True)
    return df

In [None]:
df = df_conversions(df)


In [None]:
df.head()


# Joining search queries by search_result_id, and getting single/entire interaction_vecs

In [55]:
def create_user_id_to_query_struct_dict(df):
    '''
    returns a dictionary of user_id -> "query_struct."
    check create_query_struct_for_user for what a "query_struct" is
    '''
    unique_user_ids = df['user_id'].unique()
    user_id_to_query_struct_dict = {user_id : create_query_struct_for_user(df,user_id)
                                    for user_id in unique_user_ids}
    return user_id_to_query_struct_dict


def create_query_struct_for_user(df,user_id):
    '''
    Returns a "query struct", which is a 2-tuple:
    
    (Assuming a given and fixed user_id):
    
    1st entry: dict of search_request_ids to a interaction_vec for that search request.
        Note: The interaction_vec is a dict of hotel_index -> label for that hotel_index.
              Importantly, this is a sparse vector format.
              Thus, the 1st entry is a dict{search_request_id -> dict{hotel_idx->label}}
              
    2nd return: user_vector containing all of the label/interactions w/user_id = user_id
    
    '''
    # Select only the entries for the user we care about
    df_user_id = df[df['user_id']==user_id]
    # get all of their searches (search_id)
    unique_search_ids_per_user = df_user_id['search_request_id'].unique()
    # Loop over each search, storing the interaction for each search query
    interaction_vecs_per_query = []
    for sr_id in unique_search_ids_per_user:
        # Select only entries for each search request
        df_sr_user_id = df_user_id[df_user_id['search_request_id']==sr_id] 
        # Create a dict of {hotel_index:label}
        interaction_sparse_vec = pd.Series(df_sr_user_id['label'].values,index=df_sr_user_id['hotel_index']).to_dict()
        # Add it to vector
        interaction_vecs_per_query.append(interaction_sparse_vec)
    
    #make a dict of search_ids to interactions_vec
    search_id_to_interaction_vec = dict(zip(unique_search_ids_per_user,interaction_vecs_per_query))
    
    # Merge all the interactions to get the user's entire interaction vec
    user_interaction_vec = merge_dicts_with_max(interaction_vecs_per_query)
    
    return search_id_to_interaction_vec,user_interaction_vec

def get_single_query_interaction_vec(user_id_to_query_struct_dict,user_id,sr_id):
    return user_id_to_query_struct_dict[user_id][0][sr_id]
def get_user_entire_interaction_vec(user_id_to_query_struct_dict,user_id):
    return user_id_to_query_struct_dict[user_id][1]

def merge_dicts_with_max(dict_list):
    ''' 
    merge a list of dictionaries
    if their keys overlap, return the max.
    e.g. {a:1,b:1}
         {b:2,c:2}
         merged into {a:1,b:2,c:2}
    
    We need this in order to merge a single users interaction vectors. Consider if a user
    does two queries, and ends up buying the same hotel twice. 
    '''
    return_dict = {}
    for dict_ in dict_list:
        for key in dict_:
            if key in return_dict:
                return_dict[key] = max(return_dict[key],dict_[key])
            else:
                return_dict[key] = dict_[key]
    return return_dict


In [11]:
user_id_to_query_struct_dict = create_user_id_to_query_struct_dict(df)

## Creating Context Vector and encodes the categorical variables


In [49]:
def create_context_df_and_cat_encoder(df,cat_vars_to_use):
    '''
    Creates the context_dataframe and cat_encoder
    
    returns a 2-tuple,
    1st: return context_dataframe, which is a dataframe with !search_request_id as the index!
    2nd: returns a sklearn OneHotEncoder, which has been trained on context_df['cat_vars_to_use']
    
    '''
    # Our df contains 100~ results for each search request id. Each of the 100 results have the same query info
    # e.g, they all have the same values for reward_program_has, check_in_weekday,number_of_nights,etc
    # We only need one row out of those 100 to properly get the context. 
    # Here, we grab the first row
    sr_id_to_first_index_df = pd.DataFrame([[key,val.values[0]]
                              for key,val in df.groupby('search_request_id').groups.items()], 
                              columns=['search_request_id','first_index'])
    context_df = df.loc[sr_id_to_first_index_df['first_index'].values]
    context_df.set_index('search_request_id',inplace=True)
    #Encode the categorical variables
    cat_onehot_enc = OneHotEncoder()
    cat_onehot_enc.fit(context_df[cat_vars_to_use])    
    
    
    return context_df,cat_onehot_enc

def create_context_vec(context_df, cat_onehot_enc, cat_vars_to_use, quant_vars_to_use, sr_id):
    '''
    returns a np.vector which contains the context information.
            The categorical features have already been encoded via cat_onehot_enc
    
    '''
    #Get User id for this query
    user_id = context_df.loc[sr_id]['user_id']
    # Get and encode the categorical features for this query
    context_cat_pre_enc = context_df.loc[sr_id][cat_vars_to_use]
    #Reshape if we're only dealing with one row
    if len(context_cat_pre_enc.shape) == 1:
        context_cat_pre_enc = context_cat_pre_enc.values.reshape(1,-1)
    context_cat_enc = cat_onehot_enc.transform(context_cat_pre_enc).todense()
    
    # Get the quantitative features for this query
    context_quant = context_df.loc[sr_id][quant_vars_to_use].to_numpy()
    if len(context_quant.shape) == 1:
        context_quant = context_cat_pre_enc.reshape(1,-1)
    #stack the encoded categorical features and quantitative features
    context_vec = np.hstack((context_cat_enc,context_quant))
    # return the context
    return context_vec


In [50]:
#Example 
test_sr_id = 10064244538

cat_vars_to_use = ['reward_program_hash',
                    'check_in_weekday',
                    'check_out_weekday',
                    'weekday_travel',
                    'weekend_travel']
quant_vars_to_use = ['check_in',
                     'check_out',
                     'advance_purchase_days',
                     'number_of_nights',
                     'number_of_rooms',
                     'number_of_adults',
                     'srq_latitude',
                     'srq_longitude',
                    ]

context_df, cat_onehot_enc = create_context_df_and_cat_encoder(df, cat_vars_to_use)
context_vec = create_context_vec(context_df, cat_onehot_enc, cat_vars_to_use, quant_vars_to_use, test_sr_id)

In [None]:
# This cell here to stop jupyter notebook from running to the finish
raise NotImplementedError

# Accessing Example

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lenskit
from sklearn.preprocessing import OneHotEncoder
import sys
print(sys.executable)

In [52]:
def get_user_id_from_sr_id(context_df,sr_id):
    '''
    Make sure you use context df, as that is indexed by search_id (this will speed things up)
    '''
    return context_df.loc[sr_id]['user_id']

In [51]:
# Step 0 Read in Data
data_dir = '/home/awd275/Search_and_Discovery/sad_final_project/data/raw/small_100_results'

columns_to_read = ['search_result_id','search_request_id', 'hotel_id', 'user_id','label', 'check_in', 'check_out',
       'reward_program_hash', 'advance_purchase_days',
       'number_of_nights', 'number_of_rooms', 'number_of_adults',
       'srq_latitude', 'srq_longitude', 'check_in_weekday',
       'check_out_weekday', 'srq_weekhour', 'weekday_travel',
       'weekend_travel']
df = pd.read_parquet(data_dir,columns=columns_to_read)

# Step 1, basic drops and conversions
df = df_conversions(df)

# Step 2, create dict of user_id -> query_struct
user_id_to_query_struct_dict = create_user_id_to_query_struct_dict(df)


# Step 3, Create context_df and encoder vector

context_df, cat_onehot_enc = create_context_df_and_cat_encoder(df, cat_vars_to_use)
context_vec = create_context_vec(context_df, cat_onehot_enc, cat_vars_to_use, quant_vars_to_use, test_sr_id)

In [58]:
test_sr_id = 10064244538

cat_vars_to_use = ['reward_program_hash',
                    'check_in_weekday',
                    'check_out_weekday',
                    'weekday_travel',
                    'weekend_travel']
quant_vars_to_use = ['check_in',
                     'check_out',
                     'advance_purchase_days',
                     'number_of_nights',
                     'number_of_rooms',
                     'number_of_adults',
                     'srq_latitude',
                     'srq_longitude',
                    ]

# Create context vector
context_vec = create_context_vec(context_df, cat_onehot_enc, cat_vars_to_use, quant_vars_to_use, test_sr_id)
user_id = get_user_id_from_sr_id(context_df,test_sr_id)
# Get single_interaction_vec for this sr_id and user's entire interaction vec
single_query_interaction_vec = get_single_query_interaction_vec(user_id_to_query_struct_dict, user_id, test_sr_id)
user_entire_interaction_vec = get_user_entire_interaction_vec(user_id_to_query_struct_dict, user_id)


In [59]:
context_vec

matrix([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
         1.0, 0.0, 0.0, 1.0, 3312343131, '6', '7', False, True]],
       dtype=object)

In [61]:
user_id

311836.0

In [60]:
single_query_interaction_vec

{0: 0,
 1: 0,
 4299: 0,
 23350: 0,
 23642: 0,
 40000: 0,
 24625: 0,
 14149: 0,
 34008: 0,
 9693: 0,
 4195: 0,
 12087: 2,
 30127: 0,
 30020: 0,
 9686: 0,
 18782: 0,
 23528: 0,
 13564: 0,
 24769: 0,
 29160: 0,
 4604: 0,
 28934: 0,
 298: 0,
 43659: 0,
 21410: 0,
 27862: 0,
 17332: 0,
 36062: 0,
 43872: 0,
 51595: 0,
 32243: 0,
 58402: 0,
 2413: 0,
 33592: 0,
 34747: 0,
 37597: 0,
 32990: 0,
 20658: 0,
 46501: 0,
 19823: 0,
 22933: 0,
 41953: 0,
 43156: 0,
 40350: 0,
 7951: 0,
 13294: 0,
 15456: 0,
 22450: 0,
 30126: 0,
 44734: 0,
 21409: 0,
 48162: 0,
 7828: 0,
 14678: 0,
 16991: 0,
 52609: 0,
 34022: 0,
 12553: 0,
 44131: 0,
 38624: 0,
 24173: 0,
 31547: 0,
 39047: 0,
 31315: 0,
 30968: 0,
 8622: 0,
 37898: 0,
 10164: 0,
 31243: 0,
 30553: 0,
 19313: 0,
 95025: 0,
 14528: 0,
 41208: 0,
 32067: 0,
 22846: 0,
 28462: 0,
 20762: 0,
 13851: 0,
 19955: 0,
 30034: 0,
 24024: 0,
 30373: 0,
 24174: 0,
 16449: 0,
 29271: 0,
 21168: 0,
 16450: 0,
 36007: 0,
 37897: 0,
 37582: 0,
 52319: 0,
 1199: 

In [62]:
user_entire_interaction_vec

{0: 0,
 1: 0,
 4299: 0,
 23350: 0,
 23642: 0,
 40000: 0,
 24625: 0,
 14149: 0,
 34008: 0,
 9693: 0,
 4195: 0,
 12087: 2,
 30127: 0,
 30020: 0,
 9686: 0,
 18782: 0,
 23528: 0,
 13564: 0,
 24769: 0,
 29160: 0,
 4604: 0,
 28934: 0,
 298: 0,
 43659: 0,
 21410: 0,
 27862: 0,
 17332: 0,
 36062: 0,
 43872: 0,
 51595: 0,
 32243: 0,
 58402: 0,
 2413: 0,
 33592: 0,
 34747: 0,
 37597: 0,
 32990: 0,
 20658: 0,
 46501: 0,
 19823: 0,
 22933: 0,
 41953: 0,
 43156: 0,
 40350: 0,
 7951: 0,
 13294: 0,
 15456: 0,
 22450: 0,
 30126: 0,
 44734: 0,
 21409: 0,
 48162: 0,
 7828: 0,
 14678: 0,
 16991: 0,
 52609: 0,
 34022: 0,
 12553: 0,
 44131: 0,
 38624: 0,
 24173: 0,
 31547: 0,
 39047: 0,
 31315: 0,
 30968: 0,
 8622: 0,
 37898: 0,
 10164: 0,
 31243: 0,
 30553: 0,
 19313: 0,
 95025: 0,
 14528: 0,
 41208: 0,
 32067: 0,
 22846: 0,
 28462: 0,
 20762: 0,
 13851: 0,
 19955: 0,
 30034: 0,
 24024: 0,
 30373: 0,
 24174: 0,
 16449: 0,
 29271: 0,
 21168: 0,
 16450: 0,
 36007: 0,
 37897: 0,
 37582: 0,
 52319: 0,
 1199: 

# Notes / BS below

All variables List

'search_result_id', # don't need  
'search_request_id', can use to join each search, doesnt go into model  
'hotel_id',              categorical  
'user_id',               categorical  
'label',                 ordinal (0,1,2,3)  
'check_in',              quantitative (after date time conversion)   
'check_out',             quantitative (after date time conversion)  
'reward_program_hash',  categorical  
'advance_purchase_days', quantitative  
'number_of_nights',      quantitative  
'number_of_rooms',       quantitative  
'number_of_adults',      quantitative  
'srq_latitude',          quantitative  
'srq_longitude',         quantitative  
'check_in_weekday',      categorical  
'check_out_weekday',     categorical  
'srq_weekhour',          categorical (Probably don't need)  
'weekday_travel',        categorical  
'weekend_travel'         categorical  
  
  
User_id    
hotel_id    


Categorical variables:  
    reward_program_hash  
    check_in_weekday  
    check_out_weekday  
    weekday_travel  
    weekend_travel   
    
quantitative variables  
advance_purchase_days  
number_of_nights  
number_of_adults  


Pytorch flow description:

Three inputs (two of them get concatenated): Single interaction for specific user, query, and all interactions for specific user.

Concatenate single interaction query for specific user and query, and send to one encoder.
Send all interactions for a specific user to another encoder



In [25]:
!/home/awd275/miniconda3/envs/dsga3001/bin/python3  /home/awd275/Search_and_Discovery/sad_final_project/src/data/data_preprocessing_multvae.py /scratch/work/js11133/sad_data/raw/full/train /scratch/work/js11133/sad_data/processed/full/train



Reading in data of shape (427504, 19) from /scratch/work/js11133/sad_data/raw/full/train/0012_part_00.parquet
Reading in data of shape (427556, 19) from /scratch/work/js11133/sad_data/raw/full/train/0008_part_00.parquet
Reading in data of shape (427460, 19) from /scratch/work/js11133/sad_data/raw/full/train/0049_part_00.parquet
Reading in data of shape (427612, 19) from /scratch/work/js11133/sad_data/raw/full/train/0025_part_00.parquet
Reading in data of shape (427459, 19) from /scratch/work/js11133/sad_data/raw/full/train/0061_part_00.parquet
Reading in data of shape (427550, 19) from /scratch/work/js11133/sad_data/raw/full/train/0060_part_00.parquet
Reading in data of shape (427521, 19) from /scratch/work/js11133/sad_data/raw/full/train/0004_part_00.parquet
Reading in data of shape (427550, 19) from /scratch/work/js11133/sad_data/raw/full/train/0034_part_00.parquet
Reading in data of shape (427560, 19) from /scratch/work/js11133/sad_data/raw/full/train/0081_part_00.parquet
Reading in

Reading in data of shape (427476, 19) from /scratch/work/js11133/sad_data/raw/full/train/0016_part_00.parquet
Reading in data of shape (427581, 19) from /scratch/work/js11133/sad_data/raw/full/train/0036_part_00.parquet
Reading in data of shape (427248, 19) from /scratch/work/js11133/sad_data/raw/full/train/0065_part_00.parquet
Reading in data of shape (427603, 19) from /scratch/work/js11133/sad_data/raw/full/train/0013_part_00.parquet
Reading in data of shape (427311, 19) from /scratch/work/js11133/sad_data/raw/full/train/0020_part_00.parquet
Reading in data of shape (427681, 19) from /scratch/work/js11133/sad_data/raw/full/train/0029_part_00.parquet
Reading in data of shape (427548, 19) from /scratch/work/js11133/sad_data/raw/full/train/0011_part_00.parquet
Reading in data of shape (427430, 19) from /scratch/work/js11133/sad_data/raw/full/train/0057_part_00.parquet
Reading in data of shape (427178, 19) from /scratch/work/js11133/sad_data/raw/full/train/0051_part_00.parquet
Reading in