In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lenskit
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Directory holding the parquet dataset that is loaded below with pd.read_parquet.
# NOTE: absolute, machine-specific path — adjust it when running on another machine.
data_dir = '/home/awd275/Search_and_Discovery/sad_final_project/data/raw/small_100_results'



In [24]:
# Show which Python interpreter this kernel is running.
import sys
print(sys.executable)

/home/awd275/miniconda3/envs/dsga3001/bin/python


In [25]:
# Columns pulled from the parquet files; everything else is left on disk.
columns_to_read = [
    # identifiers and target
    'search_result_id', 'search_request_id', 'hotel_id', 'user_id', 'label',
    # stay dates
    'check_in', 'check_out',
    # query-level attributes
    'reward_program_hash', 'advance_purchase_days',
    'number_of_nights', 'number_of_rooms', 'number_of_adults',
    'srq_latitude', 'srq_longitude',
    # calendar features
    'check_in_weekday', 'check_out_weekday', 'srq_weekhour',
    'weekday_travel', 'weekend_travel',
]

In [26]:
# Load only the columns listed in columns_to_read from the parquet data under data_dir.
df = pd.read_parquet(data_dir,columns=columns_to_read)

In [27]:
df.head()

Unnamed: 0,search_result_id,search_request_id,hotel_id,user_id,label,check_in,check_out,reward_program_hash,advance_purchase_days,number_of_nights,number_of_rooms,number_of_adults,srq_latitude,srq_longitude,check_in_weekday,check_out_weekday,srq_weekhour,weekday_travel,weekend_travel
0,1002704270245,10064244538,55669,10000310000.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True
1,1002704270341,10064244538,23463,10000310000.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True
2,1002704281668,10064244779,45493,10000100000.0,0,2019-03-17,2019-03-23,1035052858,13,6,1,1,37.7884,-122.4073,7,6,26.0,False,False
3,1002704286873,10064244877,769816,,0,2019-03-22,2019-03-23,1430218226,18,1,1,2,35.140618,-90.053659,5,6,26.0,False,True
4,1002704286969,10064244877,26231,,0,2019-03-22,2019-03-23,1430218226,18,1,1,2,35.140618,-90.053659,5,6,26.0,False,True


In [28]:

def df_conversions(df):
    '''
    Return a converted copy of the raw search-results frame; the input is left untouched.

    Steps:
      * hotel_index: integer code for each hotel_id, numbered in order of first
        appearance in the full input frame (before any rows are removed)
      * rows whose user_id is missing are removed
      * user_id has the constant 1e10 offset subtracted
      * check_in / check_out are parsed with pd.to_datetime(..., yearfirst=True)

    Parameters
    ----------
    df : pandas.DataFrame with at least the columns
         'hotel_id', 'user_id', 'check_in', 'check_out'.

    Returns
    -------
    pandas.DataFrame: a new frame with the surviving rows (original index labels kept)
    and an extra 'hotel_index' column appended as the last column.
    '''
    # Build the id -> index mapping on the full frame so the indices do not depend on
    # which rows are dropped afterwards.
    hotel_id_to_hotel_index = {hotel_id: i for i, hotel_id in enumerate(df['hotel_id'].unique())}

    # Filter with a boolean mask (not by index label) and copy, so the caller's frame is
    # never mutated and duplicate index labels cannot remove valid rows.
    converted = df.loc[df['user_id'].notna()].copy()
    converted['hotel_index'] = converted['hotel_id'].map(hotel_id_to_hotel_index)
    converted['user_id'] = converted['user_id'] - 1e10
    converted['check_in'] = pd.to_datetime(converted['check_in'], yearfirst=True)
    converted['check_out'] = pd.to_datetime(converted['check_out'], yearfirst=True)
    return converted

In [29]:
# Apply the conversions and rebind df to the result.
# NOTE: not idempotent — re-running this cell subtracts the 1e10 user_id offset a second time.
df = df_conversions(df)


In [30]:
df.head()


Unnamed: 0,search_result_id,search_request_id,hotel_id,user_id,label,check_in,check_out,reward_program_hash,advance_purchase_days,number_of_nights,number_of_rooms,number_of_adults,srq_latitude,srq_longitude,check_in_weekday,check_out_weekday,srq_weekhour,weekday_travel,weekend_travel,hotel_index
0,1002704270245,10064244538,55669,311836.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True,0
1,1002704270341,10064244538,23463,311836.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True,1
2,1002704281668,10064244779,45493,95557.0,0,2019-03-17,2019-03-23,1035052858,13,6,1,1,37.7884,-122.4073,7,6,26.0,False,False,2
10,1002704352815,10064246256,102923,2855443.0,0,2019-03-08,2019-03-10,61545895,4,2,1,2,47.60621,-122.332071,5,7,27.0,False,True,10
11,1002704355431,10064246300,709768,2112352.0,0,2019-03-03,2019-03-04,755845129,-1,1,1,1,42.48059,-83.475491,7,1,27.0,False,False,11


# Joining search queries by search_request_id, and getting single/entire interaction_vecs

In [84]:
def merge_dicts_with_max(dict_list):
    '''
    Combine several dictionaries into one.
    A key present in more than one dictionary keeps its largest value.
    e.g. [{a:1,b:1}, {b:2,c:2}]  ->  {a:1,b:2,c:2}
    '''
    merged = {}
    for mapping in dict_list:
        for key, value in mapping.items():
            # First sighting stores the value as-is; later sightings compete via max().
            merged[key] = max(merged[key], value) if key in merged else value
    return merged

def create_user_and_queries_interactions_vec(df,user_id):
    '''
    Collect one user's interactions, both per search and overall.

    Returns a pair:
    1st return: dict mapping each of the user's search_request_ids to that
                search's interaction dict {hotel_index: label}

    2nd return: a single {hotel_index: label} dict covering all of the user's
                searches, merged with merge_dicts_with_max (largest label wins)
    '''
    # Restrict to this user's rows, then list their searches in order of appearance.
    user_rows = df[df['user_id']==user_id]
    search_ids = user_rows['search_request_id'].unique()

    def interactions_for(search_id):
        # {hotel_index: label} for the rows of a single search request.
        rows = user_rows[user_rows['search_request_id']==search_id]
        return pd.Series(rows['label'].values,index=rows['hotel_index']).to_dict()

    per_search_vecs = [interactions_for(search_id) for search_id in search_ids]

    search_id_to_interaction_vec = dict(zip(search_ids,per_search_vecs))
    user_interaction_vec = merge_dicts_with_max(per_search_vecs)
    return search_id_to_interaction_vec,user_interaction_vec

In [65]:
# Build {user_id: (per-search interaction dicts, merged interaction dict)} for every user.
# Grouping once hands each call only that user's rows instead of re-scanning the whole
# frame for every user; sort=False keeps the first-appearance order of
# df['user_id'].unique(). Rows with a missing user_id are skipped by groupby.
unique_user_ids = df['user_id'].unique()
interactions_struct = {user_id:create_user_and_queries_interactions_vec(user_df,user_id)
                       for user_id, user_df in df.groupby('user_id', sort=False)}

### Creating Context Vector

In [34]:
# Our df contains roughly 100 results for each search_request_id, and all results of a
# search carry the same query info (e.g. the same values for reward_program_hash,
# check_in_weekday, number_of_nights, etc.), so one row per search describes the query.
# Build a table of (search_request_id, index label of that search's first row in df).
sr_id_to_first_index_df = pd.DataFrame([[key,val.values[0]]
                          for key,val in df.groupby('search_request_id').groups.items()], 
                          columns=['search_request_id','first_index'])

In [35]:
df

Unnamed: 0,search_result_id,search_request_id,hotel_id,user_id,label,check_in,check_out,reward_program_hash,advance_purchase_days,number_of_nights,number_of_rooms,number_of_adults,srq_latitude,srq_longitude,check_in_weekday,check_out_weekday,srq_weekhour,weekday_travel,weekend_travel,hotel_index
0,1002704270245,10064244538,55669,311836.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True,0
1,1002704270341,10064244538,23463,311836.0,0,2019-04-06,2019-04-07,3312343131,33,1,1,2,26.823395,-80.138655,6,7,26.0,False,True,1
2,1002704281668,10064244779,45493,95557.0,0,2019-03-17,2019-03-23,1035052858,13,6,1,1,37.788400,-122.407300,7,6,26.0,False,False,2
10,1002704352815,10064246256,102923,2855443.0,0,2019-03-08,2019-03-10,61545895,4,2,1,2,47.606210,-122.332071,5,7,27.0,False,True,10
11,1002704355431,10064246300,709768,2112352.0,0,2019-03-03,2019-03-04,755845129,-1,1,1,1,42.480590,-83.475491,7,1,27.0,False,False,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4722143,1006451956865,10073953696,113894,2937038.0,0,2019-10-14,2019-10-15,1035052858,11,1,1,2,29.760427,-95.369803,1,2,113.0,True,False,15726
4722144,1006451975201,10073953696,16069,2937038.0,0,2019-10-14,2019-10-15,1035052858,11,1,1,2,29.760427,-95.369803,1,2,113.0,True,False,26171
4722145,1006452026015,10075706583,27246,287006.0,0,2019-11-18,2019-11-20,1868001493,7,2,1,2,33.748995,-84.387982,1,3,43.0,True,False,16337
4722146,1006452035231,10075706583,43480,287006.0,0,2019-11-18,2019-11-20,1868001493,7,2,1,2,33.748995,-84.387982,1,3,43.0,True,False,12889


In [36]:
# Keep only the first row of each search request (selected by index label) as its context row.
context_df = df.loc[sr_id_to_first_index_df['first_index'].values]

In [37]:
# Query-level fields that are treated as categories and one-hot encoded.
categorical_vars = [
    'reward_program_hash',
    'check_in_weekday',
    'check_out_weekday',
    'weekday_travel',
    'weekend_travel',
]
one_hot_enc = OneHotEncoder()
# Encode and densify in one expression.
context_categorical = one_hot_enc.fit_transform(context_df[categorical_vars]).todense()


In [38]:
# Query-level columns that go into the context vector without one-hot encoding.
# NOTE(review): check_in / check_out are converted to datetimes in df_conversions, so
# to_numpy() over these mixed-dtype columns presumably yields an object array rather
# than a float one — confirm before feeding `context` to a model.
quantitative_vars = ['check_in',
                     'check_out',
                     'advance_purchase_days',
                     'number_of_nights',
                     'number_of_rooms',
                     'number_of_adults',
                     'srq_latitude',
                     'srq_longitude',
                    ]
context_quant = context_df[quantitative_vars].to_numpy()

In [39]:
# Stack the one-hot categorical features and the quantitative features column-wise.
context = np.hstack((context_categorical,context_quant))
# Index context_df by search_request_id for direct lookups. Rebinding (instead of
# set_index(..., inplace=True)) together with the guard keeps this cell re-runnable:
# a second in-place call raises KeyError because the column has already been moved
# into the index.
if context_df.index.name != 'search_request_id':
    context_df = context_df.set_index('search_request_id')

In [54]:
context_df.head()

Unnamed: 0_level_0,search_result_id,hotel_id,user_id,label,check_in,check_out,reward_program_hash,advance_purchase_days,number_of_nights,number_of_rooms,number_of_adults,srq_latitude,srq_longitude,check_in_weekday,check_out_weekday,srq_weekhour,weekday_travel,weekend_travel,hotel_index
search_request_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10062081144,1002457719590,1074702,1614904.0,0,2019-01-10,2019-01-14,1430218226,9,4,1,2,37.7884,-122.4073,4,1,48.0,True,False,5972
10062081205,1002457725779,123897,2577971.0,0,2019-01-01,2019-01-06,3994977087,0,5,1,2,51.50789,-0.128145,2,7,48.0,False,False,3791
10062081208,1002457725664,99766,2238251.0,0,2019-03-28,2019-03-29,1035052858,86,1,1,1,47.9268,-122.3087,4,5,48.0,True,False,5973
10062082026,1002457817096,705272,1896492.0,0,2019-02-09,2019-02-13,2672104405,39,4,1,2,-12.0631,-77.0433,6,3,49.0,False,False,25232
10062082531,1002457874635,13616,2568142.0,0,2019-06-05,2019-06-11,126613354,155,6,1,2,38.713177,-9.138806,3,2,49.0,False,False,25233


KeyError: "None of ['search_request_id'] are in the columns"

# Accessing Example

In [75]:
# Look up everything associated with one search request.
test_id = 10064244538

# Row of context_df for this search (context_df is indexed by search_request_id here).
context_cat_pre_encoded = context_df.loc[test_id][categorical_vars]
# NOTE: this rebinds context_quant, which earlier held the full to_numpy() array, to one row.
context_quant = context_df.loc[test_id][quantitative_vars]
user_id = context_df.loc[test_id]['user_id']
# interactions_struct maps user_id -> (per-search interaction dicts, merged interaction dict).
(user_interaction_dict, user_interaction_vec) = interactions_struct[user_id]
# {hotel_index: label} for this single search.
single_interaction_vec = user_interaction_dict[test_id]

In [82]:
context_cat_pre_encoded

reward_program_hash    3312343131
check_in_weekday                6
check_out_weekday               7
weekday_travel              False
weekend_travel               True
Name: 10064244538, dtype: object

In [83]:
context_quant

check_in                 2019-04-06 00:00:00
check_out                2019-04-07 00:00:00
advance_purchase_days                     33
number_of_nights                           1
number_of_rooms                            1
number_of_adults                           2
srq_latitude                       26.823395
srq_longitude                     -80.138655
Name: 10064244538, dtype: object

In [79]:
user_id

311836.0

In [80]:
single_interaction_vec

{0: 0,
 1: 0,
 4299: 0,
 23350: 0,
 23642: 0,
 40000: 0,
 24625: 0,
 14149: 0,
 34008: 0,
 9693: 0,
 4195: 0,
 12087: 2,
 30127: 0,
 30020: 0,
 9686: 0,
 18782: 0,
 23528: 0,
 13564: 0,
 24769: 0,
 29160: 0,
 4604: 0,
 28934: 0,
 298: 0,
 43659: 0,
 21410: 0,
 27862: 0,
 17332: 0,
 36062: 0,
 43872: 0,
 51595: 0,
 32243: 0,
 58402: 0,
 2413: 0,
 33592: 0,
 34747: 0,
 37597: 0,
 32990: 0,
 20658: 0,
 46501: 0,
 19823: 0,
 22933: 0,
 41953: 0,
 43156: 0,
 40350: 0,
 7951: 0,
 13294: 0,
 15456: 0,
 22450: 0,
 30126: 0,
 44734: 0,
 21409: 0,
 48162: 0,
 7828: 0,
 14678: 0,
 16991: 0,
 52609: 0,
 34022: 0,
 12553: 0,
 44131: 0,
 38624: 0,
 24173: 0,
 31547: 0,
 39047: 0,
 31315: 0,
 30968: 0,
 8622: 0,
 37898: 0,
 10164: 0,
 31243: 0,
 30553: 0,
 19313: 0,
 95025: 0,
 14528: 0,
 41208: 0,
 32067: 0,
 22846: 0,
 28462: 0,
 20762: 0,
 13851: 0,
 19955: 0,
 30034: 0,
 24024: 0,
 30373: 0,
 24174: 0,
 16449: 0,
 29271: 0,
 21168: 0,
 16450: 0,
 36007: 0,
 37897: 0,
 37582: 0,
 52319: 0,
 1199: 

In [81]:
user_interaction_vec

{0: 0,
 1: 0,
 4299: 0,
 23350: 0,
 23642: 0,
 40000: 0,
 24625: 0,
 14149: 0,
 34008: 0,
 9693: 0,
 4195: 0,
 12087: 2,
 30127: 0,
 30020: 0,
 9686: 0,
 18782: 0,
 23528: 0,
 13564: 0,
 24769: 0,
 29160: 0,
 4604: 0,
 28934: 0,
 298: 0,
 43659: 0,
 21410: 0,
 27862: 0,
 17332: 0,
 36062: 0,
 43872: 0,
 51595: 0,
 32243: 0,
 58402: 0,
 2413: 0,
 33592: 0,
 34747: 0,
 37597: 0,
 32990: 0,
 20658: 0,
 46501: 0,
 19823: 0,
 22933: 0,
 41953: 0,
 43156: 0,
 40350: 0,
 7951: 0,
 13294: 0,
 15456: 0,
 22450: 0,
 30126: 0,
 44734: 0,
 21409: 0,
 48162: 0,
 7828: 0,
 14678: 0,
 16991: 0,
 52609: 0,
 34022: 0,
 12553: 0,
 44131: 0,
 38624: 0,
 24173: 0,
 31547: 0,
 39047: 0,
 31315: 0,
 30968: 0,
 8622: 0,
 37898: 0,
 10164: 0,
 31243: 0,
 30553: 0,
 19313: 0,
 95025: 0,
 14528: 0,
 41208: 0,
 32067: 0,
 22846: 0,
 28462: 0,
 20762: 0,
 13851: 0,
 19955: 0,
 30034: 0,
 24024: 0,
 30373: 0,
 24174: 0,
 16449: 0,
 29271: 0,
 21168: 0,
 16450: 0,
 36007: 0,
 37897: 0,
 37582: 0,
 52319: 0,
 1199: 

# Notes / BS below

All variables List

'search_result_id', # don't need  
'search_request_id', used to join the rows of each search; doesn't go into the model  
'hotel_id',              categorical  
'user_id',               categorical  
'label',                 ordinal (0,1,2,3)  
'check_in',              quantitative (after date time conversion)   
'check_out',             quantitative (after date time conversion)  
'reward_program_hash',  categorical  
'advance_purchase_days', quantitative  
'number_of_nights',      quantitative  
'number_of_rooms',       quantitative  
'number_of_adults',      quantitative  
'srq_latitude',          quantitative  
'srq_longitude',         quantitative  
'check_in_weekday',      categorical  
'check_out_weekday',     categorical  
'srq_weekhour',          categorical (Probably don't need)  
'weekday_travel',        categorical  
'weekend_travel'         categorical  
  
  
User_id    
hotel_id    


Categorical variables:  
    reward_program_hash  
    check_in_weekday  
    check_out_weekday  
    weekday_travel  
    weekend_travel   
    
quantitative variables  
advance_purchase_days  
number_of_nights  
number_of_adults  


Pytorch flow description:

Three inputs (two of which get concatenated): the single-search interaction vector for a specific user, the query (context) vector for that search, and the vector of all interactions for that user.

Concatenate the single-search interaction vector with the query vector, and send the result to one encoder.
Send the vector of all interactions for that user to another encoder.



In [42]:
# Scratch: df_sr_id is never defined at notebook scope (it only appears as a local inside
# create_train_pair), which is why this cell raises NameError.
df_sr_id['user_id'].iloc[0]

NameError: name 'df_sr_id' is not defined

In [None]:
def create_train_pair(df, sr_id):
    '''
    Build the (user, query features, labels) triple for one search request.

    Parameters
    ----------
    df : pandas.DataFrame with the columns 'search_request_id', 'user_id',
         'hotel_id', 'label', 'reward_program_hash', 'advance_purchase_days'.
    sr_id : the search_request_id to extract.

    Returns
    -------
    user_id : the user_id found on the first row of the search
    query_X : DataFrame with 'reward_program_hash' and 'advance_purchase_days'
              for the rows of this search
    label   : DataFrame with 'hotel_id' and 'label' for the rows of this search

    Raises
    ------
    ValueError if no row of df has this search_request_id.
    '''
    # The comparison has to be applied to the column, not placed inside the brackets.
    df_sr_id = df[df['search_request_id'] == sr_id]
    if df_sr_id.empty:
        raise ValueError('no rows found for search_request_id %r' % (sr_id,))
    user_id = df_sr_id['user_id'].iloc[0]
    # Selecting several columns needs a list; a bare tuple is looked up as a single key.
    # Query features come from this search's rows, not from the whole frame.
    query_X = df_sr_id[['reward_program_hash', 'advance_purchase_days']]
    label = df_sr_id[['hotel_id', 'label']]
    return user_id,query_X, label

In [None]:
# Scratch check of the date parsing applied to check_in
# (the original line had typos: `dfpd` for `pd` and `df_` for `df`).
pd.to_datetime(df['check_in'],yearfirst=True)