In [2]:
import os
import pandas as pd
import numpy as np
import random
import json
import torch
import torch.utils.data as data

In [3]:
data_path = '../data/raw/small_100'
feature_path = 'src/data/schemas/output_data_schemas.json'

In [4]:
def read_parquet(data_path, num_partitions: None, randomize = True, verbose = True):
    files = os.listdir(data_path)
    if randomize:
        random.shuffle(files)
    
    if num_partitions is None:
        num_partitions = len(files)
    
    data = []
    num_reads = 0
    for file_path in files:
        if num_reads >= num_partitions:
            if verbose:
                print('Finished reading {} .parquet Files'.format(num_partitions))
            break
        
        _ , ext = os.path.splitext(file_path)
        
        if ext == '.parquet':
            fp = os.path.join(data_path, file_path)
            data.append(pd.read_parquet(os.path.join(data_path, file_path)))
            
            if verbose:
                print('Reading in data from {}'.format(fp))
                print('Data of shape {}'.format(data[-1].shape))
            
            num_reads += 1
        else: 
            continue
    data = pd.concat(data, axis=0)
    
    if verbose:
        print('Total dataframe of shape {}'.format(data.shape))
    
    return data

In [5]:
df = read_parquet(data_path, randomize = False, num_partitions = None)

Reading in data from ../data/raw/small_100/0017_part_00.parquet
Data of shape (49177, 68)
Reading in data from ../data/raw/small_100/0050_part_00.parquet
Data of shape (49072, 68)
Reading in data from ../data/raw/small_100/0074_part_00.parquet
Data of shape (49273, 68)
Reading in data from ../data/raw/small_100/0049_part_00.parquet
Data of shape (49153, 68)
Reading in data from ../data/raw/small_100/0033_part_00.parquet
Data of shape (49306, 68)
Reading in data from ../data/raw/small_100/0095_part_00.parquet
Data of shape (49172, 68)
Reading in data from ../data/raw/small_100/0046_part_00.parquet
Data of shape (49345, 68)
Reading in data from ../data/raw/small_100/0001_part_00.parquet
Data of shape (48765, 68)
Reading in data from ../data/raw/small_100/0018_part_00.parquet
Data of shape (49091, 68)
Reading in data from ../data/raw/small_100/0083_part_00.parquet
Data of shape (49067, 68)
Reading in data from ../data/raw/small_100/0025_part_00.parquet
Data of shape (49103, 68)
Reading in

In [6]:
df.columns

Index(['search_result_id', 'search_request_id', 'hotel_id', 'user_id',
       'srq_date_created', 'label', 'display_rank', 'price_rank',
       'rewards_rank', 'est_spread_rank', 'check_in', 'check_out',
       'reward_program_hash', 'site_hash', 'region_id', 'est_ttm',
       'average_published_tax_and_fees', 'average_published_price',
       'hotel_cumulative_share', 'total_rewards', 'advance_purchase_days',
       'number_of_nights', 'number_of_rooms', 'number_of_adults',
       'number_of_children', 'normalized_rewards',
       'previous_user_hotel_interaction', 'session_id', 'promotion_id',
       'anonymous_user', 'srq_latitude', 'srq_longitude', 'check_in_weekday',
       'check_out_weekday', 'srq_weekhour', 'travel_intent', 'weekday_travel',
       'weekend_travel', 'hotel_latitude', 'hotel_longitude', 'rating',
       'stars', 'number_of_reviews', 'srq_hotel_distance', 'srq_price_zscore',
       'srq_rewards_zscore', 'srq_distance_zscore', 'srq_rating_zscore',
       'srq_star

In [7]:
sum(df.user_id.isna())

1796352

In [28]:
temp = df[df.user_id.notnull()]

In [29]:
temp = temp[['search_request_id', 'hotel_id', 'user_id','label', 'check_in', 'check_out',
       'reward_program_hash', 'advance_purchase_days',
       'number_of_nights', 'number_of_rooms', 'number_of_adults',
       'srq_latitude', 'srq_longitude', 'check_in_weekday',
       'check_out_weekday', 'srq_weekhour', 'weekday_travel',
       'weekend_travel']]

In [43]:
max(temp.hotel_id)

2129121

In [51]:
temp.loc[temp.search_request_id == 10066088942].label.value_counts()

0    97
1    25
2     2
Name: label, dtype: int64