In [1]:
import os
import pandas as pd
import numpy as np
import random
import json
import torch
import torch.utils.data as data
import pickle
from scipy import sparse

In [2]:
data_path = '../data/raw/small_100'
feature_path = 'src/data/schemas/output_data_schemas.json'

In [31]:
def read_parquet(data_path, num_partitions: None, randomize = True, verbose = True, columns = ['hotel_id', 'user_id','label']):
    files = os.listdir(data_path)
    if randomize:
        random.shuffle(files)
    
    if num_partitions is None:
        num_partitions = len(files)
    
    data = []
    num_reads = 0
    for file_path in files:
        if num_reads >= num_partitions:
            if verbose:
                print('Finished reading {} .parquet Files'.format(num_partitions))
            break
        
        _ , ext = os.path.splitext(file_path)
        
        if ext == '.parquet':
            fp = os.path.join(data_path, file_path)
            data.append(pd.read_parquet(os.path.join(data_path, file_path), columns = columns))
            
            if verbose:
                print('Reading in data from {}'.format(fp))
                print('Data of shape {}'.format(data[-1].shape))
            
            num_reads += 1
        else: 
            continue
    data = pd.concat(data, axis=0)
    
    if verbose:
        print('Total dataframe of shape {}'.format(data.shape))
    
    return data

In [32]:
df = read_parquet(data_path, randomize = False, num_partitions = None)

Reading in data from ../data/raw/small_100/0017_part_00.parquet
Data of shape (49177, 3)
Reading in data from ../data/raw/small_100/0050_part_00.parquet
Data of shape (49072, 3)
Reading in data from ../data/raw/small_100/0074_part_00.parquet
Data of shape (49273, 3)
Reading in data from ../data/raw/small_100/0049_part_00.parquet
Data of shape (49153, 3)
Reading in data from ../data/raw/small_100/0033_part_00.parquet
Data of shape (49306, 3)
Reading in data from ../data/raw/small_100/0095_part_00.parquet
Data of shape (49172, 3)
Reading in data from ../data/raw/small_100/0046_part_00.parquet
Data of shape (49345, 3)
Reading in data from ../data/raw/small_100/0001_part_00.parquet
Data of shape (48765, 3)
Reading in data from ../data/raw/small_100/0018_part_00.parquet
Data of shape (49091, 3)
Reading in data from ../data/raw/small_100/0083_part_00.parquet
Data of shape (49067, 3)
Reading in data from ../data/raw/small_100/0025_part_00.parquet
Data of shape (49103, 3)
Reading in data from 

In [49]:
data_path = '../data/interim/final_reindex_train.csv'


In [50]:
new_data = pd.read_csv('../data/interim/final_reindex_train.csv')

In [78]:
sparse_data = sparse.csr_matrix((new_data.label, (new_data.user_id, new_data.hotel_id)))

In [75]:
sparse_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [73]:
(new_data.loc[new_data.user_id ==2].hotel_id == 10579).sum()

0

In [76]:
dense = sparse_data.toarray()

In [77]:
np.where(dense == 3)

(array([    2,     4,     4, ..., 16311, 16312, 16313]),
 array([ 2760,  2519,  3594, ..., 56888, 64275, 96557]))

In [8]:
with open( '../data/processed/user_to_queries.pkl','rb') as fp:
    a = pickle.load(fp)
    
with open(os.path.join('../data/processed/hotel_hash.json'), 'r') as fp:
    hotel_ids = json.load(fp)

In [10]:
user_interactions = {key:value[1] for (key, value) in a.items()}

In [26]:
user_interactions[0,1,2]

KeyError: (0, 1, 2)

In [17]:
len(hotel_ids.values())

96742

In [21]:
data = sparse.dok_matrix((len(b.keys()),len(hotel_ids.values())))

In [25]:
for i in user_interactions.keys():
    for j in user_interactions[i].keys():
        data[i,j] = user_interactions[i][j]
        


In [27]:
from torch.utils.data import Dataset

class BasicHotelDataset(Dataset):

    def __init__(self, data_path = None, dict_path = None):
        """
        Args
            data_path (string): Path to the csv file
        """
        if data_path is None:
            raise ValueError('Please specify data_path')
        if dict_path is None:
            raise ValueError('Need path of hashes')
        
        _ , ext = os.path.splitext(data_path)
        if ext != 'csv':
            raise ValueError('Incorrect File to upload')
        
        _, ext2 = os.path.splitext(dict_path)
        if ext2 != 'pkl':
            raise ValueError('Incorrect File to use as indicies')
        
        with open(data_path,'rb') as fp:
            self.data = pickle.load(fp)
        
        self.data = {key: value[1] for (key, value) in self.data.items()}

        with open(os.path.join(dict_path, 'hotel_hash.json'), 'r') as fp:
            self.hotel_length = len(json.load(fp))
        
        def __len__(self):
            return len(self.data.keys())

        def __getitem__(self, idx):
            if torch.is_tensor(idx):
                idx = idx.tolist

            user_interactions = [self.data[k] for k in idx] #list of dicts
            sparse_dok = sparse.dok_matrix((len(idx),self.hotel_length))
            for i in range(len(user_interactions)):
                for j in user_interactions[i].keys():
                    sparse_dok[i,j] = user_interactions[i][j]
           
            return torch.tensor(sparse_dok.toarray())