In [45]:
import os
import pandas as pd
import numpy as np
import random
import json
import torch
import torch.utils.data as data

from scipy import sparse

In [2]:
data_path = '../data/raw/small_100'
feature_path = 'src/data/schemas/output_data_schemas.json'

In [31]:
def read_parquet(data_path, num_partitions: None, randomize = True, verbose = True, columns = ['hotel_id', 'user_id','label']):
    files = os.listdir(data_path)
    if randomize:
        random.shuffle(files)
    
    if num_partitions is None:
        num_partitions = len(files)
    
    data = []
    num_reads = 0
    for file_path in files:
        if num_reads >= num_partitions:
            if verbose:
                print('Finished reading {} .parquet Files'.format(num_partitions))
            break
        
        _ , ext = os.path.splitext(file_path)
        
        if ext == '.parquet':
            fp = os.path.join(data_path, file_path)
            data.append(pd.read_parquet(os.path.join(data_path, file_path), columns = columns))
            
            if verbose:
                print('Reading in data from {}'.format(fp))
                print('Data of shape {}'.format(data[-1].shape))
            
            num_reads += 1
        else: 
            continue
    data = pd.concat(data, axis=0)
    
    if verbose:
        print('Total dataframe of shape {}'.format(data.shape))
    
    return data

In [32]:
df = read_parquet(data_path, randomize = False, num_partitions = None)

Reading in data from ../data/raw/small_100/0017_part_00.parquet
Data of shape (49177, 3)
Reading in data from ../data/raw/small_100/0050_part_00.parquet
Data of shape (49072, 3)
Reading in data from ../data/raw/small_100/0074_part_00.parquet
Data of shape (49273, 3)
Reading in data from ../data/raw/small_100/0049_part_00.parquet
Data of shape (49153, 3)
Reading in data from ../data/raw/small_100/0033_part_00.parquet
Data of shape (49306, 3)
Reading in data from ../data/raw/small_100/0095_part_00.parquet
Data of shape (49172, 3)
Reading in data from ../data/raw/small_100/0046_part_00.parquet
Data of shape (49345, 3)
Reading in data from ../data/raw/small_100/0001_part_00.parquet
Data of shape (48765, 3)
Reading in data from ../data/raw/small_100/0018_part_00.parquet
Data of shape (49091, 3)
Reading in data from ../data/raw/small_100/0083_part_00.parquet
Data of shape (49067, 3)
Reading in data from ../data/raw/small_100/0025_part_00.parquet
Data of shape (49103, 3)
Reading in data from 

In [49]:
data_path = '../data/interim/final_reindex_train.csv'


In [50]:
new_data = pd.read_csv('../data/interim/final_reindex_train.csv')

In [78]:
sparse_data = sparse.csr_matrix((new_data.label, (new_data.user_id, new_data.hotel_id)))

In [75]:
sparse_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [73]:
(new_data.loc[new_data.user_id ==2].hotel_id == 10579).sum()

0

In [76]:
dense = sparse_data.toarray()

In [77]:
np.where(dense == 3)

(array([    2,     4,     4, ..., 16311, 16312, 16313]),
 array([ 2760,  2519,  3594, ..., 56888, 64275, 96557]))