In [2]:
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import pyarrow as pa
import pandas as pd
from timeit import default_timer as timer
from datetime import timedelta

reddit_place = '2022_place_canvas_history'
parquet_file = f'data/small/day1/{reddit_place}.parquet'
feather_file = f'data/small/day1/{reddit_place}.feather'
orc_file = f'data/small/day1/{reddit_place}.orc'
csv_file = f'data/small/day1/{reddit_place}.csv'


def time_func(func, *args, **kwargs):
    start = timer()
    result = func(*args, **kwargs)
    end = timer()
    total_in_seconds = end - start

    return result, total_in_seconds


def read_pyarrow_to_pandas(filename, filetype):
    """Read file to pyarrow table and convert to pandas dataframe"""
    if filetype == 'parquet':
        df = parquet.read_table(filename).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename).to_pandas()
    elif filetype == 'feather':
        df = feather.read_feather(filename)
    elif filetype == 'csv':
        df = csv.read_csv(filename).to_pandas()
    
    return df.shape


def read_pandas(filename, filetype):
    """Read file to pandas dataframe"""
    if filetype == 'parquet':
        df = pd.read_parquet(filename)
    elif filetype == 'orc':
        df = pd.read_orc(filename)
    elif filetype == 'feather':
        df = pd.read_feather(filename)
    elif filetype == 'csv':
        df = csv.read_csv(filename).to_pandas()
    
    return df.shape

In [31]:
#orc.ORCFile(orc_file).nrows
pa.csv.read_csv()
csv.read_csv()

TypeError: Do not call CSVStreamingReader's constructor directly, use pyarrow.csv.open_csv() instead.

In [24]:
pd.unique(orc.ORCFile(orc_file).read(columns=['user_id']).to_pandas()['user_id']).shape[0]

1995588

In [19]:
orc.ORCFile(orc_file).nstripes

25

In [29]:
orc.ORCFile(orc_file).read_stripe(0).to_pandas()

Unnamed: 0,timestamp,user_id,pixel_color,coordinate,conv_timestamp,__index_level_0__
0,2022-04-01 15:38:01.116 UTC,WYFuP/nwVCIBrw5XOVYKsHyo/fJpOJcIXCm75iKLqEz92B...,#000000,191662,2022-04-01 15:38:01.116,10119006
1,2022-04-01 15:38:01.124 UTC,1Fie0j8msAiBmD5+NfV4SdY6ilMDHV9XJc6zWdhgAyrc8H...,#000000,23918,2022-04-01 15:38:01.124,10119007
2,2022-04-01 15:38:01.13 UTC,qvqMCBzdQyIL1ET+iZvXilEjCrt7cjq3oPG3uUad8tWi1X...,#FFFFFF,722727,2022-04-01 15:38:01.130,10119008
3,2022-04-01 15:38:01.131 UTC,TGLWFhne7tzE8iBhUWm7K3m6SrH+2xBl599XMhdtvV6RqV...,#FF4500,64546,2022-04-01 15:38:01.131,10119009
4,2022-04-01 15:38:01.134 UTC,C1E2rtkIcBP+omujs+YcSHfdWfCBpcxq6uIQpSlekwfyIq...,#FFFFFF,90483,2022-04-01 15:38:01.134,10119010
...,...,...,...,...,...,...
502779,2022-04-01 16:07:25.616 UTC,zQsRzHfoACVitJvTkn+f8mjI7FrzrX1otiTYhdIsM3T4cP...,#000000,184680,2022-04-01 16:07:25.616,10621785
502780,2022-04-01 16:07:25.617 UTC,eOrq878qwLnBFKzcCJ8JICUFlYrxfeJ6m5spH8LF/iP0q6...,#000000,800123,2022-04-01 16:07:25.617,10621786
502781,2022-04-01 16:07:25.618 UTC,SBF9JwY4y64x+5Bs3C5wVh+i3FdeS5buL3SfN502AGPPbE...,#FF4500,727375,2022-04-01 16:07:25.618,10621787
502782,2022-04-01 16:07:25.62 UTC,UEO8hC5hjwstBDem0yNH68W1CWLx4GQI+LDGlTQmmOi4Oz...,#000000,25069,2022-04-01 16:07:25.620,10621788


In [32]:
df = pd.read_csv(csv_file, usecols=['user_id'])

In [33]:
df.head()

Unnamed: 0,user_id
0,WYFuP/nwVCIBrw5XOVYKsHyo/fJpOJcIXCm75iKLqEz92B...
1,1Fie0j8msAiBmD5+NfV4SdY6ilMDHV9XJc6zWdhgAyrc8H...
2,qvqMCBzdQyIL1ET+iZvXilEjCrt7cjq3oPG3uUad8tWi1X...
3,TGLWFhne7tzE8iBhUWm7K3m6SrH+2xBl599XMhdtvV6RqV...
4,C1E2rtkIcBP+omujs+YcSHfdWfCBpcxq6uIQpSlekwfyIq...


In [34]:
df2 = pd.read_csv(csv_file)

In [51]:
feather.read_feather(feather_file, use_threads=True)

Unnamed: 0,timestamp,user_id,pixel_color,coordinate,conv_timestamp,__index_level_0__
0,2022-04-01 15:38:01.116 UTC,WYFuP/nwVCIBrw5XOVYKsHyo/fJpOJcIXCm75iKLqEz92B...,#000000,191662,2022-04-01 15:38:01.116000+00:00,10119006
1,2022-04-01 15:38:01.124 UTC,1Fie0j8msAiBmD5+NfV4SdY6ilMDHV9XJc6zWdhgAyrc8H...,#000000,23918,2022-04-01 15:38:01.124000+00:00,10119007
2,2022-04-01 15:38:01.13 UTC,qvqMCBzdQyIL1ET+iZvXilEjCrt7cjq3oPG3uUad8tWi1X...,#FFFFFF,722727,2022-04-01 15:38:01.130000+00:00,10119008
3,2022-04-01 15:38:01.131 UTC,TGLWFhne7tzE8iBhUWm7K3m6SrH+2xBl599XMhdtvV6RqV...,#FF4500,64546,2022-04-01 15:38:01.131000+00:00,10119009
4,2022-04-01 15:38:01.134 UTC,C1E2rtkIcBP+omujs+YcSHfdWfCBpcxq6uIQpSlekwfyIq...,#FFFFFF,90483,2022-04-01 15:38:01.134000+00:00,10119010
...,...,...,...,...,...,...
12278380,2022-04-01 23:59:59.993 UTC,uWEWZve8leRocb4Dipm7yCLM3gN6Weq3uh982BxOFuLUJF...,#FFFFFF,566552,2022-04-01 23:59:59.993000+00:00,34997263
12278381,2022-04-01 23:59:59.994 UTC,X9wV3qCoxF1tysu8za1DMRfGCVzXTTv2RRZi6PqTLcHIZt...,#000000,618533,2022-04-01 23:59:59.994000+00:00,34997264
12278382,2022-04-01 23:59:59.994 UTC,ahlMMSchqTWmpXhQ8fW+nAVqtMvX/SneiGhY7Mr+o4xzOH...,#FF4500,371451,2022-04-01 23:59:59.994000+00:00,34997265
12278383,2022-04-01 23:59:59.995 UTC,JEMIFsk+fG9KnqzfraQG5V4rjg2OvrHLFz/AGBtRM1zBVZ...,#000000,566449,2022-04-01 23:59:59.995000+00:00,34997266


In [52]:
parquet.read_table(parquet_file, use_threads=True).to_pandas()

Unnamed: 0,timestamp,user_id,pixel_color,coordinate,conv_timestamp,__index_level_0__
0,2022-04-01 15:38:01.116 UTC,WYFuP/nwVCIBrw5XOVYKsHyo/fJpOJcIXCm75iKLqEz92B...,#000000,191662,2022-04-01 15:38:01.116000+00:00,10119006
1,2022-04-01 15:38:01.124 UTC,1Fie0j8msAiBmD5+NfV4SdY6ilMDHV9XJc6zWdhgAyrc8H...,#000000,23918,2022-04-01 15:38:01.124000+00:00,10119007
2,2022-04-01 15:38:01.13 UTC,qvqMCBzdQyIL1ET+iZvXilEjCrt7cjq3oPG3uUad8tWi1X...,#FFFFFF,722727,2022-04-01 15:38:01.130000+00:00,10119008
3,2022-04-01 15:38:01.131 UTC,TGLWFhne7tzE8iBhUWm7K3m6SrH+2xBl599XMhdtvV6RqV...,#FF4500,64546,2022-04-01 15:38:01.131000+00:00,10119009
4,2022-04-01 15:38:01.134 UTC,C1E2rtkIcBP+omujs+YcSHfdWfCBpcxq6uIQpSlekwfyIq...,#FFFFFF,90483,2022-04-01 15:38:01.134000+00:00,10119010
...,...,...,...,...,...,...
12278380,2022-04-01 23:59:59.993 UTC,uWEWZve8leRocb4Dipm7yCLM3gN6Weq3uh982BxOFuLUJF...,#FFFFFF,566552,2022-04-01 23:59:59.993000+00:00,34997263
12278381,2022-04-01 23:59:59.994 UTC,X9wV3qCoxF1tysu8za1DMRfGCVzXTTv2RRZi6PqTLcHIZt...,#000000,618533,2022-04-01 23:59:59.994000+00:00,34997264
12278382,2022-04-01 23:59:59.994 UTC,ahlMMSchqTWmpXhQ8fW+nAVqtMvX/SneiGhY7Mr+o4xzOH...,#FF4500,371451,2022-04-01 23:59:59.994000+00:00,34997265
12278383,2022-04-01 23:59:59.995 UTC,JEMIFsk+fG9KnqzfraQG5V4rjg2OvrHLFz/AGBtRM1zBVZ...,#000000,566449,2022-04-01 23:59:59.995000+00:00,34997266


In [13]:
pd.unique(csv.read_csv(csv_file, read_options=csv.ReadOptions(use_threads=True)).to_pandas()['user_id']).shape[0]

1995588

In [9]:
pd.unique(csv.read_csv(csv_file, read_options=csv.ReadOptions(use_threads=False)).to_pandas()['user_id']).shape[0]

1995588

In [11]:
pd.unique(csv.read_csv(csv_file).to_pandas()['user_id']).shape[0]

1995588

In [None]:
df2.head()

In [19]:
rp = parquet.read_table(parquet_file, memory_map=True)

In [20]:
del rp

In [None]:
rp2 = parquet.read_table(parquet_file, filters=[()])


In [46]:

#pa.TimestampArray.from_pandas()
pa.TimestampArray.from_pandas
pd.Timestamp(2022, 4, 1, 16, 0, 0, 0, 0, 'utc')

AttributeError: 'builtin_function_or_method' object has no attribute 'from_pandas'

In [47]:

t = parquet.read_table(parquet_file, columns=['conv_timestamp'])

In [57]:
import datetime
from datetime import timezone
datetime.datetime(2022, 4, 1, 16, tzinfo=timezone.utc)

'2022-04-01T16:00:00+00:00'

In [None]:
parquet.read_table(parquet_file)

In [61]:
df = parquet.read_table(parquet_file, filters=[('conv_timestamp', '<', datetime.datetime(2022, 4, 1, 16, tzinfo=timezone.utc))]).to_pandas()

In [68]:
df = parquet.read_table(parquet_file).to_pandas()

In [174]:
un = parquet.read_table(f'data/raw/parquet/{reddit_place}.parquet', columns=['pixel_color'])['pixel_color'].combine_chunks().unique()
print(len(un))

32


In [173]:
if t:
    del t

In [74]:
import pyarrow.compute as pc
pc.count_distinct(table['pixel_color'])

<pyarrow.Int64Scalar: 4064>

In [118]:
ap = parquet.read_table(parquet_file)
r = pc.count_distinct(ap['user_id'].combine_chunks())
print(r)

1995588


In [165]:
ap = len(parquet.read_table(parquet_file, columns=['pixel_color'])['pixel_color'].combine_chunks().unique())
print(ap)

16


In [161]:
cv = len(csv.read_csv(csv_file)['pixel_color'].combine_chunks().unique())
print(cv)

16


In [162]:
orca = len(orc.read_table(orc_file, columns=['pixel_color'])['pixel_color'].combine_chunks().unique())
print(orca)

16


In [164]:
print(len(feather.read_table(feather_file, columns=['pixel_color'])['pixel_color'].combine_chunks().unique()))

16


In [138]:
df = ap['pixel_color'].combine_chunks().value_counts().to_pandas()
df.columns()

AttributeError: 'Series' object has no attribute 'columns'

In [91]:
pc.count_distinct(ap['pixel_color'], mode='only_valid')


<pyarrow.Int64Scalar: 192>

In [94]:
pc.count_distinct(ap['pixel_color'][:10])

<pyarrow.Int64Scalar: 4>

In [112]:
len(ap['pixel_color'].combine_chunks().unique())

16

In [109]:
sum = pa.scalar(0)
for chunk in ap['pixel_color'].iterchunks():
    sum = pc.add(sum, pc.count_distinct(chunk).cast('int64'))

In [110]:
sum

<pyarrow.Int64Scalar: 192>

In [63]:
df['conv_timestamp'].describe()

  df['conv_timestamp'].describe()


count                              2413550
unique                             2082516
top       2022-04-01 15:17:21.204000+00:00
freq                                    40
first     2022-04-01 12:44:10.315000+00:00
last      2022-04-01 15:59:59.990000+00:00
Name: conv_timestamp, dtype: object

In [67]:
filter = df['conv_timestamp'] == '2022-04-01 12:44:10.315000+00:00'
df[filter].head()

Unnamed: 0,timestamp,user_id,pixel_color,coordinate,conv_timestamp,__index_level_0__
374770,2022-04-01 12:44:10.315 UTC,lEjremCtNoQaJ6KGBSWsatGEMXwjqoQqGZesWxHdyPetpA...,#7EED56,4242,2022-04-01 12:44:10.315000+00:00,18070668
