In [6]:
import gc
from datetime import timedelta
from timeit import default_timer as timer

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.csv as csv
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.parquet as parquet

# Base file name shared by every copy of the dataset.
reddit_place = '2022_place_canvas_history'

# Files under data/convtimestamp (the "large" variants).
large_csv = f'data/convtimestamp/{reddit_place}.csv'
large_parquet = f'data/convtimestamp/{reddit_place}.parquet'

# Files under data/small/day1, one per storage format.
csv_file = f'data/small/day1/{reddit_place}.csv'
orc_file = f'data/small/day1/{reddit_place}.orc'
feather_file = f'data/small/day1/{reddit_place}.feather'
parquet_file = f'data/small/day1/{reddit_place}.parquet'

In [2]:
# Parse the small CSV with pyarrow's CSV reader, then convert the resulting
# Arrow table to a pandas DataFrame.
df = csv.read_csv(csv_file).to_pandas()

In [4]:
# Convert the pandas DataFrame `df` into an Arrow table; the write cells
# below serialise this `table`.
table = pa.Table.from_pandas(df)

In [5]:
# Write `table` as CSV through a zstd-compressing output stream; the stream
# is closed when the `with` block exits.
compressed_csv_path = f'{csv_file}.zstd'
with pa.CompressedOutputStream(compressed_csv_path, 'zstd') as sink:
    csv.write_csv(table, sink)

In [6]:
# Write `table` as Parquet with zstd compression applied by the Parquet
# writer itself. Wrapping the whole file in an external zstd stream would
# produce a file that Parquet readers cannot open directly (they need to
# seek to the footer); the writer's own `compression` option keeps the output
# a regular Parquet file. The destination path is unchanged.
parquet.write_table(table, f'{parquet_file}.zstd', compression='zstd')

In [9]:
# Read the zstd-compressed CSV back into an Arrow table.
# The stream is bound to `compressed_csv` rather than `input`: a `with`
# target stays bound after the block, so the old name replaced the builtin
# `input()` for every later cell.
with pa.CompressedInputStream(pa.OSFile(f'{csv_file}.zstd'), 'zstd') as compressed_csv:
    csv_table = csv.read_csv(compressed_csv)

# Display the number of rows read.
csv_table.num_rows

12278385

In [None]:
# Read only the user_id column from the small Parquet file.
# NOTE(review): this rebinds `df`, which an earlier cell set to a pandas
# DataFrame, to a pyarrow Table — confirm no later cell expects the DataFrame.
df = parquet.read_table(parquet_file, columns=['user_id'])

In [10]:
# Open the small Parquet file as a ParquetFile so that its metadata and
# schema can be inspected in the following cells.
pq = parquet.ParquetFile(parquet_file)

In [11]:
# Collect the column, row and row-group counts from the Parquet file
# metadata and print them as a dict.
meta = pq.metadata
data = dict(
    columns=meta.num_columns,
    rows=meta.num_rows,
    row_groups=meta.num_row_groups,
)
print(data)

{'columns': 5, 'rows': 12278385, 'row_groups': 1}


In [12]:
# Column names as recorded in the Parquet file's schema.
pq.schema.names

['timestamp', 'user_id', 'pixel_color', 'coordinate', 'conv_timestamp']

In [15]:
# Load the original CSV in one go. The path is built from `reddit_place`
# like every other dataset path instead of repeating the file name.
# NOTE(review): the recorded run of this cell was cancelled before it
# finished (see the error output) — confirm the file fits in memory.
t = csv.read_csv(f'data/original/{reddit_place}.csv')

Error: Canceled future for execute_request message before replies were done

In [14]:
# Column names of the table bound to `t`.
# NOTE(review): this cell's execution count (14) is lower than that of the
# cell assigning `t` (15), so the recorded output may come from an earlier
# value of `t` — verify on a fresh kernel.
t.schema.names

['timestamp', 'user_id', 'pixel_color', 'coordinate', 'conv_timestamp']

In [5]:
# Read three columns from the small Parquet file, keeping only rows whose
# pixel_color equals '#FFFFFF'. This rebinds `table`.
table = parquet.read_table(
    parquet_file,
    columns=['conv_timestamp', 'user_id', 'coordinate'],
    filters=[('pixel_color', '=', '#FFFFFF')],
)

In [50]:
# Count distinct user ids in the small Parquet file, reading only the
# user_id column.
user_id_table = parquet.read_table(parquet_file, columns=['user_id'])
users = user_id_table['user_id']
unique_users = users.unique()
print(len(unique_users))

1995588


In [27]:
# Number of distinct pixel colors in the small Parquet file.
# Uses the explicitly imported `pc` (pyarrow.compute): `import pyarrow as pa`
# alone does not guarantee that `pa.compute` is available.
pixel_color_column = parquet.read_table(parquet_file, columns=['pixel_color'])['pixel_color']
# Chunks are merged into a single array before counting, as in the original.
pc.count_distinct(pixel_color_column.combine_chunks())

<pyarrow.Int64Scalar: 16>

In [49]:
# Same distinct-user count, this time from the ORC copy of the data.
orc_user_ids = orc.read_table(orc_file, columns=['user_id'])['user_id']
len(orc_user_ids.unique())

1995588

In [53]:
# Same distinct-user count, this time from the Feather copy of the data.
feather_user_ids = feather.read_table(feather_file, columns=['user_id'])['user_id']
len(feather_user_ids.unique())

1995588

In [54]:
# Same distinct-user count from the CSV copy of the data.
# Only the user_id column is converted, mirroring the `columns=['user_id']`
# projection used for the Parquet, ORC and Feather reads; previously every
# column was materialised just to count one.
user_id_only = csv.ConvertOptions(include_columns=['user_id'])
len(csv.read_csv(csv_file, convert_options=user_id_only)['user_id'].unique())

1995588

In [139]:
# Count how often each pixel color occurs in the small Parquet file.
# Uses the explicitly imported `pc` (pyarrow.compute): `import pyarrow as pa`
# alone does not guarantee that `pa.compute` is available.
pixel_colors = parquet.read_table(parquet_file, columns=['pixel_color'])['pixel_color']
# `flatten()` splits the struct result into two arrays: r[0] holds the
# distinct values, r[1] the matching counts.
r = pc.value_counts(pixel_colors.combine_chunks()).flatten()
# Series of counts indexed by color.
d = pd.Series(r[1], r[0])
print(d)

#000000    2905926
#FFFFFF    1832310
#FF4500    1769590
#FFD635     874595
#2450A4    1149588
#FF99AA     586856
#D4D7D9     177423
#9C6926     169399
#7EED56     375585
#00A368     555354
#51E9F4     522074
#FFA800     408955
#3690EA     294541
#898D90     195329
#B44AC0     109915
#811E9F     350945
dtype: int64


In [147]:
# Pixel-color frequencies from the small Parquet file, displayed as a Series.
# Uses the explicitly imported `pc` (pyarrow.compute): `import pyarrow as pa`
# alone does not guarantee that `pa.compute` is available.
table = parquet.read_table(parquet_file, columns=['pixel_color'])
# After `flatten()`: value_counts[0] = distinct colors, value_counts[1] = counts.
value_counts = pc.value_counts(table['pixel_color'].combine_chunks()).flatten()
pd.Series(value_counts[1], value_counts[0])

#000000    2905926
#FFFFFF    1832310
#FF4500    1769590
#FFD635     874595
#2450A4    1149588
#FF99AA     586856
#D4D7D9     177423
#9C6926     169399
#7EED56     375585
#00A368     555354
#51E9F4     522074
#FFA800     408955
#3690EA     294541
#898D90     195329
#B44AC0     109915
#811E9F     350945
dtype: int64

In [137]:
# Print the index of `d`, i.e. the distinct pixel colors the Series was
# built with.
print(d.index)

Index(['#000000', '#FFFFFF', '#FF4500', '#FFD635', '#2450A4', '#FF99AA',
       '#D4D7D9', '#9C6926', '#7EED56', '#00A368', '#51E9F4', '#FFA800',
       '#3690EA', '#898D90', '#B44AC0', '#811E9F'],
      dtype='object')


In [114]:
# Pair every distinct color with its count as plain Python values.
# `r` is the flattened value-count result: r[0] is the array of distinct
# values and r[1] the array of counts. Iterating `r` and indexing each item
# by field name (the previous form) does not work on that pair of arrays and
# fails when the notebook is run top to bottom.
values = r[0].to_pylist()
counts = r[1].to_pylist()

list(zip(values, counts))

[('#000000', 2905926),
 ('#FFFFFF', 1832310),
 ('#FF4500', 1769590),
 ('#FFD635', 874595),
 ('#2450A4', 1149588),
 ('#FF99AA', 586856),
 ('#D4D7D9', 177423),
 ('#9C6926', 169399),
 ('#7EED56', 375585),
 ('#00A368', 555354),
 ('#51E9F4', 522074),
 ('#FFA800', 408955),
 ('#3690EA', 294541),
 ('#898D90', 195329),
 ('#B44AC0', 109915),
 ('#811E9F', 350945)]

In [132]:
# Small scratch Series: the first argument is the data, the second the index.
b = pd.Series(data=[1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(b)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [148]:
# Pixel-color frequencies computed from the ORC copy of the data.
# Uses the explicitly imported `pc` (pyarrow.compute): `import pyarrow as pa`
# alone does not guarantee that `pa.compute` is available.
orc_pixel_colors = orc.read_table(orc_file, columns=['pixel_color'])['pixel_color']
# After `flatten()`: value_counts[0] = distinct colors, value_counts[1] = counts.
value_counts = pc.value_counts(orc_pixel_colors.combine_chunks()).flatten()
pd.Series(value_counts[1], value_counts[0])

#000000    2905926
#FFFFFF    1832310
#FF4500    1769590
#FFD635     874595
#2450A4    1149588
#FF99AA     586856
#D4D7D9     177423
#9C6926     169399
#7EED56     375585
#00A368     555354
#51E9F4     522074
#FFA800     408955
#3690EA     294541
#898D90     195329
#B44AC0     109915
#811E9F     350945
dtype: int64

In [9]:
# Run a garbage-collection pass before the large dataset is loaded; the
# number of unreachable objects found is shown as the cell output.
# `gc` is imported in the notebook's import cell rather than here, so that
# every import lives in one place.
gc.collect()

0

In [10]:
# Read only the pixel_color column of the large Parquet file.
# This rebinds `table`.
table = parquet.read_table(large_parquet, columns=['pixel_color'])

In [11]:
# Pixel-color frequencies for the large dataset.
# Uses the explicitly imported `pc` (pyarrow.compute): `import pyarrow as pa`
# alone does not guarantee that `pa.compute` is available.
# After `flatten()`: result[0] = distinct colors, result[1] = counts.
result = pc.value_counts(table['pixel_color']).flatten()

In [15]:
# Counts indexed by color, displayed from most to least frequent.
color_counts = pd.Series(result[1], result[0])
color_counts.sort_values(ascending=False)

#000000    33707371
#FFFFFF    32251013
#FF4500    14411389
#2450A4     9989854
#FFD635     8519392
#BE0039     5911641
#51E9F4     5700301
#811E9F     5245484
#FFA800     5059970
#FF99AA     4917801
#3690EA     4058046
#00A368     3892844
#898D90     3459390
#7EED56     3417232
#D4D7D9     3324082
#9C6926     2473639
#FFB470     2104848
#FF3881     1458772
#B44AC0     1287672
#6D482F     1261416
#00CC78     1200067
#493AC1     1139350
#FFF8B8      954606
#515252      868769
#6D001A      621194
#DE107F      589211
#00756F      572572
#6A5CFF      499233
#94B3FF      454142
#009EAA      436068
#E4ABFF      350873
#00CCC0      214862
dtype: int64

In [17]:
# Run another garbage-collection pass; the number of unreachable objects
# found is shown as the cell output.
gc.collect()

0