### Numpy

In [2]:
import numpy as np

### Pandas

In [3]:
import pandas as pd

### Pyarrow

In [11]:
import pyarrow as pa

### Parquet from pyarrow

In [9]:
import pyarrow.parquet as pq

### 1. Generate a dataframe

In [7]:
# Small example frame with one column per dtype: float (with a missing value),
# string and bool. The string labels passed as `index` become the row index.
df = pd.DataFrame(
    {
        'column_1': [-5, np.nan, 2.5],
        'column_2': ['apple', 'banana', 'orange'],
        'column_3': [True, False, True],
    },
    index=('row_1', 'row_2', 'row_3'),
)
df.head()

Unnamed: 0,column_1,column_2,column_3
row_1,-5.0,apple,True
row_2,,banana,False
row_3,2.5,orange,True


### 2. Get table from pandas dataframe

In [18]:
# Convert the DataFrame to an Arrow table. The index is carried along as an
# extra column (`__index_level_0__` in the schema printed in 2.2), which is
# why the shape is (3, 4) rather than (3, 3).
table = pa.Table.from_pandas(df)

(3, 4)

### 2.1 Print number of rows (num_rows) and columns (num_columns)

In [19]:
table.shape

(3, 4)

### 2.2 Print schema

In [21]:
table.schema

column_1: double
column_2: string
column_3: bool
__index_level_0__: string
-- schema metadata --
pandas: '{"index_columns": ["__index_level_0__"], "column_indexes": [{"na' + 676

### 3. Write table to .parquet format

In [22]:
# Persist the Arrow table as a single Parquet file; the relative path means it
# lands in the current working directory.
pq.write_table(table, 'example.parquet')

### 4. Read .parquet file

In [23]:
# Load the whole file back into an Arrow table.
table_recovered = pq.read_table('example.parquet')

In [24]:
table_recovered.to_pandas()

Unnamed: 0,column_1,column_2,column_3
row_1,-5.0,apple,True
row_2,,banana,False
row_3,2.5,orange,True


### 5. Omitting the DataFrame index (passing `preserve_index=False`)

### 5.1 Recreate the dataframe from section 1

In [30]:
# Same data and row labels as the frame built in section 1.
df = pd.DataFrame(
    data={
        'column_1': [-5, np.nan, 2.5],
        'column_2': ['apple', 'banana', 'orange'],
        'column_3': [True, False, True],
    },
    index=('row_1', 'row_2', 'row_3'),
)

### 5.2 Get table with preserve_index=False

In [31]:
# preserve_index=False leaves the row labels out of the Arrow table, so the
# round trip in 5.3 shows a default 0..2 index instead of row_1..row_3.
# NOTE: this rebinds `table`; the later sections (7.2 onward) write this
# 3-column version, not the 4-column one from section 2.
table = pa.Table.from_pandas(df, preserve_index=False)

### 5.3 Table result

In [32]:
table.to_pandas()

Unnamed: 0,column_1,column_2,column_3
0,-5.0,apple,True
1,,banana,False
2,2.5,orange,True


### 6. Reading .parquet with ParquetFile class (more features)

In [33]:
# Open the file written in section 3. That file was produced from the table
# that still had the index column, hence num_columns: 4 in the metadata below.
parquet_file = pq.ParquetFile('example.parquet')

### 6.1 Metadata

In [35]:
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7f3b07c784a0>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 4
  num_rows: 3
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2708

### 6.2 Schema

In [37]:
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x7f3b07bd1580>
required group field_id=0 schema {
  optional double field_id=1 column_1;
  optional binary field_id=2 column_2 (String);
  optional boolean field_id=3 column_3;
  optional binary field_id=4 __index_level_0__ (String);
}

### 7 Row group
A `.parquet` file can consist of multiple row groups, according to the Apache Parquet documentation. The `read_table` method reads all of the row groups and concatenates them into a single table.

In [38]:
parquet_file.num_row_groups

1

### 7.1 Read individual row groups

In [41]:
parquet_file.read_row_group(0).to_pandas()

Unnamed: 0,column_1,column_2,column_3
row_1,-5.0,apple,True
row_2,,banana,False
row_3,2.5,orange,True


### 7.2 Create a parquet file with multiple row groups by using `ParquetWriter` class

In [42]:
# Open a writer for a new file using the schema of `table`, then close it right
# away. Left open, the writer would keep the file handle alive and never
# finalize the file, while section 7.3 reopens the very same path in a `with`
# block. Closing here leaves no dangling handle behind.
writer = pq.ParquetWriter('example_result.parquet', table.schema)
writer.close()

### 7.3 Creating a .parquet file with 2 row groups

In [49]:
# Writing the 3-row table twice through one writer produces the file that
# section 7.4 reports as num_row_groups: 2 / num_rows: 6. The `with` block
# closes the writer on exit.
with pq.ParquetWriter('example_result.parquet', table.schema) as writer:
    for i in range(2):
        writer.write_table(table)

### 7.4 Read parquet and print result

In [54]:
def read_parquet(file):
    """Open the Parquet file at ``file`` and return its ``pq.ParquetFile`` object.

    No table is read here: the caller receives the ``ParquetFile`` itself,
    which the following cells use to inspect ``.metadata``.
    """
    return pq.ParquetFile(file)

In [55]:
# Inspect the file written in 7.3: the metadata below reports
# num_row_groups: 2 and num_rows: 6 (the 3-row table written twice).
file = 'example_result.parquet'
read_parquet(file).metadata

<pyarrow._parquet.FileMetaData object at 0x7f3b07b6b630>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 3
  num_rows: 6
  num_row_groups: 2
  format_version: 1.0
  serialized_size: 2157

### 8 Compression (gzip, brotli or snappy[default])

In [51]:
table.to_pandas()

Unnamed: 0,column_1,column_2,column_3
0,-5.0,apple,True
1,,banana,False
2,2.5,orange,True


### 8.1 Compress and print result

In [57]:
# Write the same table with an explicitly named codec. Per the section title,
# 'snappy' is also the default, so this only spells the choice out.
# Note that `file` is rebound here (it previously pointed at example_result.parquet).
file = 'example_compress.snappy.parquet'
pq.write_table(table, file, compression='snappy')
read_parquet(file).metadata

<pyarrow._parquet.FileMetaData object at 0x7f3ac81019f0>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 3
  num_rows: 3
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 1907

### 9 Partitioned Dataset

In [68]:
table.to_pandas()

Unnamed: 0,column_1,column_2,column_3
0,-5.0,apple,True
1,,banana,False
2,2.5,orange,True


### 9.1 Writing Partitioned Dataset

In [70]:
# Local dataset write
# One sub-directory is created per distinct value of the partition column
# (column_2=apple, column_2=banana, column_2=orange in the listing below).
# NOTE(review): the file names look randomly generated, so re-running this cell
# likely adds new files next to the old ones rather than overwriting — confirm.
pq.write_to_dataset(table, root_path='dataset_name',
                    partition_cols=['column_2'])

In [71]:
!tree -C -I 'example*|*.ipynb'

.
└── dataset_name
    ├── column_2=apple
    │   └── 8b5a106227e14a7b83bac02e9e51ed53.parquet
    ├── column_2=banana
    │   └── ad7f234de53f4efd885be79921b4fc06.parquet
    └── column_2=orange
        └── 149899fcdf4c4ec49098f2367193ae7f.parquet

4 directories, 3 files


### 9.2 Reading from Partitioned Dataset

In [74]:
# Read every partition under dataset_name/ back into one table.
# In the output below, column_2 (the partition column) comes back as the last
# column rather than in its original position.
dataset = pq.ParquetDataset('dataset_name/')
table_result = dataset.read()
table_result.to_pandas()

Unnamed: 0,column_1,column_3,column_2
0,-5.0,True,apple
1,,False,banana
2,2.5,True,orange
