In [1]:
from random import sample

import pyarrow.parquet as pq
from pathlib import Path

In [2]:
# Data directory: the "data" folder located in the parent of the current
# working directory.
data_path = Path.cwd().parent.joinpath("data")

# POSIX-style string path of the Netflix parquet file inside the data directory.
netflix_raw_path = data_path.joinpath("netflixFromArrow.parquet").as_posix()

print(netflix_raw_path)

/home/pliu/git/ParquetDuckDB/data/netflixFromArrow.parquet


In [3]:
# Absolute path of the source sf_fire parquet file.
# NOTE(review): this path is machine-specific (a mounted share), so the cells
# that read it will only run where that mount exists.
sf_fire_path = "/mnt/hgfs/ubuntu_share/data_set/sf_fire/sf_fire_snappy.parquet"


In [4]:
def show_parquet_schema(file_path: str) -> None:
    """
    Print the schema of a parquet file.

    :param file_path: path of the parquet file to inspect
    :return: None, the schema is written to stdout
    """
    # Use the ParquetFile as a context manager so the underlying file handle
    # is closed as soon as the schema has been printed.
    with pq.ParquetFile(file_path) as parquet_file:
        print(parquet_file.schema)


In [5]:
def show_parquet_summery_metadata(file_path: str) -> None:
    """
    Print the file-level metadata summary of a parquet file.

    :param file_path: path of the parquet file to inspect
    :return: None, the metadata summary is written to stdout
    """
    # Use the ParquetFile as a context manager so the underlying file handle
    # is closed as soon as the metadata has been printed.
    with pq.ParquetFile(file_path) as parquet_file:
        print(parquet_file.metadata)

In [6]:
def show_parquet_detail_metadata(file_path: str) -> None:
    """
    Print row-group and column-chunk level metadata of a parquet file.

    For every row group the number of rows and total byte size are printed,
    followed by, for each column chunk, its schema path, compressed and
    uncompressed sizes and (when present) its min/max/null/distinct statistics.

    :param file_path: path of the parquet file to inspect
    :return: None, the report is written to stdout
    """
    # Use the ParquetFile as a context manager so the underlying file handle
    # is closed once the whole report has been printed.
    with pq.ParquetFile(file_path) as parquet_file:
        num_row_groups = parquet_file.num_row_groups
        print(f"Number of row groups: {num_row_groups}")

        for i in range(num_row_groups):
            row_group_metadata = parquet_file.metadata.row_group(i)
            print(f"\nRow Group {i}:")
            print(f"  Number of Rows: {row_group_metadata.num_rows}")
            print(f"  Total Byte Size: {row_group_metadata.total_byte_size}")

            # One column chunk per column inside the current row group
            for col in range(row_group_metadata.num_columns):
                col_metadata = row_group_metadata.column(col)
                print(f"  Column {col}: {col_metadata.path_in_schema}")
                print(f"    Encoded Bytes: {col_metadata.total_compressed_size}")
                print(f"    Uncompressed Bytes: {col_metadata.total_uncompressed_size}")

                # Statistics are optional: fall back to a notice when absent
                col_stats = col_metadata.statistics
                if col_stats:
                    print(f"    Min: {col_stats.min}")
                    print(f"    Max: {col_stats.max}")
                    print(f"    Null Count: {col_stats.null_count}")
                    print(f"    Distinct Count: {col_stats.distinct_count}")
                else:
                    print("    No statistics available")

In [11]:
# Print the parquet schema of the Netflix file.
show_parquet_schema(netflix_raw_path)

<pyarrow._parquet.ParquetSchema object at 0x7900c9b43bc0>
required group field_id=-1 schema {
  optional double field_id=-1 As_of;
  optional double field_id=-1 Rank;
  optional binary field_id=-1 Year_to_Date_Rank (String);
  optional binary field_id=-1 Last_Week_Rank (String);
  optional binary field_id=-1 Title (String);
  optional binary field_id=-1 Type (String);
  optional binary field_id=-1 Netflix_Exclusive (String);
  optional double field_id=-1 Netflix_Release_Date;
  optional double field_id=-1 Days_In_Top_10;
  optional double field_id=-1 Viewership_Score;
}



In [18]:
# Print the file-level metadata summary of the Netflix file.
show_parquet_summery_metadata(netflix_raw_path)

<class 'pyarrow.parquet.core.ParquetFile'>
<pyarrow._parquet.FileMetaData object at 0x7d42841b3560>
  created_by: parquet-cpp-arrow version 14.0.0
  num_columns: 10
  num_rows: 7100
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 11729


In [23]:
# Print the per-row-group / per-column metadata of the Netflix file.
show_parquet_detail_metadata(netflix_raw_path)

Number of row groups: 1

Row Group 0:
  Number of Rows: 7100
  Total Byte Size: 82017
  Column 0: As_of
    Encoded Bytes: 4910
    Uncompressed Bytes: 7899
    Min: 22006.0
    Max: 22715.0
    Null Count: 0
    Distinct Count: None
  Column 1: Rank
    Encoded Bytes: 341
    Uncompressed Bytes: 3734
    Min: 1.0
    Max: 10.0
    Null Count: 0
    Distinct Count: None
  Column 2: Year_to_Date_Rank
    Encoded Bytes: 3675
    Uncompressed Bytes: 3675
    Min: -
    Max: 9
    Null Count: 0
    Distinct Count: None
  Column 3: Last_Week_Rank
    Encoded Bytes: 3516
    Uncompressed Bytes: 3665
    Min: -
    Max: 9
    Null Count: 0
    Distinct Count: None
  Column 4: Title
    Encoded Bytes: 18546
    Uncompressed Bytes: 21999
    Min: #Alive
    Max: ÃƒÂ‰lite
    Null Count: 0
    Distinct Count: None
  Column 5: Type
    Encoded Bytes: 1903
    Uncompressed Bytes: 1900
    Min: Concert
    Max: TV Show
    Null Count: 0
    Distinct Count: None
  Column 6: Netflix_Exclusive
    Enc

In [26]:
# Print the file-level metadata summary of the original sf_fire file.
show_parquet_summery_metadata(sf_fire_path)

<pyarrow._parquet.FileMetaData object at 0x7d428406e110>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 34
  num_rows: 5500519
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 15726


In [27]:
# Print the per-row-group / per-column metadata of the original sf_fire file.
show_parquet_detail_metadata(sf_fire_path)

Number of row groups: 1

Row Group 0:
  Number of Rows: 5500519
  Total Byte Size: 624925346
  Column 0: CallNumber
    Encoded Bytes: 22193216
    Uncompressed Bytes: 22192083
    Min: 1030101
    Max: 210630242
    Null Count: 0
    Distinct Count: None
  Column 1: UnitID
    Encoded Bytes: 6786693
    Uncompressed Bytes: 6789212
    Min: 27
    Max: VAN9
    Null Count: 0
    Distinct Count: None
  Column 2: IncidentNumber
    Encoded Bytes: 22193224
    Uncompressed Bytes: 22192083
    Min: 30612
    Max: 21027766
    Null Count: 0
    Distinct Count: None
  Column 3: CallType
    Encoded Bytes: 2725065
    Uncompressed Bytes: 3363929
    Min: Administrative
    Max: Watercraft in Distress
    Null Count: 0
    Distinct Count: None
  Column 4: CallDate
    Encoded Bytes: 8893484
    Uncompressed Bytes: 9012863
    Min: 01/01/2001
    Max: 12/31/2020
    Null Count: 0
    Distinct Count: None
  Column 5: WatchDate
    Encoded Bytes: 8892937
    Uncompressed Bytes: 9013011
    Min: 0

In [28]:
# Open the original sf_fire file as a pyarrow ParquetFile object.
# NOTE(review): sf_fire_file is not referenced by any later cell visible here —
# confirm it is still needed.
sf_fire_file = pq.ParquetFile(sf_fire_path)

In [5]:
# Load the whole source file as an Arrow table, then write it back into the
# local data directory with snappy compression and explicit row-group and
# data-page sizes.
sf_fire_table = pq.read_table(sf_fire_path)
out_path = data_path.joinpath("sf_fire_snappy.parquet")
pq.write_table(
    sf_fire_table,
    out_path,
    compression="snappy",
    row_group_size=500000,
    data_page_size=65536,
)

In [6]:
# POSIX-style string path of the rewritten sf_fire parquet file.
sf_fire_new_path = data_path.joinpath("sf_fire_snappy.parquet").as_posix()

In [12]:
# Print the file-level metadata summary of the rewritten sf_fire file.
show_parquet_summery_metadata(sf_fire_new_path)

<pyarrow._parquet.FileMetaData object at 0x7900c9b964d0>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 34
  num_rows: 5500519
  num_row_groups: 12
  format_version: 2.6
  serialized_size: 54665


In [13]:
# Print the per-row-group / per-column metadata of the rewritten sf_fire file.
show_parquet_detail_metadata(sf_fire_new_path)

Number of row groups: 12

Row Group 0:
  Number of Rows: 500000
  Total Byte Size: 116609411
  Column 0: CallNumber
    Encoded Bytes: 2133805
    Uncompressed Bytes: 2133636
    Min: 1030101
    Max: 210630242
    Null Count: 0
    Distinct Count: None
  Column 1: UnitID
    Encoded Bytes: 558947
    Uncompressed Bytes: 560278
    Min: 50
    Max: VAN9
    Null Count: 0
    Distinct Count: None
  Column 2: IncidentNumber
    Encoded Bytes: 2133804
    Uncompressed Bytes: 2133636
    Min: 30612
    Max: 21027766
    Null Count: 0
    Distinct Count: None
  Column 3: CallType
    Encoded Bytes: 247854
    Uncompressed Bytes: 304484
    Min: Administrative
    Max: Watercraft in Distress
    Null Count: 0
    Distinct Count: None
  Column 4: CallDate
    Encoded Bytes: 731920
    Uncompressed Bytes: 804797
    Min: 01/01/2001
    Max: 12/31/2020
    Null Count: 0
    Distinct Count: None
  Column 5: WatchDate
    Encoded Bytes: 732926
    Uncompressed Bytes: 804992
    Min: 01/01/2001
  

In [7]:
import numpy as np
import pandas as pd

In [8]:
# Input: numpy .npz archive holding the transactions data.
npz_path = "/mnt/hgfs/ubuntu_share/data_set/kaggle/france_immobilier/transactions.npz"

# Output: POSIX-style string path of the parquet file generated from the archive.
immo_out_path = data_path.joinpath("fr_immo_transactions.parquet").as_posix()



In [None]:
# Convert every array stored in the .npz archive into a DataFrame column and
# save the result as parquet.
with np.load(npz_path) as npz_data:
    # One pandas Series per archive entry, keyed by the entry name
    df_dict = dict((key, pd.Series(value)) for key, value in npz_data.items())
    df = pd.DataFrame(df_dict)
    # Persist the frame with the pyarrow engine
    df.to_parquet(immo_out_path, engine="pyarrow")

In [25]:

# Names of the .npz entries to export as parquet columns.
key_list1 = [
    "id_transaction",
    "date_transaction",
    "prix",
    "departement",
    "id_ville",
    "ville",
    "code_postal",
    "adresse",
    "type_batiment",
    "n_pieces",
    "surface_habitable",
    "latitude",
    "longitude",
]

# Build a DataFrame from the selected .npz entries and write it as parquet.
with np.load(npz_path) as npz_data:
    data = {}
    # Walk the entry names so that only the selected arrays are actually
    # loaded from the archive.
    for key in npz_data.files:
        # Filter on key_list1 (defined above); the previous filter used a name
        # that is not defined in this notebook.
        if key not in key_list1:
            continue
        print(f"treating {key}")
        value = npz_data[key]
        if value.dtype == np.uint8:
            # uint8 arrays are treated as a byte buffer of NUL-separated
            # UTF-8 strings and decoded into a list of str.
            out_val = [s.decode("utf-8") for s in value.tobytes().split(b"\x00")]
        else:
            out_val = value
        data[key] = out_val
    print(f"data shape: {data.keys()}")
    # Build the frame once; `df` is kept as an alias of the same object so both
    # names stay available without duplicating the data in memory.
    resu_pdf = pd.DataFrame.from_dict(data)
    df = resu_pdf
    resu_pdf.to_parquet(immo_out_path, engine="pyarrow")


treating id_transaction
treating date_transaction
treating prix
treating departement
treating id_ville
treating ville
treating code_postal
treating adresse
treating type_batiment
treating n_pieces
treating surface_habitable
treating latitude
treating longitude
data shape: dict_keys(['id_transaction', 'date_transaction', 'prix', 'departement', 'id_ville', 'ville', 'code_postal', 'adresse', 'type_batiment', 'n_pieces', 'surface_habitable', 'latitude', 'longitude'])


In [26]:
# Read the generated parquet file back and preview its first rows.
pdf = pd.read_parquet(immo_out_path, engine="pyarrow")
print(pdf.head())

   id_transaction date_transaction      prix departement  id_ville  \
0          141653       2014-01-02  197000.0          01       427   
1          141970       2014-01-02  157500.0          01       451   
2          139240       2014-01-02  112000.0          01       365   
3          146016       2014-01-02  173020.0          01       202   
4          145911       2014-01-03   88000.0          01       283   

                  ville  code_postal                      adresse  \
0               TREVOUX         1600           6346 MTE DES LILAS   
1                VIRIAT         1440       1369 RTE DE STRASBOURG   
2  SAINT-JEAN-SUR-VEYLE         1290   5174  SAINT JEAN SUR VEYLE   
3               LAGNIEU         1150  21 GR GRANDE RUE DE BULLIEZ   
4               OYONNAX         1100          29B RUE DE LA FORGE   

  type_batiment  n_pieces  surface_habitable   latitude  longitude  
0   Appartement         4                 84  45.942301   4.770694  
1        Maison         4 

In [27]:
# Dimensions (rows, columns) of the reloaded DataFrame.
print(pdf.shape)

(9141573, 13)


In [28]:
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9141573 entries, 0 to 9141572
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   id_transaction     int32         
 1   date_transaction   datetime64[ns]
 2   prix               float64       
 3   departement        object        
 4   id_ville           int32         
 5   ville              object        
 6   code_postal        int32         
 7   adresse            object        
 8   type_batiment      object        
 9   n_pieces           int32         
 10  surface_habitable  int32         
 11  latitude           float64       
 12  longitude          float64       
dtypes: datetime64[ns](1), float64(3), int32(5), object(4)
memory usage: 732.3+ MB
None
