# Imports

In [40]:
import pandas as pd
import os
import timeit
import sys

In [41]:
# Record the interpreter and pandas versions so the results below can be
# tied to a specific environment.
print(
    f'Python version: {sys.version}',
    f'Pandas version: {pd.__version__}',
    sep='\n',
)


Python version: 3.10.12 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 19:01:18) [MSC v.1916 64 bit (AMD64)]
Pandas version: 1.4.2


# Global Variables

In [42]:
# Source CSV location. Defaults to the original local path, but can be
# redirected by setting the LONDON_CRIME_CSV environment variable so the
# notebook can run on machines where that absolute path does not exist.
input_csv_file = os.environ.get(
    "LONDON_CRIME_CSV", "D:/data/london_crime_by_lsoa.csv"
)

# Destinations for the converted copies (relative paths, resolved against the
# kernel's working directory).
output_feather_file = "data/london_crime_by_lsoa.feather"
output_parquet_file = "data/london_crime_by_lsoa.parquet"

# Read the CSV file

In [43]:
# Bracket a single pd.read_csv call with timer readings; the elapsed read
# time is stop_csv - start_csv (in seconds). Nothing else should be placed
# between the two timer calls or it will be included in the measurement.
start_csv = timeit.default_timer()
df_csv = pd.read_csv(input_csv_file)
stop_csv = timeit.default_timer()


In [44]:
# On-disk size of the source CSV in bytes; baseline for the size comparisons.
size_csv_file = os.stat(input_csv_file).st_size

# Conversion to Feather

In [45]:
# Make sure the destination directory exists before writing: to_feather does
# not create missing parent directories, so a fresh checkout without the
# output folder would otherwise fail on this cell.
feather_dir = os.path.dirname(output_feather_file)
if feather_dir:
    os.makedirs(feather_dir, exist_ok=True)

# Write the DataFrame loaded from the CSV as a zstd-compressed Feather file.
df_csv.to_feather(output_feather_file, compression='zstd')

In [46]:
# Bracket a single pd.read_feather call with timer readings; the elapsed
# read time is stop_feather - start_feather (in seconds). Keep the read as
# the only statement between the two timer calls.
start_feather = timeit.default_timer()
df_feather = pd.read_feather(output_feather_file)
stop_feather = timeit.default_timer()


## Analysis

In [47]:
# On-disk size of the Feather file in bytes.
size_feather_file = os.stat(output_feather_file).st_size

In [48]:
# Size reduction relative to the CSV (percent) and the measured read time.
feather_reduction_pct = 100*(1-size_feather_file/size_csv_file)
feather_read_seconds = stop_feather-start_feather

print(f'[INFO] A remarkable {feather_reduction_pct:.1f}% reduction in file size.')
print(f'[INFO] The time taken to read the Feather file was {feather_read_seconds:.2f} seconds.')


[INFO] A remarkable 69.4% reduction in file size.
[INFO] The time taken to read the Feather file was 2.69 seconds.


# Conversion to Parquet

In [49]:
# Make sure the destination directory exists before writing: to_parquet does
# not create missing parent directories, so a fresh checkout without the
# output folder would otherwise fail on this cell.
parquet_dir = os.path.dirname(output_parquet_file)
if parquet_dir:
    os.makedirs(parquet_dir, exist_ok=True)

# Write the DataFrame loaded from the CSV as a gzip-compressed Parquet file.
df_csv.to_parquet(output_parquet_file, compression="gzip")


In [50]:
# Bracket a single pd.read_parquet call with timer readings; the elapsed
# read time is stop_parquet - start_parquet (in seconds). Keep the read as
# the only statement between the two timer calls.
start_parquet = timeit.default_timer()
df_parquet = pd.read_parquet(output_parquet_file)
stop_parquet = timeit.default_timer()


## Analysis

In [51]:
# On-disk size of the Parquet file in bytes.
size_parquet_file = os.stat(output_parquet_file).st_size


In [52]:
# Size reduction relative to the CSV (percent) and the measured read time.
parquet_reduction_pct = 100*(1-size_parquet_file/size_csv_file)
parquet_read_seconds = stop_parquet-start_parquet

print(f'[INFO] A remarkable {parquet_reduction_pct:.1f}% reduction in file size.')
print(f'[INFO] The time taken to read the Parquet file was {parquet_read_seconds:.2f} seconds.')


[INFO] A remarkable 93.6% reduction in file size.
[INFO] The time taken to read the Parquet file was 3.09 seconds.


# Conclusions