In [2]:
import pandas as pd
import numpy as np

def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` rows for dtype and I/O benchmarks.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)`` so the same seed always produces the
        same frame. When ``None`` (the default) NumPy's global random state
        is used.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``size`` ('big'/'medium'/'small'), ``age``
        (integers 1..49 -- the upper bound 50 is exclusive), ``team``
        ('red'/'blue'/'yellow'/'green'), ``win`` ('yes'/'no'), ``date`` (a day
        from 2020-01-01 to 2022-12-31) and ``prob`` (uniform floats in [0, 1)).
    """
    # RandomState exposes the same choice/randint/uniform API as the np.random
    # module, so one code path serves both the seeded and unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create Fake Dataset
    df = pd.DataFrame()
    df['size'] = rng.choice(['big', 'medium', 'small'], size)
    df['age'] = rng.randint(1, 50, size)
    df['team'] = rng.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = rng.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = rng.choice(dates, size)
    df['prob'] = rng.uniform(0, 1, size)
    return df

def set_dtypes(df):
    """Return a copy of ``df`` with memory-efficient column dtypes.

    The input frame is not modified, so the original and the converted frame
    can be compared side by side.

    Conversions applied:
      * ``size``, ``team`` -> ``category``
      * ``age``            -> ``int16``
      * ``prob``           -> ``float32``
      * ``win``            -> booleans via ``{'yes': True, 'no': False}``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``size``, ``team``, ``age``, ``prob``
        and ``win``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    # astype returns a new DataFrame, leaving the caller's frame untouched.
    converted = df.astype({
        'size': 'category',
        'team': 'category',
        'age': 'int16',
        'prob': 'float32',
    })
    # Only map while the column still holds 'yes'/'no'; mapping an
    # already-boolean column would turn every value into NaN.
    if not pd.api.types.is_bool_dtype(converted['win']):
        converted['win'] = converted['win'].map({'yes': True, 'no': False})
    return converted

In [3]:
# Generate the 5,000,000-row synthetic frame used for the dtype / memory comparison.
df = get_dataset(5_000_000)

In [6]:
# Convert the columns to compact dtypes via set_dtypes (category / int16 / bool / float32).
df_2 = set_dtypes(df)

In [4]:
# Show column dtypes and memory footprint of df. A trailing '+' on "memory usage"
# means object columns were measured shallowly; df.info(memory_usage='deep')
# reports their full size.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 6 columns):
 #   Column  Dtype         
---  ------  -----         
 0   size    object        
 1   age     int32         
 2   team    object        
 3   win     object        
 4   date    datetime64[ns]
 5   prob    float64       
dtypes: datetime64[ns](1), float64(1), int32(1), object(3)
memory usage: 209.8+ MB


In [7]:
# Show column dtypes and memory footprint of the dtype-converted frame.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 6 columns):
 #   Column  Dtype         
---  ------  -----         
 0   size    category      
 1   age     int16         
 2   team    category      
 3   win     bool          
 4   date    datetime64[ns]
 5   prob    float32       
dtypes: bool(1), category(2), datetime64[ns](1), float32(1), int16(1)
memory usage: 81.1 MB


In [8]:
print('Reading and writing CSV')
df = get_dataset(1_000_000)
df = set_dtypes(df)
%time df.to_csv('test.csv')
%time df_csv = pd.read_csv('test.csv')

Reading and writing CSV
CPU times: total: 6.67 s
Wall time: 6.9 s
CPU times: total: 1.45 s
Wall time: 1.55 s


In [9]:
print('Reading and writing Pickle')
df = get_dataset(1_000_000)
df = set_dtypes(df)
%time df.to_pickle('test.pickle')
%time df_pickle = pd.read_pickle('test.pickle')

Reading and writing Pickle
CPU times: total: 15.6 ms
Wall time: 14 ms
CPU times: total: 15.6 ms
Wall time: 25 ms


In [10]:
print('Reading and writing Parquet')
df = get_dataset(1_000_000)
df = set_dtypes(df)
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')

Reading and writing Parquet
CPU times: total: 344 ms
Wall time: 334 ms
CPU times: total: 188 ms
Wall time: 88.4 ms


In [5]:
print('Reading and writing Feather')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_feather('test.feather')
%time df_feather = pd.read_feather('test.feather')

Reading and writing Feather
CPU times: total: 391 ms
Wall time: 204 ms
CPU times: total: 266 ms
Wall time: 111 ms
