In [2]:
import pandas as pd
import numpy as np

import joblib

# import feather
# import pickle

### Create a `Fake Dataset`

### The following two custom functions create the fake data and set its column dtypes

In [3]:
def get_dataset(size, seed=None):
    """Create a fake DataFrame with ``size`` randomly generated rows.

    The frame has six columns, in this order: ``size``, ``age``, ``team``,
    ``win``, ``date`` and ``prob``.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. With the same seed
        the function returns the same data on every call. When ``None``
        (the default) NumPy's global random state is used, exactly as
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The generated data with default (un-optimized) dtypes.
    """
    # The ``np.random`` module and a ``RandomState`` instance expose the same
    # choice/randint/uniform functions, so either can serve as the source.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range('2020-01-01', '2022-12-31')

    # The draws happen in the same order as the columns are listed.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.randint(1, 50, size),  # upper bound is exclusive: 1..49
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })

def set_dtypes(df):
    """Convert the fake dataset's columns to compact dtypes, in place.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and ``win`` is mapped from ``'yes'``/``'no'``
    to ``True``/``False`` (any other value becomes NaN).

    The function is safe to call more than once on the same frame: a ``win``
    column that is already boolean is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Mapping an already-boolean column through the 'yes'/'no' dict would turn
    # every value into NaN, so skip it when the conversion was done before
    # (e.g. when a notebook cell is re-run).
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df

### Various `File Formats` for saving and reading a DataFrame

### For `CSV` file type

In [4]:
print('Reading and writing CSV')


df = get_dataset(5_000_000)            # yo line le get_dataset() function lai call garcha to create fake data i.e 5_000_000 chai data size ho

df = set_dtypes(df)                    # setting the data types of those data

%time df.to_csv('test.csv')            # to_csv() function creates a csv file
%time df_csv = pd.read_csv('test.csv') # read_csv() to read/load a csv file

# NOTE: %time function gives the cpu execution time to execute the code


# # Drawbacks of a .csv file:
# NOTE: It takes highest(huge) amount of time to write and read/load the data from a csv file
# NOTE: csv ma save garyo vani, pachi csv read garda, data ko datatype haru nai lost huncha

Reading and writing CSV
CPU times: total: 22 s
Wall time: 22.6 s
CPU times: total: 4.02 s
Wall time: 4.19 s


In [5]:
# NOTE: read_csv() can load just a subset of the columns straight from a csv file
pd.read_csv(
    'test.csv',
    usecols=['date', 'win'],
)

Unnamed: 0,win,date
0,True,2022-05-18
1,True,2022-05-11
2,True,2020-05-03
3,True,2021-04-21
4,False,2022-09-26
...,...,...
4999995,True,2021-12-28
4999996,False,2022-06-07
4999997,False,2022-04-06
4999998,False,2021-02-14


In [6]:
## Show the on-disk size of test.csv as a long listing with human-readable sizes.
## NOTE(review): `ls` needs a Unix-like shell; on plain Windows this presumably
## only works through Git Bash / WSL or similar -- confirm.
!ls -GFlash test.csv

238M -rw-r--r-- 1 Dhiraj 238M May 31 06:34 test.csv


### For `Pickle` File type

In [7]:
print('Reading and writing Pickle')

df = get_dataset(5_000_000)
df = set_dtypes(df)

%time df.to_pickle('test.pickle')                # to_pickle() function create a pickle file
%time df_pickle = pd.read_pickle('test.pickle')  # read_pickle() function to read/load a pickle file

# # Merits of pickle file:
# NOTE: pickle file ma save garyo vani, pachi pickle file read garda, data ko datatype haru chai lost vako hudaina

Reading and writing Pickle
CPU times: total: 78.1 ms
Wall time: 323 ms
CPU times: total: 46.9 ms
Wall time: 62.5 ms


In [8]:
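# # Demerits of a pickle file:
# # NOTE: read_pickle() cannot load only specific columns from a pickle file.
# # It has no `columns` parameter, so the call below raises a TypeError if uncommented.

# pd.read_pickle('test.pickle', columns=['date', 'win'])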

In [9]:
## Show the on-disk size of test.pickle as a long listing with human-readable sizes.
## NOTE(review): `ls` needs a Unix-like shell; on plain Windows this presumably
## only works through Git Bash / WSL or similar -- confirm.
!ls -GFlash test.pickle

82M -rw-r--r-- 1 Dhiraj 82M May 31 06:35 test.pickle


### For `Joblib` File type

In [10]:
print('Reading and writing Joblib file')

df = get_dataset(5_000_000)
df = set_dtypes(df)

%time joblib.dump(df, 'test.joblib')

%time df_joblib = joblib.load('test.joblib')

Reading and writing Joblib file
CPU times: total: 62.5 ms
Wall time: 80.1 ms
CPU times: total: 46.9 ms
Wall time: 62.3 ms


In [11]:
## Show the on-disk size of test.joblib as a long listing with human-readable sizes.
## NOTE(review): `ls` needs a Unix-like shell; on plain Windows this presumably
## only works through Git Bash / WSL or similar -- confirm.
!ls -GFlash test.joblib

82M -rw-r--r-- 1 Dhiraj 82M May 31 06:35 test.joblib


### For `Parquet` File type

### To read or write a Parquet file, first install a Parquet engine (pandas needs at least one of the following):

#### `pip install pyarrow`
#### `pip install fastparquet`

In [12]:
print('Reading and writing Parquet')

df = get_dataset(5_000_000)
df = set_dtypes(df)

%time df.to_parquet('test.parquet')                # to_parquet() function to create a parquet file

%time df_parquet = pd.read_parquet('test.parquet') # read_parquet() function to read/load a parquet file

Reading and writing Parquet
CPU times: total: 969 ms
Wall time: 907 ms
CPU times: total: 812 ms
Wall time: 320 ms


In [13]:
# Merits of the parquet file type:
# NOTE: reading/loading a .parquet file is fast
# NOTE: read_parquet() can load only the requested columns

pd.read_parquet(
    'test.parquet',
    columns=['date', 'win'],
)

Unnamed: 0,date,win
0,2021-05-06,False
1,2022-03-19,False
2,2022-08-28,True
3,2020-07-17,False
4,2022-12-30,False
...,...,...
4999995,2022-04-04,True
4999996,2021-04-08,True
4999997,2020-10-14,False
4999998,2020-08-05,False


In [14]:
## Show the on-disk size of test.parquet as a long listing with human-readable sizes.
## NOTE(review): `ls` needs a Unix-like shell; on plain Windows this presumably
## only works through Git Bash / WSL or similar -- confirm.
!ls -GFlash test.parquet

33M -rw-r--r-- 1 Dhiraj 33M May 31 06:35 test.parquet


### For `Feather` File type

In [15]:
print('Reading and writing Feather')

df = get_dataset(5_000_000)
df = set_dtypes(df)

%time df.to_feather('test.feather')

%time df_feather = pd.read_feather('test.feather')

Reading and writing Feather
CPU times: total: 375 ms
Wall time: 219 ms
CPU times: total: 359 ms
Wall time: 156 ms


In [16]:
# Merits of the .feather file type:

# NOTE: Reading/loading a .feather file is very fast (i.e. faster than the .parquet file type)

In [17]:
## Show the on-disk size of test.feather as a long listing with human-readable sizes.
## NOTE(review): `ls` needs a Unix-like shell; on plain Windows this presumably
## only works through Git Bash / WSL or similar -- confirm.
!ls -GFlash test.feather

49M -rw-r--r-- 1 Dhiraj 49M May 31 06:35 test.feather
