# Read XLSX and write to feather file format for faster subsequent read

__Motivation:__ Reading large Excel files into a pandas DataFrame takes a long time.
To speed up loading in subsequent data processing and analysis sessions,
save the DataFrame once in a binary file format such as Feather or
Parquet, and read that file instead of the Excel sources.

## Read Excel files into pandas.DataFrame 

In [None]:
import os
import re
import time

import pandas as pd


def read_combine(io):
    """
    Load several Excel workbooks and stack them into one pandas DataFrame.

    Every workbook is read with ``pd.read_excel`` and tagged with a
    ``filename`` column holding the base name of the file it came from.

    Args:
        io (list): paths of the Excel files to read

    Returns:
        DataFrame: the per-file frames concatenated in the order given.
        Each file keeps its own row index, so index labels can repeat.
    """
    frames = [
        pd.read_excel(path).assign(filename=os.path.basename(path))
        for path in io
    ]
    return pd.concat(frames)


def get_abspath(path, pattern):
    """
    Search a directory non-recursively for entries whose name matches a pattern.

    Parameters
    ----------
    path : str
        Directory to list. A relative path is resolved against the current
        working directory.
    pattern : str
        Regular expression applied to each entry name with ``re.search``.

    Returns
    -------
    list of str
        Absolute paths of the matching entries, sorted by entry name so the
        result does not depend on the arbitrary order of ``os.listdir``.
        Use ``os.path.basename`` to get the file name back.
    """
    # Resolve once so the returned paths are absolute, as the name promises.
    root = os.path.abspath(path)
    matcher = re.compile(pattern)
    return [
        os.path.join(root, name)
        for name in sorted(os.listdir(root))
        if matcher.search(name)
    ]


def print_filesize(abspath):
    """
    Print a table of file sizes, one row per path.

    Args:
        abspath (list): paths of the files to report on

    The table is indexed by base file name and has a single ``size``
    column taken from ``os.stat(...).st_size``. Nothing is returned.
    """
    names = [os.path.basename(p) for p in abspath]
    sizes = [os.stat(p).st_size for p in abspath]
    table = pd.DataFrame({'filename': names, 'size': sizes})
    table = table.set_index('filename')
    print(table)

# Data folder under the current working directory; the cells below read the
# Excel inputs from it and write the feather files into it.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')

`successful_events`

In [None]:
%%timeit -r 1 -n 1 
successful_events_fn = get_abspath(data_dir, '^os_successful_events_[A-Z]\.xlsx')
print_filesize(successful_events_fn)

successful_events = read_combine(successful_events_fn)

In [None]:
successful_events.info(show_counts=True)

In [None]:
successful_events.drop(["Unnamed: 0"], axis=1, inplace=True)

In [None]:
successful_events.event_timestamp = pd.to_datetime(successful_events.event_timestamp)
successful_events.listing_time = pd.to_datetime(successful_events.listing_time)
successful_events.quantity = pd.to_numeric(successful_events.quantity, errors='coerce')

`coolcatsnft`

In [None]:
%%timeit -r 1 -n 1

cool_cats_nft_dir = os.path.join(data_dir, 'cool-cats-nft')
coolcatsnft_fn = get_abspath(cool_cats_nft_dir, '^coolcatsnft_A\d\.xlsx')
print_filesize(coolcatsnft_fn)

coolcatsnft = read_combine(coolcatsnft_fn)

In [None]:
coolcatsnft.info(show_counts=True)

In [None]:
coolcatsnft = coolcatsnft[coolcatsnft.msg == "success"]
coolcatsnft.drop(["Unnamed: 0", "Unnamed: 1", "msg", "FILTER"], axis=1, inplace=True)

In [None]:
coolcatsnft.event_timestamp = pd.to_datetime(coolcatsnft.event_timestamp)
coolcatsnft.listing_time = pd.to_datetime(coolcatsnft.listing_time)
coolcatsnft.quantity = pd.to_numeric(coolcatsnft.quantity, errors='coerce')

`wallets_successful_event`

In [None]:
%%timeit -r 1 -n 1

wallets_successful_event_dir = os.path.join(data_dir, 'wallets successful event')
wallets_successful_event_fn = get_abspath(wallets_successful_event_dir, '\.xlsx$')
print_filesize(wallets_successful_event_fn)
wallets_successful_event = read_combine(wallets_successful_event_fn)

## Write `DataFrame` to feather format and read it

`successful_events`

In [None]:
# Time the feather write. perf_counter() is a monotonic clock meant for
# measuring intervals, unlike time.time(), which can jump if the system clock
# is adjusted.
start_time = time.perf_counter()
# reset_index() gives the frame a fresh default index before writing; the
# previous per-file row index is kept as an "index" column.
df = successful_events.reset_index()
df.to_feather(os.path.join(data_dir, 'os_successful_events.feather'))
total_time = time.perf_counter() - start_time
print("total minutes to write feather file:", total_time / 60)

In [None]:
start_time = time.time()
df = pd.read_feather(os.path.join(data_dir, 'os_successful_events.feather'))
total_time = time.time() - start_time
print("total minutes to load feather file:", total_time / 60)

In [None]:
df.info()

`coolcatsnft`

In [None]:
# Time the feather write. perf_counter() is a monotonic clock meant for
# measuring intervals, unlike time.time(), which can jump if the system clock
# is adjusted.
start_time = time.perf_counter()
# reset_index() gives the frame a fresh default index before writing; the
# previous row index is kept as an "index" column.
df = coolcatsnft.reset_index()
df.to_feather(os.path.join(data_dir, 'cool-cats-nft.feather'))
total_time = time.perf_counter() - start_time
print("total minutes to write feather file:", total_time / 60)

In [None]:
start_time = time.time()
df = pd.read_feather(os.path.join(data_dir, 'cool-cats-nft.feather'))
total_time = time.time() - start_time
print("total minutes to load feather file:", total_time / 60)

In [None]:
df.info()

## Write `DataFrame` to parquet format and read it (WIP)