# Read XLSX and write to feather file format for faster subsequent read

__Motivation:__ Reading large Excel files into a pandas DataFrame takes a long time.
To speed up loading for subsequent data processing and analysis,
save the DataFrame once in a binary file format such as feather or
parquet, and read that file in future sessions.

## Read Excel files into pandas.DataFrame 

In [1]:
import pandas as pd


def read_combine(io):
    """
    Load several Excel files and stack them into one pandas DataFrame.

    Args:
        io (list): list of fully qualified filenames

    Returns:
        DataFrame: the rows of every file, concatenated in input order.
        Each file keeps its own row labels, so the resulting index is
        not unique when more than one file is given.
    """
    # Read every workbook first, then concatenate them in a single call.
    frames = [pd.read_excel(path) for path in io]
    return pd.concat(frames)

Search the data directories (non-recursively) for filenames that match each pattern

In [5]:
import os
import re

# Directories that hold the Excel exports, all located under ./data.
data_dir = os.path.join(os.getcwd(), 'data')
cool_cats_nft_dir = os.path.join(data_dir, 'cool-cats-nft')
wallets_successful_event_dir = os.path.join(data_dir, 'wallets successful event')

# The patterns are raw strings so that `\.` and `\d` reach the regex engine
# instead of being parsed as (invalid) string escape sequences.
# os.listdir() returns names in arbitrary order, so every list is sorted to
# keep the order in which the files are later combined reproducible.
successful_events_fn = sorted(
    x for x in os.listdir(data_dir) if re.search(r'^os_successful_events_[A-Z]\.xlsx', x)
)
print(successful_events_fn)
coolcatsnft_fn = sorted(
    x for x in os.listdir(cool_cats_nft_dir) if re.search(r'^coolcatsnft_A\d\.xlsx', x)
)
print(coolcatsnft_fn)
wallets_successful_event_fn = sorted(
    x for x in os.listdir(wallets_successful_event_dir) if re.search(r'\.xlsx$', x)
)
print(wallets_successful_event_fn)

['os_successful_events_A.xlsx', 'os_successful_events_B.xlsx']
['coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx', 'coolcatsnft_A3.xlsx']
['alien-frens-evolution_success1.xlsx', 'boredapekennelclub_success1.xlsx', 'boredapeyachtclub_success1.xlsx', 'clonex_success1.xlsx', 'coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx', 'coolcatsnft_A3.xlsx', 'coolcatsnft_AA1.xlsx', 'coolcatsnft_AA11.xlsx', 'coolcatsnft_AA2.xlsx', 'coolcatsnft_AA22.xlsx', 'coolcatsnft_補跑B.xlsx', 'coolcatsnft_補跑E.xlsx', 'cryptoadz-by-gremplin_success1.xlsx', 'cryptoskulls_success1.xlsx', 'cyberkongz-vx_success1.xlsx', 'doodlesofficial_success1.xlsx', 'hapeprime_success1.xlsx', 'kaiju-kingz_success1.xlsx', 'karafuru_success1.xlsx', 'lazy-lions_success1.xlsx', 'mekaverse_success1.xlsx', 'mfers_success1.xlsx', 'mutant-ape-yacht-club_success1.xlsx', 'phantabear_success1.xlsx', 'proof-moonbirds_success1.xlsx', 'rektguy_rarelandnft_pieceofshit_ivedoneit_success1.xlsx', 'world-of-women-nft_success1.xlsx']


`successful_events`

In [3]:
import time

# Absolute path of every matched `os_successful_events_*` workbook.
files = [os.path.join(data_dir, filename) for filename in successful_events_fn]

# Time the whole read-and-concatenate step (wall-clock seconds).
start_time = time.time()
successful_events = read_combine(files)
total_time = time.time() - start_time

print("total minutes to load:", total_time / 60)

total minutes to load: 13.612128965059917


In [4]:
# Summarize the combined frame: column dtypes plus per-column non-null counts.
successful_events.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1270525 entries, 0 to 451391
Data columns (total 26 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              1270525 non-null  int64  
 1   event_timestamp         1270525 non-null  object 
 2   event_type              1270525 non-null  object 
 3   token_id                1265379 non-null  float64
 4   num_sales               1265379 non-null  float64
 5   listing_time            1216939 non-null  object 
 6   token_owner_address     1265379 non-null  object 
 7   token_seller_address    1270525 non-null  object 
 8   deal_price              1270525 non-null  float64
 9   payment_token_symbol    1270525 non-null  object 
 10  payment_token_decimals  1270525 non-null  int64  
 11  payment_token_usdprice  1270436 non-null  float64
 12  quantity                1269968 non-null  float64
 13  starting_price          0 non-null        float64
 14  end

In [5]:
# Remove the "Unnamed: 0" column in place.
# NOTE(review): presumably a row index saved by the original export - confirm.
successful_events.drop(columns=["Unnamed: 0"], inplace=True)

In [6]:
# Parse the two timestamp columns into datetimes.
successful_events["event_timestamp"] = pd.to_datetime(successful_events["event_timestamp"])
successful_events["listing_time"] = pd.to_datetime(successful_events["listing_time"])

# Force `quantity` to numeric; values that cannot be parsed become NaN.
successful_events["quantity"] = pd.to_numeric(successful_events["quantity"], errors='coerce')

`coolcatsnft`

In [7]:
import time

# Absolute path of every matched `coolcatsnft_A*` workbook.
files = [os.path.join(cool_cats_nft_dir, filename) for filename in coolcatsnft_fn]

# Time the whole read-and-concatenate step (wall-clock seconds).
start_time = time.time()
coolcatsnft = read_combine(files)
total_time = time.time() - start_time

print("total minutes to load:", total_time / 60)

total minutes to load: 27.89791278044383


In [8]:
# Summarize the combined frame: column dtypes plus per-column non-null counts.
coolcatsnft.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2399422 entries, 0 to 399421
Data columns (total 32 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              2399422 non-null  int64  
 1   Unnamed: 1              2399422 non-null  int64  
 2   event_timestamp         2398450 non-null  object 
 3   event_type              2398450 non-null  object 
 4   token_id                2382796 non-null  object 
 5   num_sales               2382796 non-null  float64
 6   listing_time            2268796 non-null  object 
 7   token_owner_address     2382796 non-null  object 
 8   token_seller_address    2395806 non-null  object 
 9   deal_price              2398450 non-null  float64
 10  payment_token_symbol    2398411 non-null  object 
 11  payment_token_decimals  2398445 non-null  float64
 12  payment_token_usdprice  2397937 non-null  float64
 13  quantity                2394188 non-null  object 
 14  sta

In [9]:
# Keep only the rows whose `msg` column equals "success", then discard the
# two unnamed columns together with the `msg` and `FILTER` columns.
coolcatsnft = coolcatsnft[coolcatsnft["msg"] == "success"].drop(
    columns=["Unnamed: 0", "Unnamed: 1", "msg", "FILTER"]
)

In [10]:
# Parse the two timestamp columns into datetimes.
coolcatsnft["event_timestamp"] = pd.to_datetime(coolcatsnft["event_timestamp"])
coolcatsnft["listing_time"] = pd.to_datetime(coolcatsnft["listing_time"])

# Force `quantity` to numeric; values that cannot be parsed become NaN.
coolcatsnft["quantity"] = pd.to_numeric(coolcatsnft["quantity"], errors='coerce')

## Write `DataFrame` to feather format and read it

`successful_events`

In [11]:
# Time the index reset plus the feather write (wall-clock seconds).
start_time = time.time()

# reset_index() replaces the repeated per-file row labels with a fresh
# 0..n-1 index and keeps the old labels as an `index` column.
# NOTE(review): that `index` column ends up in the feather file; consider
# reset_index(drop=True) if it is not needed downstream.
df = successful_events.reset_index()
df.to_feather(
    os.path.join(data_dir, 'os_successful_events.feather')
)

total_time = time.time() - start_time
print("total minutes to write feather file:", total_time / 60)

total minutes to write feather file: 0.05428362687428792


In [12]:
# Time how long reading the feather file back takes (wall-clock seconds).
start_time = time.time()
df = pd.read_feather(
    os.path.join(data_dir, 'os_successful_events.feather')
)
total_time = time.time() - start_time

print("total minutes to load feather file:", total_time / 60)

total minutes to load feather file: 0.08501499096552531


In [13]:
# Summarize `df` (column names and dtypes) to inspect what was read back.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1270525 entries, 0 to 1270524
Data columns (total 26 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   index                   1270525 non-null  int64         
 1   event_timestamp         1270525 non-null  datetime64[ns]
 2   event_type              1270525 non-null  object        
 3   token_id                1265379 non-null  float64       
 4   num_sales               1265379 non-null  float64       
 5   listing_time            1216939 non-null  datetime64[ns]
 6   token_owner_address     1265379 non-null  object        
 7   token_seller_address    1270525 non-null  object        
 8   deal_price              1270525 non-null  float64       
 9   payment_token_symbol    1270525 non-null  object        
 10  payment_token_decimals  1270525 non-null  int64         
 11  payment_token_usdprice  1270436 non-null  float64       
 12  quantity      

`coolcatsnft`

In [14]:
# Time the index reset plus the feather write (wall-clock seconds).
start_time = time.time()

# reset_index() replaces the existing row labels with a fresh 0..n-1 index
# and keeps the old labels as an `index` column.
# NOTE(review): that `index` column ends up in the feather file; consider
# reset_index(drop=True) if it is not needed downstream.
df = coolcatsnft.reset_index()
df.to_feather(
    os.path.join(data_dir, 'cool-cats-nft.feather')
)

total_time = time.time() - start_time
print("total minutes to write feather file:", total_time / 60)

total minutes to write feather file: 0.14564001560211182


In [15]:
# Time how long reading the feather file back takes (wall-clock seconds).
start_time = time.time()
df = pd.read_feather(
    os.path.join(data_dir, 'cool-cats-nft.feather')
)
total_time = time.time() - start_time

print("total minutes to load feather file:", total_time / 60)

total minutes to load feather file: 0.15044233401616414


In [16]:
# Summarize `df` (column names and dtypes) to inspect what was read back.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2398450 entries, 0 to 2398449
Data columns (total 29 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   index                   int64         
 1   event_timestamp         datetime64[ns]
 2   event_type              object        
 3   token_id                object        
 4   num_sales               float64       
 5   listing_time            datetime64[ns]
 6   token_owner_address     object        
 7   token_seller_address    object        
 8   deal_price              float64       
 9   payment_token_symbol    object        
 10  payment_token_decimals  float64       
 11  payment_token_usdprice  float64       
 12  quantity                float64       
 13  starting_price          float64       
 14  ending_price            float64       
 15  approved_account        float64       
 16  asset_bundle            object        
 17  auction_type            object        
 18  bi

## Write `DataFrame` to parquet format and read it (WIP)