# Read XLSX and write to feather file format for faster subsequent read

__Motivation:__ Reading Excel files into a pandas DataFrame takes a long time.
To speed up loading for subsequent data processing and analysis,
save the DataFrame in Feather format once and
read the Feather file in future sessions instead of the Excel files.

## Read Excel files into pandas.DataFrame 

In [6]:
import pandas as pd


def read_combine(io, ignore_index=False, **read_excel_kwargs):
    """
    Read one or more Excel files and combine them into a single pandas DataFrame.

    Args:
        io (list): list of fully qualified filenames. A single path given as
            a ``str`` or ``os.PathLike`` is treated as a one-element list.
        ignore_index (bool): forwarded to ``pd.concat``. The default ``False``
            keeps each file's own row labels, so labels repeat across files;
            pass ``True`` to get a fresh 0..n-1 index.
        **read_excel_kwargs: extra keyword arguments forwarded unchanged to
            every ``pd.read_excel`` call.

    Returns:
        DataFrame: the rows of all files stacked in the order given.

    Raises:
        ValueError: if ``io`` contains no filenames.
    """
    import os

    # A bare string is iterable character by character; wrap it so a single
    # path is read as one file rather than as many one-letter filenames.
    if isinstance(io, (str, os.PathLike)):
        io = [io]

    paths = list(io)
    if not paths:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError("read_combine() received no files to read")

    frames = (pd.read_excel(path, **read_excel_kwargs) for path in paths)
    return pd.concat(frames, ignore_index=ignore_index)

Search directory non-recursively for filename patterns

In [11]:
import os
import re

# Data lives in ./data relative to the directory the notebook is run from.
data_dir = os.path.join(os.getcwd(), 'data')
cool_cats_nft_dir = os.path.join(data_dir, 'cool-cats-nft')


def _list_matching(directory, pattern):
    """Return the sorted entry names in `directory` (non-recursive) that fully match `pattern`."""
    # sorted(): os.listdir() returns entries in arbitrary order, and the order
    # decides how the files are later concatenated.
    # fullmatch(): anchors both ends, so e.g. 'x.xlsx.bak' is not picked up.
    return sorted(name for name in os.listdir(directory) if re.fullmatch(pattern, name))


# Raw strings keep `\.` and `\d` as regex escapes instead of (invalid) string escapes.
successful_events = _list_matching(data_dir, r'os_successful_events_[A-Z]\.xlsx')
print(successful_events)
coolcatsnft = _list_matching(cool_cats_nft_dir, r'coolcatsnft_A\d\.xlsx')
print(coolcatsnft)

['os_successful_events_A.xlsx', 'os_successful_events_B.xlsx']
['coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx', 'coolcatsnft_A3.xlsx']


`successful_events`

In [7]:
import time

# Build full paths for the workbook names found by the directory search.
files = [os.path.join(data_dir, x) for x in successful_events]
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
# NOTE(review): this rebinds `successful_events` from the list of filenames to
# the combined DataFrame, so re-running this cell requires re-running the
# directory search first.
successful_events = read_combine(files)
total_time = time.perf_counter() - start_time
print("total minutes to load:", total_time / 60)

['c:\\Users\\lawrence\\GitHub\\OpenSea-API-crawler\\data\\os_successful_events_A.xlsx', 'c:\\Users\\lawrence\\GitHub\\OpenSea-API-crawler\\data\\os_successful_events_B.xlsx']
total minutes to load: 17.273423171043397


In [None]:
# Print a per-column summary of the combined frame (dtype per column);
# show_counts=True explicitly requests the non-null count for every column.
successful_events.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2399422 entries, 0 to 399421
Data columns (total 32 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              2399422 non-null  int64  
 1   Unnamed: 1              2399422 non-null  int64  
 2   event_timestamp         2398450 non-null  object 
 3   event_type              2398450 non-null  object 
 4   token_id                2382796 non-null  object 
 5   num_sales               2382796 non-null  float64
 6   listing_time            2268796 non-null  object 
 7   token_owner_address     2382796 non-null  object 
 8   token_seller_address    2395806 non-null  object 
 9   deal_price              2398450 non-null  float64
 10  payment_token_symbol    2398411 non-null  object 
 11  payment_token_decimals  2398445 non-null  float64
 12  payment_token_usdprice  2397937 non-null  float64
 13  quantity                2394188 non-null  object 
 14  sta

`coolcatsnft`

In [None]:
import time

# Build full paths for the workbook names found by the directory search.
files = [os.path.join(cool_cats_nft_dir, x) for x in coolcatsnft]
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
# NOTE(review): this rebinds `coolcatsnft` from the list of filenames to the
# combined DataFrame, so re-running this cell requires re-running the
# directory search first.
coolcatsnft = read_combine(files)
total_time = time.perf_counter() - start_time
print("total minutes to load:", total_time / 60)

total minutes to load: 22.441298762957256


In [4]:
# Print a per-column summary of the combined frame (dtype per column);
# show_counts=True explicitly requests the non-null count for every column.
# NOTE(review): the saved output for this cell matches the output saved for
# `successful_events` (same entry count and columns) — confirm it is not stale.
coolcatsnft.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2399422 entries, 0 to 399421
Data columns (total 32 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              2399422 non-null  int64  
 1   Unnamed: 1              2399422 non-null  int64  
 2   event_timestamp         2398450 non-null  object 
 3   event_type              2398450 non-null  object 
 4   token_id                2382796 non-null  object 
 5   num_sales               2382796 non-null  float64
 6   listing_time            2268796 non-null  object 
 7   token_owner_address     2382796 non-null  object 
 8   token_seller_address    2395806 non-null  object 
 9   deal_price              2398450 non-null  float64
 10  payment_token_symbol    2398411 non-null  object 
 11  payment_token_decimals  2398445 non-null  float64
 12  payment_token_usdprice  2397937 non-null  float64
 13  quantity                2394188 non-null  object 
 14  sta

In [5]:
# Keep only the rows whose `msg` column equals "success", then discard the
# bookkeeping columns. Chaining .loc[...] with .drop(columns=...) yields a
# fresh frame instead of mutating a filtered slice in place, which avoids
# SettingWithCopyWarning here and on later column assignments.
coolcatsnft = (
    coolcatsnft
    .loc[coolcatsnft["msg"] == "success"]
    .drop(columns=["Unnamed: 0", "Unnamed: 1", "msg", "FILTER"])
)

In [6]:
# Convert the text columns to proper dtypes in one step. assign() returns a new
# frame (no in-place mutation of a possibly filtered slice) and uses explicit
# column names instead of attribute-style assignment, which would silently
# create a plain Python attribute if a column name were ever misspelled.
coolcatsnft = coolcatsnft.assign(
    event_timestamp=pd.to_datetime(coolcatsnft["event_timestamp"]),
    listing_time=pd.to_datetime(coolcatsnft["listing_time"]),
    # errors='coerce': values that cannot be parsed as numbers become NaN.
    quantity=pd.to_numeric(coolcatsnft["quantity"], errors='coerce'),
)

## Write `DataFrame` to feather format and read it

In [7]:
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
# reset_index() replaces the repeated per-file row labels with a default
# 0..n-1 index and keeps the old labels as an extra `index` column in the file.
# NOTE(review): use reset_index(drop=True) if that column is not wanted.
df = coolcatsnft.reset_index()
df.to_feather(os.path.join(data_dir, 'cool-cats-nft.feather'))
total_time = time.perf_counter() - start_time
print("total minutes to write feather file:", total_time / 60)

total minutes to write feather file: 0.12054988543192545


In [8]:
# Time the round trip: load the Feather file written above back into a DataFrame.
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
df = pd.read_feather(os.path.join(data_dir, 'cool-cats-nft.feather'))
total_time = time.perf_counter() - start_time
print("total minutes to load feather file:", total_time / 60)

total minutes to load feather file: 0.13761663834253948


In [9]:
# Print a per-column dtype summary of the frame read back from the Feather file.
# NOTE(review): the saved output lists event_timestamp and listing_time as
# `object`, although an earlier cell converts them with pd.to_datetime —
# the output may be stale; re-run the notebook top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2398450 entries, 0 to 2398449
Data columns (total 29 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   index                   int64  
 1   event_timestamp         object 
 2   event_type              object 
 3   token_id                object 
 4   num_sales               float64
 5   listing_time            object 
 6   token_owner_address     object 
 7   token_seller_address    object 
 8   deal_price              float64
 9   payment_token_symbol    object 
 10  payment_token_decimals  float64
 11  payment_token_usdprice  float64
 12  quantity                float64
 13  starting_price          float64
 14  ending_price            float64
 15  approved_account        float64
 16  asset_bundle            object 
 17  auction_type            object 
 18  bid_amount              float64
 19  transaction_hash        object 
 20  block_hash              object 
 21  block_number            float64