# Split and Read 2.5 GB of XLSX and write to feather file format for faster subsequent read

__Motivation:__ Reading a 2.5 GB dataset, stored as several large Excel files, into a
single pandas DataFrame in one go failed after 90 minutes on my Latitude XPS 15 with
32 GB of RAM.
There was not enough memory to buffer and hold all of the data at once.
The old approach therefore failed before the data could be converted into feather format.

## Read Excel files into pandas.DataFrame 

In [2]:
# Helper functions

import os
import re
import pandas as pd


def read_combine(io):
    """
    Load several Excel files and stack them into one pandas DataFrame.

    Parameters
    ----------
        io (list): paths of the Excel files to read, in the order they
            should appear in the result

    Returns
    -------
    DataFrame holding the rows of every file, each tagged with the base name
    of its source file in a ``filename`` column. The per-file row index is
    kept as-is, so index values repeat across files.
    """
    def _load(path):
        # One workbook -> one frame, labelled with where its rows came from.
        frame = pd.read_excel(path)
        frame["filename"] = os.path.basename(path)
        return frame

    return pd.concat([_load(path) for path in io])


def get_abspath(path, pattern):
    """
    Search a directory non-recursively for entries whose name matches a pattern

    Parameters
    ----------
    path : str
        Directory to list.
    pattern : str
        Regular expression applied to each entry name with ``re.search``.

    Returns
    -------
    a list of ``path`` joined with each matching name, sorted by name; use
    os.path.basename to get the filename. The paths are absolute only when
    ``path`` itself is absolute.
    """
    matcher = re.compile(pattern)
    # os.listdir() returns names in arbitrary order; sort them so callers that
    # slice the result by position get the same files on every run.
    return [os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if matcher.search(name)]


def print_filesize(abspath):
    """
    Print and return a table of file sizes.

    Parameters
    ----------
    abspath : list
        Paths of the files to report on.

    Returns
    -------
    DataFrame indexed by base filename with a single ``size`` column holding
    ``os.stat(...).st_size`` for each path.
    """
    names = [os.path.basename(p) for p in abspath]
    sizes = [os.stat(p).st_size for p in abspath]
    table = pd.DataFrame({'filename': names, 'size': sizes})
    table = table.set_index('filename')
    print(table)
    return table

# Directory holding the input workbooks, resolved against the current working directory.
data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event')

In [32]:
# Raw string: '\.' inside a normal string literal is an invalid escape sequence.
wallets_fn = get_abspath(data_dir, r'\.xlsx$')
print_filesize(wallets_fn)

                                                         size
filename                                                     
alien-frens-evolution_success1.xlsx                  64074307
boredapekennelclub_success1.xlsx                    139048703
boredapeyachtclub_success1.xlsx                      61715648
clonex_success1.xlsx                                 44406751
coolcatsnft_A1.xlsx                                 241609963
coolcatsnft_A2.xlsx                                 240563942
coolcatsnft_A3.xlsx                                 101283300
coolcatsnft_AA1.xlsx                                124645577
coolcatsnft_AA11.xlsx                               125791182
coolcatsnft_AA2.xlsx                                123305452
coolcatsnft_AA22.xlsx                               126019118
coolcatsnft_補跑B.xlsx                                   868343
coolcatsnft_補跑E.xlsx                                  2109287
cryptoadz-by-gremplin_success1.xlsx                  89797994
cryptosk

Unnamed: 0_level_0,size
filename,Unnamed: 1_level_1
alien-frens-evolution_success1.xlsx,64074307
boredapekennelclub_success1.xlsx,139048703
boredapeyachtclub_success1.xlsx,61715648
clonex_success1.xlsx,44406751
coolcatsnft_A1.xlsx,241609963
coolcatsnft_A2.xlsx,240563942
coolcatsnft_A3.xlsx,101283300
coolcatsnft_AA1.xlsx,124645577
coolcatsnft_AA11.xlsx,125791182
coolcatsnft_AA2.xlsx,123305452


Arbitrarily split the list of files into smaller groups so that each group can be read and converted separately.

In [34]:
# Hand-picked slice boundaries; together the six groups cover wallets_fn[0:28].
split_list = [
    wallets_fn[:4],
    wallets_fn[4:7],
    wallets_fn[7:13],
    wallets_fn[13:17],
    wallets_fn[17:24],
    wallets_fn[24:28],
]

### Split 0

In [18]:
# Read and concatenate the first group of workbooks.
df0 = read_combine(split_list[0])

In [26]:
# Summarise the columns, non-null counts and dtypes of the combined frame.
df0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1341738 entries, 0 to 187317
Data columns (total 33 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              1341738 non-null  int64  
 1   event_timestamp         1340709 non-null  object 
 2   event_type              1340709 non-null  object 
 3   token_id                1328134 non-null  object 
 4   num_sales               1328134 non-null  float64
 5   listing_time            1190041 non-null  object 
 6   token_owner_address     1328134 non-null  object 
 7   token_seller_address    1334049 non-null  object 
 8   from_account_address    780 non-null      object 
 9   deal_price              1340701 non-null  float64
 10  payment_token_symbol    1339859 non-null  object 
 11  payment_token_decimals  1339873 non-null  float64
 12  payment_token_usdprice  1339571 non-null  float64
 13  quantity                1327657 non-null  object 
 14  sta

In [25]:
# helper function
def change_dtype(df):
    """
    Convert the timestamp and quantity columns of ``df`` in place.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``event_timestamp``, ``listing_time`` and
        ``quantity``.

    Returns
    -------
    The same DataFrame object, so the call can be chained.
    """
    # Item assignment rather than attribute access: attribute-style column
    # access can collide with DataFrame attributes and methods.
    df["event_timestamp"] = pd.to_datetime(df["event_timestamp"])
    df["listing_time"] = pd.to_datetime(df["listing_time"])
    # Values that cannot be parsed as numbers become NaN instead of raising.
    df["quantity"] = pd.to_numeric(df["quantity"], errors='coerce')
    return df

In [35]:
# Keep only the rows whose msg is "success", then drop the listed columns.
# Chaining .drop() instead of calling it with inplace=True on the filtered
# frame produces a fresh DataFrame, so nothing is mutated on a slice (the
# pattern pandas flags with SettingWithCopyWarning).
df0 = (
    df0[df0["msg"] == "success"]
    .drop(columns=["Unnamed: 0", "pages", "msg", "next_param"])
)
change_dtype(df0)

In [41]:
%%timeit -r 1 -n 1
df0.reset_index(inplace=True)
df0.to_feather(os.path.join(data_dir, 'wallets_successful_events_0.feather'))

4.59 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [43]:
%%timeit -r 5 -n 1
df0 = pd.read_feather(os.path.join(data_dir, 'wallets_successful_events_0.feather'))

7.77 s ± 3.64 s per loop (mean ± std. dev. of 5 runs, 1 loop each)


### Split 1

In [46]:
# Read and concatenate the second group of workbooks.
df1 = read_combine(split_list[1])

In [54]:
# Summarise the combined frame, explicitly requesting the non-null counts.
df1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2399422 entries, 0 to 399421
Data columns (total 32 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              2399422 non-null  int64  
 1   event_timestamp         2398450 non-null  object 
 2   event_type              2398450 non-null  object 
 3   token_id                2382796 non-null  object 
 4   num_sales               2382796 non-null  float64
 5   listing_time            2268796 non-null  object 
 6   token_owner_address     2382796 non-null  object 
 7   token_seller_address    2395806 non-null  object 
 8   deal_price              2398450 non-null  float64
 9   payment_token_symbol    2398411 non-null  object 
 10  payment_token_decimals  2398445 non-null  float64
 11  payment_token_usdprice  2397937 non-null  float64
 12  quantity                2394188 non-null  object 
 13  starting_price          0 non-null        float64
 14  end

In [57]:
# Keep only the rows whose msg is "success", then drop the listed columns.
# Chaining .drop() instead of calling it with inplace=True on the filtered
# frame produces a fresh DataFrame, so nothing is mutated on a slice (the
# pattern pandas flags with SettingWithCopyWarning).
df1 = (
    df1[df1["msg"] == "success"]
    .drop(columns=["Unnamed: 0", "pages", "msg", "FILTER"])
)
change_dtype(df1)

In [58]:
%%timeit -r 1 -n 1
df1.reset_index(inplace=True)
df1.to_feather(os.path.join(data_dir, 'wallets_successful_events_1.feather'))

6.78 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [66]:
# Read the second split back from its feather file.
df = pd.read_feather(os.path.join(data_dir, 'wallets_successful_events_1.feather'))

In [65]:
# List the distinct source workbooks recorded in the filename column.
df.filename.unique()

array(['coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx',
       'coolcatsnft_A3.xlsx'], dtype=object)

In [67]:
# Check the columns and dtypes of the frame read back from feather.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2398450 entries, 0 to 2398449
Data columns (total 29 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   index                   int64         
 1   event_timestamp         datetime64[ns]
 2   event_type              object        
 3   token_id                object        
 4   num_sales               float64       
 5   listing_time            datetime64[ns]
 6   token_owner_address     object        
 7   token_seller_address    object        
 8   deal_price              float64       
 9   payment_token_symbol    object        
 10  payment_token_decimals  float64       
 11  payment_token_usdprice  float64       
 12  quantity                float64       
 13  starting_price          float64       
 14  ending_price            float64       
 15  approved_account        float64       
 16  asset_bundle            object        
 17  auction_type            object        
 18  bi

### Split 2

In [None]:
# Read and concatenate the third group of workbooks.
df2 = read_combine(split_list[2])

In [None]:
# Summarise the columns and dtypes of the combined frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341738 entries, 0 to 1341737
Data columns (total 30 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   index                   1341738 non-null  int64         
 1   event_timestamp         1340709 non-null  datetime64[ns]
 2   event_type              1340709 non-null  object        
 3   token_id                1328134 non-null  object        
 4   num_sales               1328134 non-null  float64       
 5   listing_time            1190041 non-null  datetime64[ns]
 6   token_owner_address     1328134 non-null  object        
 7   token_seller_address    1334049 non-null  object        
 8   from_account_address    780 non-null      object        
 9   deal_price              1340701 non-null  float64       
 10  payment_token_symbol    1339859 non-null  object        
 11  payment_token_decimals  1339873 non-null  float64       
 12  payment_token_

In [None]:
%%timeit -r 5 -n 1
df2 = pd.read_feather(os.path.join(data_dir, 'wallets_successful_events_2.feather'))

In [None]:
# Summarise the columns and dtypes of df2.
df2.info()

... get it done while I sleep: convert each remaining workbook to its own feather file.

In [76]:
# Convert every workbook from position 7 onward to its own feather file.
out_dir = os.path.join(data_dir, '0')
# Create the target directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(out_dir, exist_ok=True)

for fn in wallets_fn[7:]:
    # splitext() swaps the extension without depending on its length.
    stem = os.path.splitext(os.path.basename(fn))[0]
    to_fn = os.path.join(out_dir, stem + '.feather')

    print(fn, os.stat(fn).st_size)
    df = pd.read_excel(fn)
    df["filename"] = os.path.basename(fn)
    # Adds the row number as an `index` column before writing.
    df.reset_index(inplace=True)
    df.to_feather(to_fn)

c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_AA1.xlsx 124645577
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_AA11.xlsx 125791182
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_AA2.xlsx 123305452
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_AA22.xlsx 126019118
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_補跑B.xlsx 868343
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\coolcatsnft_補跑E.xlsx 2109287
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\cryptoadz-by-gremplin_success1.xlsx 89797994
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\cryptoskulls_success1.xlsx 59505067
c:\Users\lawrence\GitHub\OpenSea-API-crawler\data\wallets successful event\cyberkongz-vx_success1.xlsx 29765080
c:\Users\lawrence\GitHub

## Write `DataFrame` to parquet format and read it (WIP)