In [2]:
# Helper functions

import os
import re
import pandas as pd


def read_combine(io):
    """
    Read a list of Excel files and combine them into a single pandas DataFrame.

    Parameters
    ----------
        io (iterable of str): paths of the Excel files to read; each one is
            passed to ``pd.read_excel`` unchanged.

    Returns
    -------
    DataFrame holding the rows of every file, with the base name of the
    source file stored in a "filename" column. Each file keeps its own row
    index, so index labels may repeat across files. When ``io`` yields no
    paths, an empty DataFrame with only the "filename" column is returned.
    """
    list_df = []
    for each in io:
        df = pd.read_excel(each)
        df["filename"] = os.path.basename(each)
        list_df.append(df)

    # pd.concat raises ValueError ("No objects to concatenate") on an empty
    # list, so the no-files case is handled explicitly.
    if not list_df:
        return pd.DataFrame(columns=["filename"])

    return pd.concat(list_df)


def get_abspath(path, pattern):
    """
    Search a directory non-recursively for entries whose name matches a pattern.

    Parameters
    ----------
    path : str
        Directory whose entries are listed with ``os.listdir``.
    pattern : str
        Regular expression tested against each entry name with ``re.search``.

    Returns
    -------
    A list of ``os.path.join(path, name)`` for every matching entry, sorted
    by entry name. The paths are absolute only when ``path`` itself is
    absolute. Use os.path.basename to get the filename.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible for callers that slice it by position.
    return [os.path.join(path, x) for x in sorted(os.listdir(path)) if re.search(pattern, x)]


def print_filesize(abspath):
    """
    Print and return a table of file sizes.

    Parameters
    ----------
    abspath : list of paths to existing files

    Returns
    -------
    DataFrame indexed by "filename" (the base name of each path) with a
    single "size" column taken from ``os.stat(...).st_size``.
    """
    names = list(map(os.path.basename, abspath))
    sizes = [os.stat(file_path).st_size for file_path in abspath]

    table = pd.DataFrame({'filename': names, 'size': sizes})
    table = table.set_index('filename')

    print(table)
    return table

# Directory searched for the source .xlsx workbooks, resolved against the
# kernel's current working directory.
data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event')

In [3]:
# List every .xlsx workbook in data_dir, then report per-file and total sizes.
# The pattern is a raw string: '\.' is an invalid escape sequence in a plain
# string literal and triggers a warning on current Python versions.
wallets_fn = get_abspath(data_dir, r'\.xlsx$')
fsizes = print_filesize(wallets_fn)
print("total size:", round(sum(fsizes["size"]) / 1024**3, 2), "gb")

                                                         size
filename                                                     
alien-frens-evolution_success1.xlsx                  64074307
boredapekennelclub_success1.xlsx                    139048703
boredapeyachtclub_success1.xlsx                      61715648
clonex_success1.xlsx                                 44406751
coolcatsnft_A1.xlsx                                 241609963
coolcatsnft_A2.xlsx                                 240563942
coolcatsnft_A3.xlsx                                 101283300
coolcatsnft_AA1.xlsx                                124645577
coolcatsnft_AA11.xlsx                               125791182
coolcatsnft_AA2.xlsx                                123305452
coolcatsnft_AA22.xlsx                               126019118
coolcatsnft_補跑B.xlsx                                   868343
coolcatsnft_補跑E.xlsx                                  2109287
cryptoadz-by-gremplin_success1.xlsx                  89797994
cryptosk

Arbitrarily split the list of files

In [3]:
# Hand-picked cut points: batch i covers wallets_fn[_cuts[i]:_cuts[i + 1]].
_cuts = [0, 4, 7, 13, 17, 24, 28]
split_list = [wallets_fn[lo:hi] for lo, hi in zip(_cuts, _cuts[1:])]

... or read each Excel file and save it as a Feather file individually, before any post-read processing

In [None]:
# Convert every workbook to a Feather file with the same base name.
# Files are written to data_dir/feather, the directory the post-processing
# cell reads from; it is created first because to_feather does not create
# missing directories.
feather_dir = os.path.join(data_dir, 'feather')
os.makedirs(feather_dir, exist_ok=True)

for fn in wallets_fn:
    base_fn = os.path.basename(fn)
    # splitext swaps the extension regardless of its length.
    to_fn = os.path.join(feather_dir, os.path.splitext(base_fn)[0] + '.feather')

    print("... reading", base_fn, os.stat(fn).st_size)
    df = pd.read_excel(fn)
    df["filename"] = base_fn
    # reset_index() stores the row index as an "index" column; it is kept
    # because the post-processing cell drops a column of that name.
    df = df.reset_index()
    print("... saving", os.path.basename(to_fn))
    df.to_feather(to_fn)

# Post Processing

In [12]:
# Collect the Feather files under data_dir/feather. The pattern is a raw
# string because '\.' is an invalid escape sequence in a plain literal.
fs = get_abspath(os.path.join(data_dir, 'feather'), r'\.feather$')
# Generator expression: each file is read only when the generator is consumed,
# and it can be consumed once. Re-run this cell before concatenating again.
lst = (pd.read_feather(each) for each in fs)

In [13]:
# Stack all frames row-wise. Each frame keeps its own index labels, so labels
# repeat across files in the combined frame.
df = pd.concat(lst)

In [14]:
# Summarise the combined frame: number of entries, columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11322150 entries, 0 to 333345
Data columns (total 35 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   index                   int64  
 1   Unnamed: 0              int64  
 2   event_timestamp         object 
 3   event_type              object 
 4   token_id                object 
 5   num_sales               float64
 6   listing_time            object 
 7   token_owner_address     object 
 8   token_seller_address    object 
 9   from_account_address    object 
 10  deal_price              float64
 11  payment_token_symbol    object 
 12  payment_token_decimals  float64
 13  payment_token_usdprice  float64
 14  quantity                object 
 15  starting_price          float64
 16  ending_price            float64
 17  approved_account        float64
 18  asset_bundle            object 
 19  auction_type            object 
 20  bid_amount              float64
 21  transaction_hash        object 

In [15]:
# Distinct source workbooks present in the combined frame.
df["filename"].unique()

array(['alien-frens-evolution_success1.xlsx',
       'boredapekennelclub_success1.xlsx',
       'boredapeyachtclub_success1.xlsx', 'clonex_success1.xlsx',
       'coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx',
       'coolcatsnft_A3.xlsx', 'coolcatsnft_AA1.xlsx',
       'coolcatsnft_AA11.xlsx', 'coolcatsnft_AA2.xlsx',
       'coolcatsnft_AA22.xlsx', 'coolcatsnft_補跑B.xlsx',
       'coolcatsnft_補跑E.xlsx', 'cryptoadz-by-gremplin_success1.xlsx',
       'cryptoskulls_success1.xlsx', 'cyberkongz-vx_success1.xlsx',
       'doodlesofficial_success1.xlsx', 'hapeprime_success1.xlsx',
       'kaiju-kingz_success1.xlsx', 'karafuru_success1.xlsx',
       'lazy-lions_success1.xlsx', 'mekaverse_success1.xlsx',
       'mfers_success1.xlsx', 'mutant-ape-yacht-club_success1.xlsx',
       'phantabear_success1.xlsx', 'proof-moonbirds_success1.xlsx',
       'rektguy_rarelandnft_pieceofshit_ivedoneit_success1.xlsx',
       'world-of-women-nft_success1.xlsx'], dtype=object)

In [16]:
# Keep only the rows whose msg is "success", then remove the listed columns.
# This is one non-mutating chain: it avoids an in-place drop on a filtered
# slice and builds the result in a single expression.
df = (
    df.loc[df["msg"] == "success"]
    .drop(columns=["index", "Unnamed: 0", "pages", "msg", "FILTER", "next_param"])
)

In [17]:
# Summarise the filtered frame, including the non-null count of each column.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11312039 entries, 0 to 333345
Data columns (total 29 columns):
 #   Column                  Non-Null Count     Dtype  
---  ------                  --------------     -----  
 0   event_timestamp         11312039 non-null  object 
 1   event_type              11312039 non-null  object 
 2   token_id                11217880 non-null  object 
 3   num_sales               11217880 non-null  float64
 4   listing_time            10612413 non-null  object 
 5   token_owner_address     11217880 non-null  object 
 6   token_seller_address    11293040 non-null  object 
 7   from_account_address    3871 non-null      object 
 8   deal_price              11312025 non-null  float64
 9   payment_token_symbol    11307893 non-null  object 
 10  payment_token_decimals  11307998 non-null  float64
 11  payment_token_usdprice  11305808 non-null  float64
 12  quantity                11272470 non-null  object 
 13  starting_price          0 non-null        

In [18]:
# Distinct source workbooks still present after the filter.
df["filename"].unique()

array(['alien-frens-evolution_success1.xlsx',
       'boredapekennelclub_success1.xlsx',
       'boredapeyachtclub_success1.xlsx', 'clonex_success1.xlsx',
       'coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx',
       'coolcatsnft_A3.xlsx', 'coolcatsnft_AA1.xlsx',
       'coolcatsnft_AA11.xlsx', 'coolcatsnft_AA2.xlsx',
       'coolcatsnft_AA22.xlsx', 'coolcatsnft_補跑B.xlsx',
       'coolcatsnft_補跑E.xlsx', 'cryptoadz-by-gremplin_success1.xlsx',
       'cryptoskulls_success1.xlsx', 'cyberkongz-vx_success1.xlsx',
       'doodlesofficial_success1.xlsx', 'hapeprime_success1.xlsx',
       'kaiju-kingz_success1.xlsx', 'karafuru_success1.xlsx',
       'lazy-lions_success1.xlsx', 'mekaverse_success1.xlsx',
       'mfers_success1.xlsx', 'mutant-ape-yacht-club_success1.xlsx',
       'phantabear_success1.xlsx', 'proof-moonbirds_success1.xlsx',
       'rektguy_rarelandnft_pieceofshit_ivedoneit_success1.xlsx',
       'world-of-women-nft_success1.xlsx'], dtype=object)

In [19]:
# Drop the reference to the combined frame so its memory can be reclaimed.
del df