# Split-Read 1.3 GB of XLSX and write to feather file format for faster subsequent read

Separately reading 1.3 GB of large Excel files into pandas dataframe and
write to feather with minimum post processing

## Helper functions

In [1]:
import os
import re
import pandas as pd


def read_combine(io):
    """
    Read a list of Excel files and combine them into a panadas DataFrame
    
    Parameters
    ----------
        io (list): list of fully qualified filenames
        
    Returns
    -------
    DataFrame with filenames appended at the last column.
    """
    list_df = []
    for each in io:
        # print("...reading", os.path.basename(each)) 
        df = pd.read_excel(each)
        df["filename"] = os.path.basename(each)
        list_df.append(df)
    
    return pd.concat(list_df)


def get_abspath(path, pattern):
    """
    Search directory non-recursively for filename by name patterns
    
    Parameters
    ----------
    path : 
    pattern : 

    Returns
    -------
    a list of absolute path, use os.path.basename to get the filename
    """
    return [os.path.join(path, x) for x in os.listdir(path) if re.search(pattern, x)]


def print_filesize(abspath):
    df=pd.DataFrame({'filename': [os.path.basename(x) for x in abspath],
                     'size': [os.stat(x).st_size for x in abspath]}) \
        .set_index('filename')
    print(df)
    return df

data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event', 'NFT20_success12錢包補跑')

## Excel files

In [2]:
wallets_fn = get_abspath(data_dir, '\.xlsx$')
fsizes = print_filesize(wallets_fn)
print("total size:", round(sum(fsizes["size"]) / 1024**3, 2), "gb")

                                 size
filename                             
NFT20_success12錢包補跑10.xlsx  116457078
NFT20_success12錢包補跑11.xlsx   45945607
NFT20_success12錢包補跑12.xlsx  208155032
NFT20_success12錢包補跑13.xlsx   49759561
NFT20_success12錢包補跑14.xlsx   17605926
NFT20_success12錢包補跑15.xlsx   19959764
NFT20_success12錢包補跑16.xlsx   42507839
NFT20_success12錢包補跑3.xlsx    80994161
NFT20_success12錢包補跑4.xlsx   187439764
NFT20_success12錢包補跑5.xlsx   109275195
NFT20_success12錢包補跑6.xlsx   180793046
NFT20_success12錢包補跑7.xlsx   170969390
NFT20_success12錢包補跑8.xlsx    97641792
NFT20_success12錢包補跑9.xlsx    48691456
total size: 1.28 gb


# Read Excel files into pandas.DataFrame and save as feather

...read each Excel and save as feather individually before any post read processing

In [3]:
for fn in wallets_fn:
    to_fn = os.path.join(data_dir, os.path.basename(fn)[:-4] + 'feather')

    print("... reading", os.path.basename(fn), os.stat(fn).st_size)
    df = pd.read_excel(fn)
    df["filename"] = os.path.basename(fn)
    df.reset_index(inplace=True)
    print("... saving", os.path.basename(fn), os.stat(fn).st_size)
    df.to_feather(to_fn)

... reading NFT20_success12錢包補跑10.xlsx 116457078
... saving NFT20_success12錢包補跑10.xlsx 116457078
... reading NFT20_success12錢包補跑11.xlsx 45945607
... saving NFT20_success12錢包補跑11.xlsx 45945607
... reading NFT20_success12錢包補跑12.xlsx 208155032
... saving NFT20_success12錢包補跑12.xlsx 208155032
... reading NFT20_success12錢包補跑13.xlsx 49759561
... saving NFT20_success12錢包補跑13.xlsx 49759561
... reading NFT20_success12錢包補跑14.xlsx 17605926
... saving NFT20_success12錢包補跑14.xlsx 17605926
... reading NFT20_success12錢包補跑15.xlsx 19959764
... saving NFT20_success12錢包補跑15.xlsx 19959764
... reading NFT20_success12錢包補跑16.xlsx 42507839
... saving NFT20_success12錢包補跑16.xlsx 42507839
... reading NFT20_success12錢包補跑3.xlsx 80994161
... saving NFT20_success12錢包補跑3.xlsx 80994161
... reading NFT20_success12錢包補跑4.xlsx 187439764
... saving NFT20_success12錢包補跑4.xlsx 187439764
... reading NFT20_success12錢包補跑5.xlsx 109275195
... saving NFT20_success12錢包補跑5.xlsx 109275195
... reading NFT20_success12錢包補跑6.xlsx 180793046


# Post Processing

In [4]:
fs = get_abspath(data_dir, '\.feather$')
lst = (pd.read_feather(each) for each in fs)

In [5]:
df = pd.concat(lst)

In [6]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5030747 entries, 0 to 164693
Data columns (total 40 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   index                             5030747 non-null  int64  
 1   Unnamed: 0                        5030747 non-null  int64  
 2   event_timestamp                   5029616 non-null  object 
 3   event_type                        5029616 non-null  object 
 4   token_id                          4989545 non-null  object 
 5   num_sales                         4989545 non-null  float64
 6   listing_time                      4667175 non-null  object 
 7   token_owner_address               4989545 non-null  object 
 8   token_seller_address              5016702 non-null  object 
 9   from_account_address              2519 non-null     object 
 10  deal_price                        5029609 non-null  float64
 11  payment_token_symbol              5028

In [8]:
df = df[df.msg == "success"]
df.drop(["index", "Unnamed: 0", "pages", "msg", "next_param"], axis=1, inplace=True)

In [9]:
df.event_timestamp = pd.to_datetime(df.event_timestamp)
df.listing_time = pd.to_datetime(df.listing_time)
df.quantity = pd.to_numeric(df.quantity, errors='coerce')

In [10]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5029616 entries, 0 to 164693
Data columns (total 35 columns):
 #   Column                            Non-Null Count    Dtype         
---  ------                            --------------    -----         
 0   event_timestamp                   5029616 non-null  datetime64[ns]
 1   event_type                        5029616 non-null  object        
 2   token_id                          4989545 non-null  object        
 3   num_sales                         4989545 non-null  float64       
 4   listing_time                      4667175 non-null  datetime64[ns]
 5   token_owner_address               4989545 non-null  object        
 6   token_seller_address              5016702 non-null  object        
 7   from_account_address              2519 non-null     object        
 8   deal_price                        5029609 non-null  float64       
 9   payment_token_symbol              5028062 non-null  object        
 10  payment_token_decim

# Split and write to feather

In [11]:
grp = df.groupby("filename")

for name, group in grp:
    print(name)
    g=group.reset_index().drop("filename", axis=1)
    g.to_feather(os.path.join(data_dir, 'feather', os.path.basename(name)[:-4] + 'feather'), compression='zstd')

NFT20_success12錢包補跑10.xlsx
NFT20_success12錢包補跑11.xlsx
NFT20_success12錢包補跑12.xlsx
NFT20_success12錢包補跑13.xlsx
NFT20_success12錢包補跑14.xlsx
NFT20_success12錢包補跑15.xlsx
NFT20_success12錢包補跑16.xlsx
NFT20_success12錢包補跑3.xlsx
NFT20_success12錢包補跑4.xlsx
NFT20_success12錢包補跑5.xlsx
NFT20_success12錢包補跑6.xlsx
NFT20_success12錢包補跑7.xlsx
NFT20_success12錢包補跑8.xlsx
NFT20_success12錢包補跑9.xlsx


# Checking out the data

In [12]:
df.groupby(by="collection_slug", as_index=False).size().sort_values(by="size", ascending=False).head(20)

Unnamed: 0,collection_slug,size
7978,cryptokitties,54789
18018,lazy-lions,41189
26328,rarible,32799
23917,parallelalpha,29807
34954,world-of-women-nft,28703
24278,phantabear,27144
2965,axie,26667
4728,boredapeyachtclub,24912
4598,bored-ape-kennel-club,23312
6656,cool-cats-nft,23249


In [13]:
top20_nft = df.groupby(by="collection_slug").size().sort_values(ascending=False).iloc[:20].index
top20_nft

Index(['cryptokitties', 'lazy-lions', 'rarible', 'parallelalpha',
       'world-of-women-nft', 'phantabear', 'axie', 'boredapeyachtclub',
       'bored-ape-kennel-club', 'cool-cats-nft', 'pudgypenguins',
       'thewickedcraniums', 'ape-gang-old', 'fameladysquad',
       'mutant-ape-yacht-club', 'robotos-official', 'adam-bomb-squad',
       'deadfellaz', 'cryptoadz-by-gremplin', 'creatureworld'],
      dtype='object', name='collection_slug')