# Read and combine feather files into a dataframe

__Motivation:__ 

## Read feather files into pandas.DataFrame 

In [1]:
# Helper functions

import os
import re
import pandas as pd


def get_abspath(path, pattern):
    """
    Search a directory non-recursively for entries whose name matches a pattern

    Parameters
    ----------
    path : str or path-like
        Directory to search. Only its immediate entries are examined;
        sub-directories are not descended into. A relative path is resolved
        against the current working directory.
    pattern : str or compiled regular expression
        Pattern tested against each entry name with ``re.search``, so it may
        match anywhere in the name unless anchored (e.g. ``r'\\.feather$'``).

    Returns
    -------
    a list of absolute path, sorted by entry name; use os.path.basename to
    get the filename
    """
    # Anchor once so the results are absolute even when `path` is relative.
    root = os.path.abspath(path)
    # os.listdir returns entries in arbitrary order; sort for reproducible results.
    return [os.path.join(root, name) for name in sorted(os.listdir(root)) if re.search(pattern, name)]


def print_filesize(abspath):
    """
    Print and return a table of file sizes

    Parameters
    ----------
    abspath : list of str
        Paths of the files to measure.

    Returns
    -------
    pandas.DataFrame indexed by 'filename' (the basename of each path) with a
    single 'size' column holding the size in bytes
    """
    names = [os.path.basename(p) for p in abspath]
    sizes = [os.path.getsize(p) for p in abspath]
    table = pd.DataFrame({'filename': names, 'size': sizes}).set_index('filename')
    print(table)
    return table

# Folder holding the wallet event data, located under the current working directory.
data_dir = os.path.join(
    os.getcwd(),
    'data',
    'wallets successful event',
)

In [2]:
# Collect every *.feather file in the 'feather' sub-folder and report its size.
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped dot
# instead of being an invalid string escape sequence.
wallets_fn = get_abspath(os.path.join(data_dir, 'feather'), r'\.feather$')
fsize = print_filesize(wallets_fn)

                                                         size
filename                                                     
alien-frens-evolution_success1.feather               46287690
boredapekennelclub_success1.feather                 102316282
boredapeyachtclub_success1.feather                   43145602
clonex_success1.feather                              31176802
coolcatsnft_A1.feather                              162405530
coolcatsnft_A2.feather                              161271826
coolcatsnft_A3.feather                               65333498
coolcatsnft_AA1.feather                              81152522
coolcatsnft_AA11.feather                             81240850
coolcatsnft_AA2.feather                              80436530
coolcatsnft_AA22.feather                             80853482
coolcatsnft_補跑B.feather                                506650
coolcatsnft_補跑E.feather                               1263210
cryptoadz-by-gremplin_success1.feather               64466754
cryptosk

In [7]:
# Read each feather file lazily, tagging its rows with the source file name,
# then stack everything into one frame and discard the stored "index" column.
lst = (
    pd.read_feather(feather_path).assign(filename=os.path.basename(feather_path))
    for feather_path in wallets_fn
)
df = pd.concat(lst).drop(columns="index")

In [8]:
# Print a summary of the combined frame (index range, columns and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11312039 entries, 0 to 332720
Data columns (total 29 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   event_timestamp         datetime64[ns]
 1   event_type              object        
 2   token_id                object        
 3   num_sales               float64       
 4   listing_time            datetime64[ns]
 5   token_owner_address     object        
 6   token_seller_address    object        
 7   from_account_address    object        
 8   deal_price              float64       
 9   payment_token_symbol    object        
 10  payment_token_decimals  float64       
 11  payment_token_usdprice  float64       
 12  quantity                float64       
 13  starting_price          float64       
 14  ending_price            float64       
 15  approved_account        float64       
 16  asset_bundle            object        
 17  auction_type            object        
 18  bi

In [18]:
# Load the NFT_20 list from the working directory; each collection's slug is
# taken as the final path component of its collection URL.
nft20 = pd.read_csv(os.path.join(os.getcwd(), 'NFT_20_list.csv'))
nft20_collection_slug = nft20['collection_url'].map(os.path.basename).tolist()
nft20_collection_slug

['boredapeyachtclub',
 'mutant-ape-yacht-club',
 'azuki',
 'clonex',
 'proof-moonbirds',
 'doodles-official',
 'meebits',
 'cool-cats-nft',
 'bored-ape-kennel-club',
 'cryptoadz-by-gremplin',
 'world-of-women-nft',
 'hapeprime',
 'mekaverse',
 'karafuru',
 'invisiblefriends',
 'mfers',
 'phantabear',
 'cyberkongz-vx',
 'coolpetsnft',
 'lazy-lions',
 'kaiju-kingz']

In [19]:
# Number of collection slugs extracted from the NFT_20 list.
len(nft20_collection_slug)

21

\* _N.b._ The NFT_20 list contains 21 manually selected collections.

In [24]:
# Row count per collection over the full combined frame, 21 largest first.
(
    df.groupby("collection_slug")
    .size()
    .sort_values(ascending=False)
    .head(21)
)

collection_slug
cryptokitties            87113
cool-cats-nft            75452
parallelalpha            72462
rarible                  65798
boredapeyachtclub        50417
mutant-ape-yacht-club    50392
lazy-lions               45897
pudgypenguins            44149
world-of-women-nft       41750
cryptoadz-by-gremplin    38680
deadfellaz               35854
creatureworld            35487
robotos-official         35063
adam-bomb-squad          34632
bored-ape-kennel-club    34503
phantabear               32451
thewickedcraniums        32349
ape-gang-old             31988
axie                     31580
cyberkongz-vx            30596
doodles-official         29697
dtype: int64

In [15]:
# Keep only the rows whose collection is on the NFT_20 list, then summarise
# the subset including per-column non-null counts.
in_nft20 = df["collection_slug"].isin(nft20_collection_slug)
nft20_wallets = df.loc[in_nft20]
nft20_wallets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 608899 entries, 214 to 332710
Data columns (total 29 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   event_timestamp         608899 non-null  datetime64[ns]
 1   event_type              608899 non-null  object        
 2   token_id                608309 non-null  object        
 3   num_sales               608309 non-null  float64       
 4   listing_time            562462 non-null  datetime64[ns]
 5   token_owner_address     608309 non-null  object        
 6   token_seller_address    608899 non-null  object        
 7   from_account_address    150 non-null     object        
 8   deal_price              608899 non-null  float64       
 9   payment_token_symbol    608777 non-null  object        
 10  payment_token_decimals  608798 non-null  float64       
 11  payment_token_usdprice  608777 non-null  float64       
 12  quantity                6088

In [26]:
# Row count per collection within the NFT_20 subset, largest first.
(
    nft20_wallets.groupby("collection_slug")
    .size()
    .sort_values(ascending=False)
    .head(21)
)

collection_slug
cool-cats-nft            75452
boredapeyachtclub        50417
mutant-ape-yacht-club    50392
lazy-lions               45897
world-of-women-nft       41750
cryptoadz-by-gremplin    38680
bored-ape-kennel-club    34503
phantabear               32451
cyberkongz-vx            30596
doodles-official         29697
meebits                  23286
mekaverse                22296
clonex                   20165
mfers                    19977
coolpetsnft              18974
azuki                    18741
hapeprime                16552
kaiju-kingz              14103
karafuru                 12433
proof-moonbirds           9042
invisiblefriends          3495
dtype: int64

In [16]:
# Count of distinct wallet addresses appearing in the NFT_20 subset.
nft20_wallets["wallet_address_input"].nunique()

84641

In [25]:
# Distinct wallet addresses per collection in the NFT_20 subset, largest first.
(
    nft20_wallets.groupby("collection_slug")["wallet_address_input"]
    .nunique()
    .sort_values(ascending=False)
)

collection_slug
mutant-ape-yacht-club    15838
lazy-lions               10237
cool-cats-nft             9346
boredapeyachtclub         9170
world-of-women-nft        8790
phantabear                8767
mekaverse                 8461
cryptoadz-by-gremplin     7855
hapeprime                 7759
bored-ape-kennel-club     7751
doodles-official          7662
cyberkongz-vx             5428
clonex                    5424
kaiju-kingz               5157
coolpetsnft               4873
karafuru                  4644
mfers                     4636
meebits                   4378
proof-moonbirds           4171
azuki                     3577
invisiblefriends          1514
Name: wallet_address_input, dtype: int64

## Write `DataFrame` to parquet format and read it (WIP)