# Read and combine feather files into a dataframe

__Motivation:__ 

## Read feather files into pandas.DataFrame 

In [1]:
# Helper functions

import os
import re
import pandas as pd


def get_abspath(path, pattern):
    """
    Search directory non-recursively for filename by name patterns
    
    Parameters
    ----------
    path : 
    pattern : 

    Returns
    -------
    a list of absolute path, use os.path.basename to get the filename
    """
    return [os.path.join(path, x) for x in os.listdir(path) if re.search(pattern, x)]


def print_filesize(abspath):
    df=pd.DataFrame({'filename': [os.path.basename(x) for x in abspath],
                     'size': [os.stat(x).st_size for x in abspath]}) \
        .set_index('filename')
    print(df)
    return df

data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event')

In [2]:
wallets_fn = get_abspath(os.path.join(data_dir, 'feather'), '\.feather$')
fsize = print_filesize(wallets_fn)

                                                         size
filename                                                     
alien-frens-evolution_success1.feather               46287690
boredapekennelclub_success1.feather                 102316282
boredapeyachtclub_success1.feather                   43145602
clonex_success1.feather                              31176802
coolcatsnft_A1.feather                              162405530
coolcatsnft_A2.feather                              161271826
coolcatsnft_A3.feather                               65333498
coolcatsnft_AA1.feather                              81152522
coolcatsnft_AA11.feather                             81240850
coolcatsnft_AA2.feather                              80436530
coolcatsnft_AA22.feather                             80853482
coolcatsnft_補跑B.feather                                506650
coolcatsnft_補跑E.feather                               1263210
cryptoadz-by-gremplin_success1.feather               64466754
cryptosk

In [3]:
lst = (pd.read_feather(each).assign(filename=os.path.basename(each)) for each in wallets_fn)
df = pd.concat(lst).drop("index", axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16341655 entries, 0 to 332720
Data columns (total 35 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   event_timestamp                   datetime64[ns]
 1   event_type                        object        
 2   token_id                          object        
 3   num_sales                         float64       
 4   listing_time                      datetime64[ns]
 5   token_owner_address               object        
 6   token_seller_address              object        
 7   from_account_address              object        
 8   deal_price                        float64       
 9   payment_token_symbol              object        
 10  payment_token_decimals            float64       
 11  payment_token_usdprice            float64       
 12  quantity                          float64       
 13  starting_price                    float64       
 14  ending_price      

In [5]:
nft20 = pd.read_csv(os.path.join(os.getcwd(), 'NFT_20_list.csv'))
nft20_collection_slug = [os.path.basename(url) for url in nft20.collection_url]
nft20_collection_slug

['boredapeyachtclub',
 'mutant-ape-yacht-club',
 'azuki',
 'clonex',
 'proof-moonbirds',
 'doodles-official',
 'meebits',
 'cool-cats-nft',
 'bored-ape-kennel-club',
 'cryptoadz-by-gremplin',
 'world-of-women-nft',
 'hapeprime',
 'mekaverse',
 'karafuru',
 'invisiblefriends',
 'mfers',
 'phantabear',
 'cyberkongz-vx',
 'coolpetsnft',
 'lazy-lions',
 'kaiju-kingz']

In [6]:
len(nft20_collection_slug)

21

\* _N.b._ The NFT_20 list contains 21 manually selected collections.

In [7]:
df.groupby("collection_slug").size().sort_values(ascending=False).head(21)

collection_slug
cryptokitties            141902
parallelalpha            102269
cool-cats-nft             98701
rarible                   98597
lazy-lions                87086
boredapeyachtclub         75329
world-of-women-nft        70453
pudgypenguins             65318
mutant-ape-yacht-club     64945
phantabear                59595
axie                      58247
bored-ape-kennel-club     57815
cryptoadz-by-gremplin     51987
robotos-official          49584
deadfellaz                49579
adam-bomb-squad           48830
creatureworld             48606
thewickedcraniums         48601
ape-gang-old              47822
cyberkongz-vx             41806
supducks                  40003
dtype: int64

In [8]:
nft20_wallets = df[df.collection_slug.isin(nft20_collection_slug)]
nft20_wallets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 894186 entries, 214 to 332710
Data columns (total 35 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   event_timestamp                   894186 non-null  datetime64[ns]
 1   event_type                        894186 non-null  object        
 2   token_id                          893300 non-null  object        
 3   num_sales                         893300 non-null  float64       
 4   listing_time                      826665 non-null  datetime64[ns]
 5   token_owner_address               893300 non-null  object        
 6   token_seller_address              894186 non-null  object        
 7   from_account_address              276 non-null     object        
 8   deal_price                        894186 non-null  float64       
 9   payment_token_symbol              894002 non-null  object        
 10  payment_token_decimals        

In [9]:
nft20_wallets.groupby("collection_slug").size().sort_values(ascending=False).head(21)

collection_slug
cool-cats-nft            98701
lazy-lions               87086
boredapeyachtclub        75329
world-of-women-nft       70453
mutant-ape-yacht-club    64945
phantabear               59595
bored-ape-kennel-club    57815
cryptoadz-by-gremplin    51987
cyberkongz-vx            41806
doodles-official         39065
meebits                  35632
mekaverse                30119
coolpetsnft              28955
clonex                   27678
azuki                    26644
hapeprime                24612
mfers                    23897
kaiju-kingz              19443
karafuru                 15175
proof-moonbirds          11034
invisiblefriends          4215
dtype: int64

In [10]:
nft20_wallets.wallet_address_input.nunique()

104947

In [11]:
nft20_wallets.groupby("collection_slug")["wallet_address_input"].nunique().sort_values(ascending=False)

collection_slug
mutant-ape-yacht-club    16876
lazy-lions               12289
cool-cats-nft            11223
world-of-women-nft       10491
doodles-official          9924
boredapeyachtclub         9800
phantabear                9599
bored-ape-kennel-club     9368
mekaverse                 9345
cryptoadz-by-gremplin     8925
hapeprime                 8785
coolpetsnft               8350
clonex                    7989
meebits                   6936
cyberkongz-vx             6824
azuki                     6268
kaiju-kingz               5882
mfers                     5867
karafuru                  5709
proof-moonbirds           4887
invisiblefriends          1816
Name: wallet_address_input, dtype: int64

>> 這邊請惟翔研究如何刪除duplicated rows systematically

In [12]:
bad = nft20_wallets.duplicated(['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'], keep=False)
sum(bad)

612623

In [13]:
nft20_wallets[bad].sort_values('event_timestamp').set_index('filename')

Unnamed: 0_level_0,event_timestamp,event_type,token_id,num_sales,listing_time,token_owner_address,token_seller_address,from_account_address,deal_price,payment_token_symbol,...,created_date,collection_slug,contract_address,wallet_address_input,custom_event_name,dev_fee_payment_event,dev_seller_fee_basis_points,transaction_from_account_address,transaction_to_account_address,winner_account_address
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
coolcatsnft_AA22.feather,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,WETH,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,,,,,
NFT20_success12錢包補跑12.feather,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,WETH,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,"{'asset': None, 'asset_bundle': None, 'event_t...",0.0,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0
coolcatsnft_A2.feather,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,WETH,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,,,,,
NFT20_success12錢包補跑12.feather,2021-05-01 00:15:48,successful,586,5.0,2021-04-30 23:36:58.000000,0xf896527c49b44aab3cf22ae356fa3af8e331f280,0x88be3fa60ede9f532af10aba5690dfc254db929b,,1.000000e+17,ETH,...,2021-05-01T00:16:17.700252,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x88be3fa60ede9f532af10aba5690dfc254db929b,,"{'asset': None, 'asset_bundle': None, 'event_t...",0.0,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0
boredapeyachtclub_success1.feather,2021-05-01 00:15:48,successful,586,5.0,2021-04-30 23:36:58.000000,0xf896527c49b44aab3cf22ae356fa3af8e331f280,0x88be3fa60ede9f532af10aba5690dfc254db929b,,1.000000e+17,ETH,...,2021-05-01T00:16:17.700252,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x88be3fa60ede9f532af10aba5690dfc254db929b,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NFT20_success12錢包補跑10.feather,2022-06-02 01:34:58,successful,9443,2.0,2022-06-01 22:31:32.794843,0x8f46d017455920ecbf72d04d95505208aaf03581,0xe3f663418251186888935dc1c4979fa3a3da1bac,,2.160000e+19,ETH,...,2022-06-02T01:35:54.717142,proof-moonbirds,0x7f268357a8c2552623316e2562d90e642bb538e5,0x8f46d017455920ecbf72d04d95505208aaf03581,,"{'asset': None, 'asset_bundle': None, 'event_t...",500.0,0x8f46d017455920ecbf72d04d95505208aaf03581,0x7f268357a8c2552623316e2562d90e642bb538e5,0x8f46d017455920ecbf72d04d95505208aaf03581
NFT20_success12錢包補跑10.feather,2022-06-05 15:37:39,successful,7167,5.0,2022-06-05 07:16:44.000000,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,4.190000e+18,ETH,...,2022-06-05T15:37:46.798515,world-of-women-nft,0x7f268357a8c2552623316e2562d90e642bb538e5,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,,400.0,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x7f268357a8c2552623316e2562d90e642bb538e5,0x0cc901998f14e88b7669e7a01863acc8f64905f7
NFT20_success12錢包補跑8.feather,2022-06-05 15:37:39,successful,7167,5.0,2022-06-05 07:16:44.000000,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,4.190000e+18,ETH,...,2022-06-05T15:37:46.798515,world-of-women-nft,0x7f268357a8c2552623316e2562d90e642bb538e5,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,,400.0,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x7f268357a8c2552623316e2562d90e642bb538e5,0x0cc901998f14e88b7669e7a01863acc8f64905f7
NFT20_success12錢包補跑14.feather,2022-06-07 09:08:06,successful,7694,4.0,2022-06-07 08:40:07.000000,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d,0xf02aaa8e504daa449d4636a22c9be2b572f6b796,,2.660000e+18,ETH,...,2022-06-07T09:08:30.638440,mfers,0x7f268357a8c2552623316e2562d90e642bb538e5,0xf02aaa8e504daa449d4636a22c9be2b572f6b796,,,250.0,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d,0x7f268357a8c2552623316e2562d90e642bb538e5,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d


## Event Contracts

What can we do this this?

In [15]:
df.contract_address.unique()

array(['0x7f268357a8c2552623316e2562d90e642bb538e5',
       '0x7be8076f4ea4a4ad08075c2508e481d6c946d12b',
       '0xcd4ec7b66fbc029c116ba9ffb3e59351c20b5b06',
       '0xb1690c08e213a35ed9bab7b318de14420fb57d8c',
       '0x93f2a75d771628856f37f256da95e99ea28aafbe',
       '0xf4985070ce32b6b1994329df787d1acc9a2dd9e2',
       '0xfbeef911dc5821886e1dda71586d90ed28174b7d',
       '0x2947f98c42597966a0ec25e92843c09ac17fbaa7',
       '0x41a322b28d0ff354040e2cbc676f0320d8c8850d',
       '0x131aebbfe55bca0c9eaad4ea24d386c5c082dd58',
       '0x8e5660b4ab70168b5a6feea0e0315cb49c8cd539',
       '0x491c05896ef656d7fee0fb90ce487315ff0ac14c',
       '0x7e3abde9d9e80fa2d1a02c89e0eae91b233cde35',
       '0x080bf510fcbf18b91105470639e9561022937712',
       '0x7a6425c9b3f5521bfa5d71df710a2fb80508319b',
       '0x23b45c658737b12f1748ce56e9b6784b5e9f3ff8',
       '0x8c9f364bf7a56ed058fc63ef81c6cf09c833e656',
       '0x6d7c44773c52d396f43c2d511b81aa168e9a7a42',
       '0xb47e3cd837ddf8e4c57f05d70ab865de6e19

## Write `DataFrame` to parquet format and read it (WIP)