# Read and combine feather files into a dataframe

__Motivation:__ Subset only the asset events involving the 21 selected NFT collections
needed for the FOMO study,
thus reducing the memory and computation overhead of the subsequent machine-learning
steps.

Of the 17,357 thousand asset events sourced, the FOMO study requires only
approximately 600 thousand events involving the selected NFT collections.
The subset requires only 268 MB of memory.
Loading the full dataset would unnecessarily consume 4.8 GB of memory.

The source dataset contains many duplicated records. Duplicates within the subset
are removed and the final set is saved as a parquet file.

## Helper functions

In [1]:
import os
import re
import pandas as pd


def get_abspath(path, pattern):
    """
    Search a directory non-recursively for entries whose name matches a pattern.

    Parameters
    ----------
    path : str
        Directory to list. Only its immediate entries are examined.
    pattern : str or compiled pattern
        Regular expression applied to each entry name with ``re.search``.

    Returns
    -------
    list of str
        ``os.path.join(path, name)`` for every matching entry, ordered by
        entry name. The paths are absolute only if ``path`` itself is
        absolute. Use os.path.basename to get the filename.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and therefore the order in which files are later loaded) reproducible.
    return [os.path.join(path, x) for x in sorted(os.listdir(path)) if re.search(pattern, x)]


def print_filesize(abspath):
    """
    Print and return a table of file sizes.

    Parameters
    ----------
    abspath : iterable of str
        Paths of the files to measure; iterated twice, so pass a list.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'filename' (the basename of each path) with one column,
        'size', holding the size in bytes as reported by os.stat.

    The table is printed, followed by the total size divided by 1024**3 and
    rounded to two decimals (labelled "gb").
    """
    names = [os.path.basename(p) for p in abspath]
    sizes = [os.stat(p).st_size for p in abspath]
    df = pd.DataFrame({'filename': names, 'size': sizes}).set_index('filename')
    print(df)
    total_in_gb = round(sum(df["size"]) / 1024**3, 2)
    print("total size:", total_in_gb, "gb")
    return df

In [2]:
# Input/output folder, resolved against the current working directory.
data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event')
# Raw string: in a plain literal '\.' is an invalid escape sequence, which
# recent Python versions warn about. The regex itself is unchanged.
wallets_fn = get_abspath(os.path.join(data_dir, 'feather'), r'\.feather$')
fsize = print_filesize(wallets_fn)

                                                         size
filename                                                     
alien-frens-evolution_success1.feather               46287690
boredapekennelclub_success1.feather                 102316282
boredapeyachtclub_success1.feather                   43145602
clonex_success1.feather                              31176802
coolcatsnft_A1.feather                              162405530
coolcatsnft_A2.feather                              161271826
coolcatsnft_A3.feather                               65333498
coolcatsnft_AA1.feather                              81152522
coolcatsnft_AA11.feather                             81240850
coolcatsnft_AA2.feather                              80436530
coolcatsnft_AA22.feather                             80853482
coolcatsnft_補跑B.feather                                506650
coolcatsnft_補跑E.feather                               1263210
cryptoadz-by-gremplin_success1.feather               64466754
cryptosk

## Read feather files into pandas.DataFrame 

In [3]:
# Lazily read each feather file and tag every row with the name of the file it
# came from, then stack all frames into one.
# NOTE(review): concat keeps each file's own index, so index labels repeat
# across files in the combined frame.
lst = (
    pd.read_feather(feather_path).assign(filename=os.path.basename(feather_path))
    for feather_path in wallets_fn
)
df = pd.concat(lst)

In [4]:
# (rows, columns) of the combined frame
df.shape

(17357359, 36)

In [5]:
# Shallow measure: memory_usage() defaults to deep=False, so the contents of
# object (string) columns are not counted. Bytes are divided by 2**30.
print('Memeory used in GB:', df.memory_usage().sum() / 2**30)

Memeory used in GB: 4.784928880631924


Top collections in the dataset by the number of _successful_ asset events

In [6]:
# Per collection: number of events and number of distinct input wallets,
# largest collections first.
(
    df.groupby("collection_slug")
    .agg({'event_timestamp': 'size', 'wallet_address_input': 'nunique'})
    .sort_values('event_timestamp', ascending=False)
    .head(21)
)

Unnamed: 0_level_0,event_timestamp,wallet_address_input
collection_slug,Unnamed: 1_level_1,Unnamed: 2_level_1
cryptokitties,158632,3527
parallelalpha,109435,6209
rarible,106216,7344
cool-cats-nft,102621,12197
lazy-lions,87793,12510
boredapeyachtclub,78261,11311
world-of-women-nft,71285,10822
mutant-ape-yacht-club,68643,19147
pudgypenguins,68488,7350
phantabear,63970,10341


## Subset only selected collections

\* _N.b._ The NFT_20 list contains 21 manually selected collections.

Let `nft20_wallets` be the subset containing only events involving the selected collections.

In [7]:
# The slug is taken as the last path component of each collection URL.
nft20 = pd.read_csv(os.path.join(os.getcwd(), 'NFT_20_list.csv'))
nft20_collection_slug = [
    os.path.basename(collection_url)
    for collection_url in nft20['collection_url']
]
nft20_collection_slug

['boredapeyachtclub',
 'mutant-ape-yacht-club',
 'azuki',
 'clonex',
 'proof-moonbirds',
 'doodles-official',
 'meebits',
 'cool-cats-nft',
 'bored-ape-kennel-club',
 'cryptoadz-by-gremplin',
 'world-of-women-nft',
 'hapeprime',
 'mekaverse',
 'karafuru',
 'invisiblefriends',
 'mfers',
 'phantabear',
 'cyberkongz-vx',
 'coolpetsnft',
 'lazy-lions',
 'kaiju-kingz']

In [8]:
# Number of slugs parsed from the list
len(nft20_collection_slug)

21

In [9]:
# Keep only the events whose collection is one of the selected slugs.
nft20_wallets = df.loc[df['collection_slug'].isin(nft20_collection_slug)]
nft20_wallets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 944951 entries, 214 to 332710
Data columns (total 36 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   index                             944951 non-null  int64         
 1   event_timestamp                   944951 non-null  datetime64[ns]
 2   event_type                        944951 non-null  object        
 3   token_id                          944000 non-null  object        
 4   num_sales                         944000 non-null  float64       
 5   listing_time                      873217 non-null  datetime64[ns]
 6   token_owner_address               944000 non-null  object        
 7   token_seller_address              944951 non-null  object        
 8   from_account_address              305 non-null     object        
 9   deal_price                        944951 non-null  float64       
 10  payment_token_symbol          

_\* tip:_ another way to query dataframe memory usage

In [10]:
# Shallow memory footprint of the subset, index included, in units of 2**20 bytes.
nft20_wallets.memory_usage(index=True).sum() / 2**20

266.7479476928711

In [11]:
# Number of distinct input wallets in the subset
nft20_wallets['wallet_address_input'].nunique()

120178

### Size of each collection group
By number of successful events (i.e. transactions or trades) and number of unique wallets (`wallet_address_input`).

In [12]:
# Per-collection totals within the subset, sorted by event count (descending).
(
    nft20_wallets
    .groupby("collection_slug")
    .agg({'event_type': 'count', 'wallet_address_input': 'nunique'})
    .sort_values(by='event_type', ascending=False)
)

Unnamed: 0_level_0,event_type,wallet_address_input
collection_slug,Unnamed: 1_level_1,Unnamed: 2_level_1
cool-cats-nft,102621,12197
lazy-lions,87793,12510
boredapeyachtclub,78261,11311
world-of-women-nft,71285,10822
mutant-ape-yacht-club,68643,19147
phantabear,63970,10341
bored-ape-kennel-club,59773,10661
cryptoadz-by-gremplin,53066,9270
meebits,44634,9544
cyberkongz-vx,41851,6843


### Check for duplicates

In [13]:
# Flag every row that shares its (timestamp, collection, token, wallet) key with
# at least one other row; keep=False marks all members of a duplicate group.
bad = nft20_wallets.duplicated(
    subset=['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'],
    keep=False,
)
int(bad.sum())

620277

In [14]:
# Show the flagged rows side by side, labelled by the file each copy came from.
(
    nft20_wallets.loc[bad]
    .sort_values(['event_timestamp', 'winner_account_address'])
    .set_index('filename')
)

Unnamed: 0_level_0,index,event_timestamp,event_type,token_id,num_sales,listing_time,token_owner_address,token_seller_address,from_account_address,deal_price,...,created_date,collection_slug,contract_address,wallet_address_input,custom_event_name,dev_fee_payment_event,dev_seller_fee_basis_points,transaction_from_account_address,transaction_to_account_address,winner_account_address
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NFT20_success12錢包補跑12.feather,572182,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,"{'asset': None, 'asset_bundle': None, 'event_t...",0.0,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0
coolcatsnft_A2.feather,652355,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,,,,,
coolcatsnft_AA22.feather,152356,2021-04-30 12:38:59,successful,33,2.0,NaT,0xed2ab4948ba6a909a7751dec4f34f303eb8c7236,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,1.600000e+17,...,2021-04-30T12:39:43.636056,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x8e05bd9fa3059ec69c15bc1a6f4d94f0ac26ce00,,,,,,
NFT20_success12錢包補跑12.feather,167032,2021-05-01 00:15:48,successful,586,5.0,2021-04-30 23:36:58.000000,0xf896527c49b44aab3cf22ae356fa3af8e331f280,0x88be3fa60ede9f532af10aba5690dfc254db929b,,1.000000e+17,...,2021-05-01T00:16:17.700252,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x88be3fa60ede9f532af10aba5690dfc254db929b,,"{'asset': None, 'asset_bundle': None, 'event_t...",0.0,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x0b742783bfac8d4b6d332e5d1b63f433fcd8c0a0
boredapeyachtclub_success1.feather,266633,2021-05-01 00:15:48,successful,586,5.0,2021-04-30 23:36:58.000000,0xf896527c49b44aab3cf22ae356fa3af8e331f280,0x88be3fa60ede9f532af10aba5690dfc254db929b,,1.000000e+17,...,2021-05-01T00:16:17.700252,boredapeyachtclub,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,0x88be3fa60ede9f532af10aba5690dfc254db929b,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NFT20_success12錢包補跑4.feather,240751,2022-06-02 01:34:58,successful,9443,2.0,2022-06-01 22:31:32.794843,0x8f46d017455920ecbf72d04d95505208aaf03581,0xe3f663418251186888935dc1c4979fa3a3da1bac,,2.160000e+19,...,2022-06-02T01:35:54.717142,proof-moonbirds,0x7f268357a8c2552623316e2562d90e642bb538e5,0x8f46d017455920ecbf72d04d95505208aaf03581,,,500.0,0x8f46d017455920ecbf72d04d95505208aaf03581,0x7f268357a8c2552623316e2562d90e642bb538e5,0x8f46d017455920ecbf72d04d95505208aaf03581
NFT20_success12錢包補跑10.feather,179059,2022-06-05 15:37:39,successful,7167,5.0,2022-06-05 07:16:44.000000,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,4.190000e+18,...,2022-06-05T15:37:46.798515,world-of-women-nft,0x7f268357a8c2552623316e2562d90e642bb538e5,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,,400.0,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x7f268357a8c2552623316e2562d90e642bb538e5,0x0cc901998f14e88b7669e7a01863acc8f64905f7
NFT20_success12錢包補跑8.feather,82749,2022-06-05 15:37:39,successful,7167,5.0,2022-06-05 07:16:44.000000,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,4.190000e+18,...,2022-06-05T15:37:46.798515,world-of-women-nft,0x7f268357a8c2552623316e2562d90e642bb538e5,0x4fa94ca44d1c403c7a864a4e6fb885cc3f6e92bb,,,400.0,0x0cc901998f14e88b7669e7a01863acc8f64905f7,0x7f268357a8c2552623316e2562d90e642bb538e5,0x0cc901998f14e88b7669e7a01863acc8f64905f7
NFT20_success12錢包補跑14.feather,14354,2022-06-07 09:08:06,successful,7694,4.0,2022-06-07 08:40:07.000000,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d,0xf02aaa8e504daa449d4636a22c9be2b572f6b796,,2.660000e+18,...,2022-06-07T09:08:30.638440,mfers,0x7f268357a8c2552623316e2562d90e642bb538e5,0xf02aaa8e504daa449d4636a22c9be2b572f6b796,,,250.0,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d,0x7f268357a8c2552623316e2562d90e642bb538e5,0xbb0baa3ca39863a2a48c7d10fde5d166a60e363d


### Remove duplicates

According to Fred's recommendation:

1. subsetting the dataset should significantly reduce the duplicate event records,
2. followed by dropping any remaining duplicate rows.

In [15]:
# Wallet list read from one sheet of the workbook (the sheet name translates
# roughly to "participating wallets of the 20 projects").
wallet_addresses = pd.read_excel(os.path.join(os.getcwd(), 'data', '錢包整理_0617.xlsx'), sheet_name='20個項目的參與錢包')
# Alternative sheet in the same workbook (roughly "buyer wallets of the 20
# projects, winner_address"); kept for reference, not loaded here:
#wallet_addresses = pd.read_excel(os.path.join(os.getcwd(), 'data', '錢包整理_0617.xlsx'), sheet_name='20項目買方錢包winner_address')

In [16]:
# (rows, columns) of the wallet sheet
wallet_addresses.shape

(257460, 3)

In [17]:
# Column labels of the wallet sheet
wallet_addresses.columns

Index(['collection_slug', 'token_owner_address', '資料所存放的檔案'], dtype='object')

In [18]:
# One row per wallet address; the first occurrence of each address is kept.
y = wallet_addresses.drop_duplicates(subset='token_owner_address')
y

Unnamed: 0,collection_slug,token_owner_address,資料所存放的檔案
0,karafuru,0x24907c58e080f2a9d1f31f25d555aa3d5a5e3419,NFT20_success12錢包補跑9.xlsx
1,karafuru,0xf0e60ce560a7d7226369940e168bf00abd675467,NFT20_success12錢包補跑20.xlsx
2,karafuru,0x87cfdc4bfae569088bc1bcc0dd3c19f53da54193,NFT20_success12錢包補跑20.xlsx
3,karafuru,0x00167777e0daf9dc2d3a76195ab56e857ec67ab2,mekaverse_success1.xlsx
4,karafuru,0x94f055731e8588cf434de1591dc922490a86cd4e,world-of-women-nft_success1.xlsx
...,...,...,...
257437,cyberkongz-vx,0x212859fb34f5dbd67df64ddf086ec802fbd03a7c,
257443,cyberkongz-vx,0x714c3c5ff30a03f7a0aa4c891449fe331272775a,
257451,cyberkongz-vx,0x0fc87896cd69c65a92ec169034b5e9af3a1998ba,
257456,cyberkongz-vx,0x152a42d1382d213fa85a490cbf6c83f200468bee,


_\* N.b._ Duplicate `token_owner_address` values are removed from the table (above) before joining, so that the join cannot multiply event rows.

In [19]:
# Attach the source-file column of the wallet list to each event, matching the
# event's input wallet against the list's wallet address (inner join, which is
# merge's default).
X = nft20_wallets.merge(
    y[['token_owner_address', '資料所存放的檔案']],
    how='inner',
    left_on='wallet_address_input',
    right_on='token_owner_address',
)

In [20]:
# Count the duplicated rows in the merged frame. The mask gets its own name so
# that it does not overwrite `bad`: `bad` is indexed like nft20_wallets and is
# used to filter that frame, whereas this mask is indexed like X.
bad_merged = X.duplicated(['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'], keep=False)
int(bad_merged.sum())

620277

In [21]:
# Same check on the un-merged subset, for comparison with the merged count.
bad = nft20_wallets.duplicated(
    subset=['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'],
    keep=False,
)
int(bad.sum())

620277

_N.b._ The join above did not reduce the number of duplicated rows: the count is still 620,277, both with and without the merge.

In [22]:
# Duplicate mask over nft20_wallets, used in the following cells to split the
# subset into never-duplicated rows and duplicate groups.
# NOTE(review): identical to the previous cell's computation.
bad = nft20_wallets.duplicated(
    subset=['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'],
    keep=False,
)
int(bad.sum())

620277

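Sort the problem subset by `event_timestamp` and `winner_account_address`, then drop the duplicates, keeping only the first occurrence of each key. Because pandas sorts missing values last by default, a record with a non-null `winner_account_address` is kept in preference to a copy where that field is missing.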

In [23]:
# From the flagged rows keep exactly one per key. The sort decides which copy
# comes first and is therefore the one drop_duplicates retains.
good = (
    nft20_wallets.loc[bad]
    .sort_values(['event_timestamp', 'winner_account_address'])
    .drop_duplicates(
        subset=['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'],
        keep='first',
    )
)
good.shape

(276182, 36)

In [24]:
# Share of the flagged rows that were discarded, as a percentage.
print(
    'Percentage of duplicates removed:',
    (1 - len(good) / int(bad.sum())) * 100,
)

Percentage of duplicates removed: 55.474409014037285


### Save the de-duped subset

In [25]:
# Rows that never had a duplicate
nft20_wallets.loc[~bad].shape

(324674, 36)

In [26]:
# Never-duplicated rows plus the single survivor of each duplicate group.
deduped = pd.concat([nft20_wallets.loc[~bad], good], axis=0)
deduped.shape

(600856, 36)

In [27]:
# Persist the de-duplicated subset next to the input data, LZ4-compressed.
deduped.to_parquet(
    os.path.join(data_dir, 'nft20_success.parquet'),
    compression='lz4',
)

_N.b._ The parquet format performs nearly as fast as feather and is commonly used in Databricks environments.

In [28]:
# Size of the written file on disk, in bytes divided by 2**30.
print(
    'parquet file size:',
    os.stat(os.path.join(data_dir, 'nft20_success.parquet')).st_size / 2**30,
)

parquet file size: 0.15252696443349123
