# Read and combine feather files into a dataframe

__Motivation:__ 

## Helper functions

In [1]:
import os
import re
import pandas as pd


def get_abspath(path, pattern):
    """
    Search directory non-recursively for filename by name patterns
    
    Parameters
    ----------
    path : 
    pattern : 

    Returns
    -------
    a list of absolute path, use os.path.basename to get the filename
    """
    return [os.path.join(path, x) for x in os.listdir(path) if re.search(pattern, x)]


def print_filesize(abspath):
    df=pd.DataFrame({'filename': [os.path.basename(x) for x in abspath],
                     'size': [os.stat(x).st_size for x in abspath]}) \
        .set_index('filename')
    print(df)
    print("total size:", round(sum(df["size"]) / 1024**3, 2), "gb")
    return df

data_dir = os.path.join(os.getcwd(), 'data', 'wallets successful event')

In [2]:
wallets_fn = get_abspath(os.path.join(data_dir, 'feather'), '\.feather$')
fsize = print_filesize(wallets_fn)

                                                         size
filename                                                     
alien-frens-evolution_success1.feather               46287690
boredapekennelclub_success1.feather                 102316282
boredapeyachtclub_success1.feather                   43145602
clonex_success1.feather                              31176802
coolcatsnft_A1.feather                              162405530
coolcatsnft_A2.feather                              161271826
coolcatsnft_A3.feather                               65333498
coolcatsnft_AA1.feather                              81152522
coolcatsnft_AA11.feather                             81240850
coolcatsnft_AA2.feather                              80436530
coolcatsnft_AA22.feather                             80853482
coolcatsnft_補跑B.feather                                506650
coolcatsnft_補跑E.feather                               1263210
cryptoadz-by-gremplin_success1.feather               64466754
cryptosk

## Read feather files into pandas.DataFrame 

In [3]:
lst = (pd.read_feather(each).assign(filename=os.path.basename(each)) for each in wallets_fn)
df = pd.concat(lst).drop("index", axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17357359 entries, 0 to 332720
Data columns (total 35 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   event_timestamp                   datetime64[ns]
 1   event_type                        object        
 2   token_id                          object        
 3   num_sales                         float64       
 4   listing_time                      datetime64[ns]
 5   token_owner_address               object        
 6   token_seller_address              object        
 7   from_account_address              object        
 8   deal_price                        float64       
 9   payment_token_symbol              object        
 10  payment_token_decimals            float64       
 11  payment_token_usdprice            float64       
 12  quantity                          float64       
 13  starting_price                    float64       
 14  ending_price      

\* _N.b._ The NFT_20 list contains 21 manually selected collections.

In [5]:
df.groupby("collection_slug").size().sort_values(ascending=False).head(21)

collection_slug
cryptokitties            158632
parallelalpha            109435
rarible                  106216
cool-cats-nft            102621
lazy-lions                87793
boredapeyachtclub         78261
world-of-women-nft        71285
mutant-ape-yacht-club     68643
pudgypenguins             68488
phantabear                63970
axie                      60612
bored-ape-kennel-club     59773
cryptoadz-by-gremplin     53066
adam-bomb-squad           51441
deadfellaz                51336
robotos-official          51056
thewickedcraniums         50947
ape-gang-old              50477
creatureworld             50375
meebits                   44634
cyberkongz-vx             41851
dtype: int64

## Remove duplicates

In [6]:
bad = df.duplicated(['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'], keep=False)
sum(bad)

9545520

Sort the problem subset by `event_timestamp` and `winner_account_address` followed by dropping the duplicates while keeping only the first occurrence 

In [8]:
good = df[bad].sort_values(['event_timestamp', 'winner_account_address']) \
    .drop_duplicates(subset=['event_timestamp', 'collection_slug', 'token_id', 'wallet_address_input'])
good.shape

(4305270, 35)

In [9]:
print('Percentage of duplicates removed:', (1 - good.shape[0] / sum(bad)) * 100)

Percentage of duplicates removed: 54.89748070298946


### Save the de-duped subset

In [10]:
df[~bad].shape

(7811839, 35)

In [11]:
deduped = pd.concat([df[~bad], good])
deduped.shape

(12117109, 35)

In [12]:
deduped.to_parquet(os.path.join(data_dir, 'wallets_successful_event.parquet'), compression='lz4')

_N.b._ parquet format performs nearly as fast as feather and is commonly used in Databricks environment.

In [13]:
print('parquet file size:',
      os.stat(os.path.join(data_dir, 'wallets_successful_event.parquet')).st_size / 1024**3)

parquet file size: 3.244157531298697
