# Wallet Feature Engineering

The purpose of this notebook is to...

# Read Data

Description of the dataset ...

In [1]:
import os
import re
import datetime

# Locate the Cool Cats export directory relative to the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
cool_cats_nft_dir = os.path.join(data_dir, 'cool-cats-nft')

# Keep only the workbook parts named coolcatsnft_A<digit>.xlsx.
# - Raw string: '\d' in a normal string literal is an invalid escape sequence.
# - fullmatch: the name must end right after ".xlsx", so stray files such as
#   "coolcatsnft_A1.xlsx.bak" are not picked up.
# - sorted: os.listdir returns entries in arbitrary order; sorting makes the
#   load order (and therefore the combined frame) reproducible.
coolcats_wallets = sorted(
    name
    for name in os.listdir(cool_cats_nft_dir)
    if re.fullmatch(r'coolcatsnft_A\d\.xlsx', name)
)
print(coolcats_wallets)

['coolcatsnft_A1.xlsx', 'coolcatsnft_A2.xlsx', 'coolcatsnft_A3.xlsx']


In [2]:
import pandas as pd


def read_combine(io):
    """
    Read a list of Excel files and combine them into a single pandas DataFrame.

    Args:
        io (list): list of fully qualified filenames

    Returns:
        DataFrame: the rows of every file stacked in the order given. Each
        file keeps its own row index, so index labels repeat across files.

    Raises:
        ValueError: if ``io`` contains no filenames.
    """

    paths = list(io)
    if not paths:
        # Fail early with an actionable message instead of the generic error
        # raised from inside pd.concat.
        raise ValueError("read_combine: no files to read")

    return pd.concat([pd.read_excel(path) for path in paths])

In [3]:
import time

# Build the full path of every workbook part found in the data directory.
files = [os.path.join(cool_cats_nft_dir, x) for x in coolcats_wallets]

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted
# during this long-running load.
start_time = time.perf_counter()
wallets = read_combine(files)
total_time = time.perf_counter() - start_time
print("total minutes to load:", total_time / 60)

total minutes to load: 16.407136325041453


In [4]:
# Keep only the rows whose "msg" column equals "success".
wallets = wallets.loc[wallets["msg"] == "success"]

In [5]:
# Print the column names and dtypes of the filtered wallet events.
wallets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2398450 entries, 0 to 399421
Data columns (total 32 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   Unnamed: 1              int64  
 2   event_timestamp         object 
 3   event_type              object 
 4   token_id                object 
 5   num_sales               float64
 6   listing_time            object 
 7   token_owner_address     object 
 8   token_seller_address    object 
 9   deal_price              float64
 10  payment_token_symbol    object 
 11  payment_token_decimals  float64
 12  payment_token_usdprice  float64
 13  quantity                object 
 14  starting_price          float64
 15  ending_price            float64
 16  approved_account        float64
 17  asset_bundle            object 
 18  auction_type            object 
 19  bid_amount              float64
 20  transaction_hash        object 
 21  block_hash              object 


In [6]:
# Remove the columns that the feature engineering below does not use.
# Reassigning (rather than inplace=True) and passing errors="ignore" keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
wallets = wallets.drop(
    columns=[
        "Unnamed: 0", "Unnamed: 1", "starting_price", "ending_price",
        "approved_account", "bid_amount", "duration",
        "pages", "msg", "FILTER",
    ],
    errors="ignore",
)

# Generate Features

將錢包地址分組

暫時給定任一個錢包地址來進行以下特徵計算 (最後再用迴圈串起來)

In [7]:
# Group the events by the wallet address they were collected for.
sectors = wallets.groupby("wallet_address_input")

# Pull out a single wallet and renumber its rows 0..n-1.
df_temp3 = (
    sectors
    .get_group("0x5338035c008ea8c4b850052bc8dad6a33dc2206c")
    .reset_index(drop=True)
)

In [8]:
# Print columns, non-null counts and dtypes for the selected wallet's events.
df_temp3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5521 entries, 0 to 5520
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   event_timestamp         5521 non-null   object 
 1   event_type              5521 non-null   object 
 2   token_id                5503 non-null   object 
 3   num_sales               5503 non-null   float64
 4   listing_time            5472 non-null   object 
 5   token_owner_address     5503 non-null   object 
 6   token_seller_address    5521 non-null   object 
 7   deal_price              5521 non-null   float64
 8   payment_token_symbol    5521 non-null   object 
 9   payment_token_decimals  5521 non-null   float64
 10  payment_token_usdprice  5521 non-null   float64
 11  quantity                5521 non-null   object 
 12  asset_bundle            18 non-null     object 
 13  auction_type            0 non-null      object 
 14  transaction_hash        5521 non-null   

In [9]:
import numpy as np

# True on rows where the wallet under study is the seller of the token.
# .iloc[0] reads the first row by position instead of relying on a label 0.
wallet_is_seller = (
    df_temp3["token_seller_address"] == df_temp3["wallet_address_input"].iloc[0]
)
# Deal price converted to ETH: Eth = deal_price / (10**18).
# NOTE(review): assumes every payment token uses 18 decimals; the
# payment_token_decimals column is available if this needs generalising.
deal_price_eth = df_temp3["deal_price"] / 10**18

# Purchase price (0 on rows where the wallet is the seller).
df_temp3["cost"] = np.where(wallet_is_seller, 0, deal_price_eth)
# Sale price (0 on rows where the wallet is the buyer).
df_temp3["sellprice"] = np.where(wallet_is_seller, deal_price_eth, 0)
# Parse the event timestamp.
df_temp3["Datetime"] = pd.to_datetime(df_temp3["event_timestamp"])
# Buy/sell marker: "S" when the wallet is the seller, otherwise "B".
df_temp3["Buy_Sell"] = np.where(wallet_is_seller, "S", "B")
# Portfolio (inventory). Object dtype because dicts are written into this
# column later; np.nan replaces np.NaN, which was removed in NumPy 2.0.
df_temp3["Profolio"] = pd.Series(np.nan, index=df_temp3.index, dtype=object)
# Profit/loss. Float from the start because ETH differences are written later.
df_temp3["PL"] = 0.0
# Number of tokens held.
df_temp3["NFT_total_num"] = 0
# collection_slug + token_id as one key identifying a token held by the wallet.
df_temp3["collection_slug_tokenid"] = df_temp3["collection_slug"] + df_temp3["token_id"]
# Time a token was held between its purchase and its sale. Object dtype
# because Timedelta values are written into this column later.
df_temp3["HoldPeriod"] = pd.Series(np.nan, index=df_temp3.index, dtype=object)
# Float from the start because sums of ETH costs are written later.
df_temp3["Position"] = 0.0
df_temp3["Sell"] = 0

In [10]:
# Preview the first rows, including the newly initialised feature columns.
df_temp3.head()

Unnamed: 0,event_timestamp,event_type,token_id,num_sales,listing_time,token_owner_address,token_seller_address,deal_price,payment_token_symbol,payment_token_decimals,...,sellprice,Datetime,Buy_Sell,Profolio,PL,NFT_total_num,collection_slug_tokenid,HoldPeriod,Position,Sell
0,2022-05-04T03:29:18,successful,2977,1.0,2022-05-04T03:19:37,0x5338035c008ea8c4b850052bc8dad6a33dc2206c,0x4984fc170325e8fe57e9de1c2b74ce5eabb6f9da,8.45e+17,ETH,18.0,...,0.0,2022-05-04 03:29:18,B,,0,0,fragments-by-james-jean2977,,0,0
1,2022-05-04T03:29:18,successful,3016,1.0,2022-05-04T00:14:49,0x5338035c008ea8c4b850052bc8dad6a33dc2206c,0xc293bc1602efeba837cb240c49476e1d3fe0fd98,8.45e+17,ETH,18.0,...,0.0,2022-05-04 03:29:18,B,,0,0,fragments-by-james-jean3016,,0,0
2,2022-05-04T03:29:18,successful,4956,1.0,2022-05-03T19:39:05,0x5338035c008ea8c4b850052bc8dad6a33dc2206c,0xe2fb909159dea75b1520c382ca102989cdd1a276,8.4e+17,ETH,18.0,...,0.0,2022-05-04 03:29:18,B,,0,0,fragments-by-james-jean4956,,0,0
3,2022-05-04T03:29:18,successful,5078,1.0,2022-05-04T01:40:56,0x5338035c008ea8c4b850052bc8dad6a33dc2206c,0x8a45d09b2dbbf1657fb8c14561b6525443631d22,8.5e+17,ETH,18.0,...,0.0,2022-05-04 03:29:18,B,,0,0,fragments-by-james-jean5078,,0,0
4,2022-05-04T03:29:18,successful,5800,1.0,2022-05-04T02:52:43,0x5338035c008ea8c4b850052bc8dad6a33dc2206c,0xcc60f720388551bc9159cfed814a15de2f49d1e9,8.49e+17,ETH,18.0,...,0.0,2022-05-04 03:29:18,B,,0,0,fragments-by-james-jean5800,,0,0


In [11]:
porfolio_dict = {}      # collection_slug -> list of token_ids currently held
porfolio_costdict = {}  # collection_slug_tokenid -> purchase cost of tokens currently held
porfolio_datedict = {}  # collection_slug_tokenid -> purchase time (kept after a sale, overwritten on re-purchase)
count = 0               # number of tokens currently held
error = []              # [wallet, collection_slug_tokenid] for sells of a held token with no recorded cost

# The data is ordered newest to oldest, so iterate in reverse to accumulate
# from the oldest event to the newest.
for i in range(len(df_temp3) - 1, -1, -1):
    slug = df_temp3["collection_slug"][i]
    token = df_temp3["token_id"][i]
    slug_token = df_temp3["collection_slug_tokenid"][i]
    is_buy = df_temp3["Buy_Sell"][i] == "B"

    # Both flags describe the portfolio BEFORE this row is applied.
    slug_seen = slug in porfolio_dict
    token_held = slug_seen and token in porfolio_dict[slug]

    if is_buy and not token_held:
        # First purchase in a collection, or adding to an existing one:
        # inventory +1, remember the cost and the purchase time.
        porfolio_dict.setdefault(slug, []).append(token)
        count += 1
        porfolio_costdict[slug_token] = df_temp3["cost"][i]
        porfolio_datedict[slug_token] = df_temp3["Datetime"][i]
    elif token_held and not is_buy:
        # P/L realisation point: a buy followed by a sell. Inventory -1 and
        # the token leaves the portfolio.
        count -= 1
        porfolio_dict[slug].remove(token)
        if slug_token in porfolio_costdict:
            # The cost entry is dropped because the token has been sold.
            df_temp3.loc[i, "PL"] = df_temp3["sellprice"][i] - porfolio_costdict.pop(slug_token)
            # Time between the purchase and this sale.
            df_temp3.loc[i, "HoldPeriod"] = df_temp3["Datetime"][i] - porfolio_datedict[slug_token]
            # Sale marker.
            df_temp3.loc[i, "Sell"] = 1
        else:
            # Normally not reached: a held token without a recorded cost.
            error.append([df_temp3["wallet_address_input"][0], slug_token])
    # Remaining cases change no state:
    #   - sell of a token that was never bought here (it may have been
    #     transferred in from another wallet, so its cost is unknown);
    #   - buy of a token that is already held (should not happen if token ids
    #     are unique).

    df_temp3.loc[i, "NFT_total_num"] = count
    df_temp3.loc[i, "Position"] = sum(porfolio_costdict.values())

    # As before, a sell in a collection that has never been seen leaves
    # "Profolio" unset. Everywhere else store a COPY of the holdings: writing
    # porfolio_dict itself would make every row point at the same mutable
    # dict, so all rows would end up showing the final portfolio.
    # NOTE(review): one snapshot per row costs memory proportional to
    # rows x holdings.
    if slug_seen or is_buy:
        df_temp3.loc[i, "Profolio"] = [
            {held_slug: list(tokens) for held_slug, tokens in porfolio_dict.items()}
        ]

In [12]:
# Positive profit/loss flag
def positive_SIGN(row):
    """Return 1 when ``row['PL_sign']`` equals 1, otherwise 0."""
    return 1 if row['PL_sign'] == 1 else 0

# Negative profit/loss flag
def negative_SIGN(row):
    """Return 1 when ``row['PL_sign']`` equals -1, otherwise 0."""
    return 1 if row['PL_sign'] == -1 else 0

In [13]:
def _cum_from_oldest(series):
    """Running total from the last row up to the first (rows are newest-first)."""
    return series.iloc[::-1].cumsum().iloc[::-1]


def _safe_ratio(numerator, denominator):
    """Element-wise ratio with undefined results (0/0 -> NaN, x/0 -> +/-inf) set to 0.

    fillna(0) alone leaves +/-inf in place, which appears whenever the
    denominator is 0 while the numerator is not.
    """
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan).fillna(0)


# Cumulative P/L: accumulated from completed buy-then-sell round trips in this wallet.
df_temp3['cum_PL'] = _cum_from_oldest(df_temp3['PL'])
# Total revenue.
df_temp3['TotalRevenue'] = df_temp3['cum_PL'] - df_temp3["Position"]
# Sign of the P/L (-1, 0 or 1).
df_temp3["PL_sign"] = np.sign(df_temp3["PL"])
# Cumulative number of sells.
df_temp3["cum_Sell"] = _cum_from_oldest(df_temp3['Sell'])
# Flag rows with positive / negative P/L (vectorised instead of row-wise apply).
df_temp3["positive_sign"] = (df_temp3["PL_sign"] == 1).astype("int64")
df_temp3["negative_sign"] = (df_temp3["PL_sign"] == -1).astype("int64")
# Cumulative counts of positive / negative P/L rows.
df_temp3["cum_positive_sign"] = _cum_from_oldest(df_temp3['positive_sign'])
df_temp3["cum_negative_sign"] = _cum_from_oldest(df_temp3['negative_sign'])
# Win rate and loss rate; undefined values (no sells yet) are filled with 0.
df_temp3["winrate"] = _safe_ratio(df_temp3["cum_positive_sign"], df_temp3['cum_Sell'])
df_temp3["lossrate"] = _safe_ratio(df_temp3["cum_negative_sign"], df_temp3['cum_Sell'])
# Flag sells made by accepting a bid (payment token is WETH and the wallet sold).
df_temp3["Bid_sell"] = np.where((df_temp3["payment_token_symbol"]=="WETH")&(df_temp3["Buy_Sell"]=="S"), 1,0)
# Flag buys made through a bid (payment token is WETH and the wallet bought).
df_temp3["Bid_buy"] = np.where((df_temp3["payment_token_symbol"]=="WETH")&(df_temp3["Buy_Sell"]=="B"), 1,0)
# Cumulative number of bid buys / accepted-bid sells.
df_temp3["cum_Bid_buy"] = _cum_from_oldest(df_temp3['Bid_buy'])
df_temp3["cum_Bid_sell"] = _cum_from_oldest(df_temp3['Bid_sell'])

# Bid rates. Author's reading: a high bid-buy rate suggests skill at fishing
# with bids; a high bid-sell rate suggests lost confidence or being unable to
# resist a high offer.
df_temp3["Bid_sell_rate"] = _safe_ratio(df_temp3["cum_Bid_sell"], df_temp3["cum_Sell"])
# NOTE(review): the two ratios below were previously left with NaN/inf when
# NFT_total_num is 0; they now follow the same fill-with-0 convention as the
# other rates. Confirm that 0 is the intended value for "nothing held".
df_temp3["Bid_buy_rate"] = _safe_ratio(df_temp3["cum_Bid_buy"], df_temp3["NFT_total_num"])
# Tokens sold / tokens currently held.
df_temp3["sellposition_rate"] = _safe_ratio(df_temp3["cum_Sell"], df_temp3["NFT_total_num"])

In [14]:
# Print columns, non-null counts and dtypes after the feature columns were added.
df_temp3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5521 entries, 0 to 5520
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   event_timestamp          5521 non-null   object        
 1   event_type               5521 non-null   object        
 2   token_id                 5503 non-null   object        
 3   num_sales                5503 non-null   float64       
 4   listing_time             5472 non-null   object        
 5   token_owner_address      5503 non-null   object        
 6   token_seller_address     5521 non-null   object        
 7   deal_price               5521 non-null   float64       
 8   payment_token_symbol     5521 non-null   object        
 9   payment_token_decimals   5521 non-null   float64       
 10  payment_token_usdprice   5521 non-null   float64       
 11  quantity                 5521 non-null   object        
 12  asset_bundle             18 non-nu

In [15]:
# Show the "mean" row of the summary statistics produced by describe().
df_temp3.describe().loc["mean"]

num_sales                 1.017576e+02
deal_price                5.979868e+17
payment_token_decimals    1.800000e+01
payment_token_usdprice    2.390965e+03
block_number              1.335009e+07
is_private                1.279240e-03
cost                      1.693795e-01
sellprice                 4.286074e-01
PL                       -7.802300e-03
NFT_total_num             5.437930e+02
Position                  1.795545e+02
Sell                      9.871400e-02
cum_PL                    4.249531e+01
TotalRevenue             -1.370592e+02
PL_sign                   1.847491e-02
cum_Sell                  2.250926e+02
positive_sign             5.850389e-02
negative_sign             4.002898e-02
cum_positive_sign         1.614360e+02
cum_negative_sign         6.358920e+01
winrate                   7.560965e-01
lossrate                  2.435963e-01
Bid_sell                  8.694077e-03
Bid_buy                   1.811266e-04
cum_Bid_buy               2.807462e-01
cum_Bid_sell             