In [285]:
import pandas as pd
import re
import numpy as np
print("libaries successfully imported :)")

libaries successfully imported :)


In [286]:
# The data spans from Start (YYYY-MM-DD): 2024-11-04 08:00:00 SGT End: 2025-10-13 07:59:59 SGT to align with the crypto price data 
# UTC 2024-11-04 00:00:00 to 2025-10-12 23:59:59

# We taking data from 2024-11-03 to 2025-10-13 to have a full day before the start date of 2024-11-05 for any rolling calculations

data = pd.read_json('../Data/whale_alert_031124_131025.json')

#Structure of the message data downloaded
message_data = data.get("messages")
message_data[0]

{'id': 82893,
 'type': 'message',
 'date': '2024-11-03T08:26:49',
 'date_unixtime': '1730593609',
 'edited': '2024-11-03T08:26:52',
 'edited_unixtime': '1730593612',
 'from': 'Whale Alert',
 'from_id': 'channel1309043988',
 'text': ['ðŸš¨ ðŸš¨ ðŸš¨ ðŸš¨ ðŸš¨  45,679 ',
  {'type': 'hashtag', 'text': '#WETH'},
  ' (113,879,074 USD) transferred from unknown wallet to unknown wallet\n',
  {'type': 'text_link',
   'text': 'Details',
   'href': 'https://whale-alert.io/transaction/ethereum/0x3d900dfb58f61b63145cc93d3852c02093e0adfb48b20968d09b09b00f6d19e5'},
  ''],
 'text_entities': [{'type': 'plain', 'text': 'ðŸš¨ ðŸš¨ ðŸš¨ ðŸš¨ ðŸš¨  45,679 '},
  {'type': 'hashtag', 'text': '#WETH'},
  {'type': 'plain',
   'text': ' (113,879,074 USD) transferred from unknown wallet to unknown wallet\n'},
  {'type': 'text_link',
   'text': 'Details',
   'href': 'https://whale-alert.io/transaction/ethereum/0x3d900dfb58f61b63145cc93d3852c02093e0adfb48b20968d09b09b00f6d19e5'},
  {'type': 'plain', 'text': ''}]}

In [287]:
# We will parse the data to extract relevant information used in the studies such as datetime, bitcoin transactions, bitcoin change and usd changes
# Store each message as a dictionary with 'date' and 'text' (joined string),
# but for hashtags, only keep the hashtag value (e.g., BTC)
# Remove emojis, hashtags, double spaces, and newlines from text
# Only include messages where text contains 'transferred' and 'BTC'
flattened_sentiment_df = []
emoji_pattern = re.compile('[\U00010000-\U0010ffff\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+', flags=re.UNICODE)
for msg in message_data:
    unixdatetime = msg.get("date_unixtime")
    text_items = []
    for item in msg.get("text"):
        if isinstance(item, dict):
            # Only keep the hashtag value if type is 'hashtag'
            if item.get('type') == 'hashtag' and 'text' in item:
                # Remove hashtag symbol
                text_items.append(item['text'].replace('#', ''))
        else:
            # Filter out unwanted string markers
            if not (str(item).startswith('type:') or str(item).startswith('text:')):
                text_items.append(item)
    text_str = ' '.join([str(x) for x in text_items])
    # Remove emojis
    text_str = emoji_pattern.sub('', text_str)
    # Remove newlines and double spaces
    text_str = text_str.replace('\n', ' ')
    text_str = re.sub(' +', ' ', text_str).strip()
    # Only include if 'transferred' and 'BTC' are in text (CHANGE FOR OTHER CRYPTO)
    if 'transferred' in text_str and 'BTC' in text_str:
        row_dict = {'unixdatetime': unixdatetime, 'text': text_str}
        flattened_sentiment_df.append(row_dict)
flattened_sentiment_df[0:10]

[{'unixdatetime': '1730595625',
  'text': '990 BTC (68,475,464 USD) transferred from unknown wallet to Binance'},
 {'unixdatetime': '1730649625',
  'text': '738 BTC (50,275,703 USD) transferred from Bitstamp to unknown wallet'},
 {'unixdatetime': '1730707295',
  'text': '2,000 BTC (137,247,624 USD) transferred from Bybit to unknown wallet'},
 {'unixdatetime': '1730731336',
  'text': '784 BTC (53,625,145 USD) transferred from unknown wallet to Binance'},
 {'unixdatetime': '1730741045',
  'text': '1,527 BTC (103,318,256 USD) transferred from unknown wallet to unknown wallet'},
 {'unixdatetime': '1730763570',
  'text': '2,000 BTC (135,826,428 USD) transferred from unknown wallet to unknown wallet'},
 {'unixdatetime': '1730768676',
  'text': '2,100 BTC (142,920,979 USD) transferred from unknown wallet to unknown wallet'},
 {'unixdatetime': '1730772690',
  'text': '1,837 BTC (124,931,007 USD) transferred from unknown wallet to unknown wallet'},
 {'unixdatetime': '1730777740',
  'text': '1,6

In [288]:
# we will now format the data such that the information required in the studies is extracted 
# Extract UTC timestamp, Cryptocurrency, From, To, number of coins, and value (USD) for the final dataframe
# Only keep the first text with parentheses containing 'USD' for the Value (USD) field
final_records = []
for row in flattened_sentiment_df:
    unix_ts = row['unixdatetime']
    utc_timestamp = pd.to_datetime(unix_ts, unit='s', utc=True)
    text = row['text']

    # Extract number of coins (first number with comma)
    num_coins_match = re.search(r'(\d{1,3}(?:,\d{3})*(?:\.\d+)?)', text)
    num_coins = num_coins_match.group(1) if num_coins_match else None
    num_coins = pd.to_numeric(num_coins.replace(',', '') if num_coins else None)

    # Only keep the first text with parentheses containing 'USD'
    value_match = re.search(r"\(([^()]*)\)", text)
    value_raw = value_match.group(1) if value_match else None
    value = pd.to_numeric(value_raw.replace('USD', '').replace(',', '').strip())

    # Extract coin (first all-caps word after number of coins)
    coin_match = re.search(r'(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[ ]*([A-Z]{2,10})', text)
    coin = coin_match.group(2) if coin_match else None

    # Extract 'from' and 'to' using regex
    from_match = re.search(r'transferred from (.*?) to', text)
    to_match = re.search(r'to (.*?)(?:\.|$)', text)
    from_entity = from_match.group(1).strip() if from_match else None
    to_entity = to_match.group(1).lstrip() if to_match else None
    final_records.append({
        'UTC timestamp': utc_timestamp,
        'Cryptocurrency': coin,
        'Number of Coins': num_coins,
        'Value (USD)': value,
        'From': from_entity,
        'To': to_entity
    })
final_df = pd.DataFrame(final_records)  # rename
final_df 

  utc_timestamp = pd.to_datetime(unix_ts, unit='s', utc=True)


Unnamed: 0,UTC timestamp,Cryptocurrency,Number of Coins,Value (USD),From,To
0,2024-11-03 01:00:25+00:00,BTC,990,68475464,unknown wallet,Binance
1,2024-11-03 16:00:25+00:00,BTC,738,50275703,Bitstamp,unknown wallet
2,2024-11-04 08:01:35+00:00,BTC,2000,137247624,Bybit,unknown wallet
3,2024-11-04 14:42:16+00:00,BTC,784,53625145,unknown wallet,Binance
4,2024-11-04 17:24:05+00:00,BTC,1527,103318256,unknown wallet,unknown wallet
...,...,...,...,...,...,...
3431,2025-10-11 13:35:41+00:00,BTC,522,58528931,Kraken,unknown wallet
3432,2025-10-11 14:05:41+00:00,BTC,1696,190465731,unknown wallet,OKEX
3433,2025-10-12 10:34:11+00:00,BTC,499,55784256,OKEX,unknown wallet
3434,2025-10-12 11:24:41+00:00,BTC,528,58960701,Ceffu,Binance


In [289]:
# Filter for only BTC transactions
final_df['Cryptocurrency'] == 'BTC'

# Filter for transactions where either 'From' or 'To' contains 'unknown' (case-insensitive), but not both
from_unknown = final_df['From'].str.lower().str.contains('unknown', na=False)
to_unknown = final_df['To'].str.lower().str.contains('unknown', na=False)
# Only keep rows where exactly one of 'From' or 'To' is unknown
unknown_mask = from_unknown ^ to_unknown
unknown_transactions = final_df[unknown_mask]
len(unknown_transactions)


2025

In [290]:
# sepparating exchange to unknown and unknown to exchange transactions into 2 df


exchange_to_unknown = unknown_transactions[unknown_transactions['To'].str.contains('unknown', case=False, na=False)][['Number of Coins', 'Value (USD)', 'UTC timestamp']]
exchange_to_unknown = exchange_to_unknown[['UTC timestamp', 'Value (USD)', 'Number of Coins']]

unknown_to_exchange = unknown_transactions[unknown_transactions['From'].str.contains('unknown', case=False, na=False)][['Number of Coins', 'Value (USD)', 'UTC timestamp']]
unknown_to_exchange = unknown_to_exchange[['UTC timestamp', 'Value (USD)', 'Number of Coins']]



In [291]:
exchange_to_unknown['value_change'] = exchange_to_unknown['Value (USD)']
exchange_to_unknown['coins_change'] = exchange_to_unknown['Number of Coins']

unknown_to_exchange['value_change'] = -unknown_to_exchange['Value (USD)']
unknown_to_exchange['coins_change'] = -unknown_to_exchange['Number of Coins']

In [292]:
df=pd.concat([exchange_to_unknown, unknown_to_exchange], ignore_index=True, sort=False)

df['whale_to_exchange_usd'] = -df['value_change'].where(df['value_change'] < 0, 0)
df['exchange_to_whale_usd'] = df['value_change'].where(df['value_change'] > 0, 0)
df['whale_to_exchange_coins'] = -df['coins_change'].where(df['value_change'] < 0, 0)
df['exchange_to_whale_coins'] = df['coins_change'].where(df['value_change'] > 0, 0)

In [293]:
df['datetimeday'] = pd.to_datetime(df['UTC timestamp'], errors='coerce').dt.date

temp5=pd.to_datetime(df['UTC timestamp'], errors='coerce').dt.floor('1h')
temp5=pd.to_datetime(temp5.map(lambda t: t.strftime('%Y-%m-%d %H:%M')))
df['datetime1h']=temp5

df

Unnamed: 0,UTC timestamp,Value (USD),Number of Coins,value_change,coins_change,whale_to_exchange_usd,exchange_to_whale_usd,whale_to_exchange_coins,exchange_to_whale_coins,datetimeday,datetime1h
0,2024-11-03 16:00:25+00:00,50275703,738,50275703,738,0,50275703,0,738,2024-11-03,2024-11-03 16:00:00
1,2024-11-04 08:01:35+00:00,137247624,2000,137247624,2000,0,137247624,0,2000,2024-11-04,2024-11-04 08:00:00
2,2024-11-05 23:35:55+00:00,52331577,749,52331577,749,0,52331577,0,749,2024-11-05,2024-11-05 23:00:00
3,2024-11-06 06:29:07+00:00,462970491,6163,462970491,6163,0,462970491,0,6163,2024-11-06,2024-11-06 06:00:00
4,2024-11-06 06:53:46+00:00,132073947,1768,132073947,1768,0,132073947,0,1768,2024-11-06,2024-11-06 06:00:00
...,...,...,...,...,...,...,...,...,...,...,...
2020,2025-10-10 23:50:44+00:00,91245382,800,-91245382,-800,91245382,0,800,0,2025-10-10,2025-10-10 23:00:00
2021,2025-10-11 01:01:43+00:00,135957599,1207,-135957599,-1207,135957599,0,1207,0,2025-10-11,2025-10-11 01:00:00
2022,2025-10-11 06:48:48+00:00,134787242,1200,-134787242,-1200,134787242,0,1200,0,2025-10-11,2025-10-11 06:00:00
2023,2025-10-11 10:22:41+00:00,322545127,2882,-322545127,-2882,322545127,0,2882,0,2025-10-11,2025-10-11 10:00:00


In [294]:
### Whale txn count, whale net usd, whale net usd
hourly_df = (
    df.groupby('datetime1h')
      .agg(
        whale_txn_count=('value_change', 'count'),         
        whale_net_usd=('value_change', 'sum'),            
        wtoe_usd=('exchange_to_whale_usd', 'sum'),   
        etow_usd=('whale_to_exchange_usd', 'sum'),
        wtoe_coins=('exchange_to_whale_coins', 'sum'),
        etow_coins=('whale_to_exchange_coins', 'sum')     
      )
      .reset_index()
)
hourly_df

Unnamed: 0,datetime1h,whale_txn_count,whale_net_usd,wtoe_usd,etow_usd,wtoe_coins,etow_coins
0,2024-11-03 01:00:00,1,-68475464,0,68475464,0,990
1,2024-11-03 16:00:00,1,50275703,50275703,0,738,0
2,2024-11-04 08:00:00,1,137247624,137247624,0,2000,0
3,2024-11-04 14:00:00,1,-53625145,0,53625145,0,784
4,2024-11-05 16:00:00,1,-104684670,0,104684670,0,1497
...,...,...,...,...,...,...,...
1438,2025-10-11 10:00:00,1,-322545127,0,322545127,0,2882
1439,2025-10-11 13:00:00,1,58528931,58528931,0,522,0
1440,2025-10-11 14:00:00,1,-190465731,0,190465731,0,1696
1441,2025-10-12 10:00:00,1,55784256,55784256,0,499,0


In [295]:
# Make hourly index (so rolling-by-rows works) and compute 24h features
hourly_df['datetime1h'] = pd.to_datetime(hourly_df['datetime1h'], utc=True)
hourly_df = hourly_df.set_index('datetime1h').sort_index()

In [296]:
# 24h rolling net USD 
hourly_df['whale_net_usd_24h'] = (
    hourly_df['whale_net_usd']
    .rolling(window=24, min_periods=1)
    .sum()
)


# Whale burst flag using past 24 hours (95th percentile), shifted 1h to avoid look-ahead
hourly_df['whale_burst_threshold'] = (
    hourly_df['whale_txn_count']
    .rolling(window=24, min_periods=0)
    .quantile(0.95)
    .shift(1) # shift to avoid look-ahead, means the window will be stricktly before the current hour
)

hourly_df['whale_burst_flag'] = (
    (hourly_df['whale_txn_count'] >= hourly_df['whale_burst_threshold'])
    .astype('Int64')   # will be <NA> for early hours without enough history
)
hourly_df = hourly_df[hourly_df.index >= '2024-11-04 00:00:00']

hourly_df.drop(columns=['whale_burst_threshold'], inplace=True)


### log and power transformation
for skewness > 1:

if positively skewed: log

if there are negative values: signed log

if percentage 0-100: use power transform 

check skewness, if skew still > 1 after log:
power transform those features

In [297]:
hourly_df

Unnamed: 0_level_0,whale_txn_count,whale_net_usd,wtoe_usd,etow_usd,wtoe_coins,etow_coins,whale_net_usd_24h,whale_burst_flag
datetime1h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-11-04 08:00:00+00:00,1,137247624,137247624,0,2000,0,119047863.0,1
2024-11-04 14:00:00+00:00,1,-53625145,0,53625145,0,784,65422718.0,1
2024-11-05 16:00:00+00:00,1,-104684670,0,104684670,0,1497,-39261952.0,1
2024-11-05 18:00:00+00:00,1,-81636854,0,81636854,0,1170,-120898806.0,1
2024-11-05 23:00:00+00:00,1,52331577,52331577,0,749,0,-68567229.0,1
...,...,...,...,...,...,...,...,...
2025-10-11 10:00:00+00:00,1,-322545127,0,322545127,0,2882,587325102.0,0
2025-10-11 13:00:00+00:00,1,58528931,58528931,0,522,0,871759502.0,0
2025-10-11 14:00:00+00:00,1,-190465731,0,190465731,0,1696,462266889.0,0
2025-10-12 10:00:00+00:00,1,55784256,55784256,0,499,0,579399936.0,0


In [298]:
from scipy.stats import skew
merged_log = hourly_df.copy()
# Select numeric columns
numeric_cols = merged_log.select_dtypes(include=[np.number]).columns

# Compute skewness
skew_vals = merged_log[numeric_cols].apply(lambda x: skew(x.dropna()))
skew_df = pd.DataFrame({"feature": skew_vals.index, "skew": skew_vals.values})
skew_df = skew_df.sort_values(by="skew", ascending=False)

print(skew_df)

             feature       skew
4         wtoe_coins  37.920980
3           etow_usd  14.649914
5         etow_coins  12.013834
2           wtoe_usd   4.576344
0    whale_txn_count   3.273996
7   whale_burst_flag   1.692923
1      whale_net_usd   0.584451
6  whale_net_usd_24h   0.391837


In [299]:
## logging values so that it is not skewed by large transactions
def signed_log1p(x):
    x = np.asarray(x, dtype=float)
    return np.sign(x) * np.log1p(np.abs(x))


In [300]:
# log transform certain features
log_features = ['etow_usd', 'etow_coins','whale_txn_count', 'wtoe_usd', 'wtoe_coins'] # we ignore whale_burst_flag since it is binary

for col in log_features:
    merged_log[f'{col}_log'] = signed_log1p(merged_log[col])
    merged_log = merged_log.drop(columns=[col])


merged_log


Unnamed: 0_level_0,whale_net_usd,whale_net_usd_24h,whale_burst_flag,etow_usd_log,etow_coins_log,whale_txn_count_log,wtoe_usd_log,wtoe_coins_log
datetime1h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-11-04 08:00:00+00:00,137247624,119047863.0,1,0.000000,0.000000,0.693147,18.737297,7.601402
2024-11-04 14:00:00+00:00,-53625145,65422718.0,1,17.797529,6.665684,0.693147,0.000000,0.000000
2024-11-05 16:00:00+00:00,-104684670,-39261952.0,1,18.466463,7.311886,0.693147,0.000000,0.000000
2024-11-05 18:00:00+00:00,-81636854,-120898806.0,1,18.217791,7.065613,0.693147,0.000000,0.000000
2024-11-05 23:00:00+00:00,52331577,-68567229.0,1,0.000000,0.000000,0.693147,17.773111,6.620073
...,...,...,...,...,...,...,...,...
2025-10-11 10:00:00+00:00,-322545127,587325102.0,0,19.591754,7.966587,0.693147,0.000000,0.000000
2025-10-11 13:00:00+00:00,58528931,871759502.0,0,0.000000,0.000000,0.693147,17.885032,6.259581
2025-10-11 14:00:00+00:00,-190465731,462266889.0,0,19.064983,7.436617,0.693147,0.000000,0.000000
2025-10-12 10:00:00+00:00,55784256,579399936.0,0,0.000000,0.000000,0.693147,17.837002,6.214608


In [301]:
# Compute skewness
numeric_cols = merged_log.select_dtypes(include=[np.number]).columns
skew_vals = merged_log[numeric_cols].apply(lambda x: skew(x.dropna()))
skew_df = pd.DataFrame({"feature": skew_vals.index, "skew": skew_vals.values})
skew_df = skew_df.sort_values(by="skew", ascending=False)

print(skew_df)

               feature      skew
5  whale_txn_count_log  1.980398
2     whale_burst_flag  1.692923
0        whale_net_usd  0.584451
1    whale_net_usd_24h  0.391837
4       etow_coins_log  0.123487
3         etow_usd_log  0.077929
7       wtoe_coins_log -0.167390
6         wtoe_usd_log -0.518684


In [302]:
from sklearn.preprocessing import PowerTransformer
still_skew_col = [
    "whale_txn_count_log",
]

pt = PowerTransformer(method='yeo-johnson')
merged_log[still_skew_col] = pt.fit_transform(merged_log[still_skew_col])

In [303]:
# Compute skewness
numeric_cols = merged_log.select_dtypes(include=[np.number]).columns
skew_vals = merged_log[numeric_cols].apply(lambda x: skew(x.dropna()))
skew_df = pd.DataFrame({"feature": skew_vals.index, "skew": skew_vals.values})
skew_df = skew_df.sort_values(by="skew", ascending=False)
print(skew_df)

               feature      skew
2     whale_burst_flag  1.692923
5  whale_txn_count_log  1.046098
0        whale_net_usd  0.584451
1    whale_net_usd_24h  0.391837
4       etow_coins_log  0.123487
3         etow_usd_log  0.077929
7       wtoe_coins_log -0.167390
6         wtoe_usd_log -0.518684


In [304]:
merged_log.head()

Unnamed: 0_level_0,whale_net_usd,whale_net_usd_24h,whale_burst_flag,etow_usd_log,etow_coins_log,whale_txn_count_log,wtoe_usd_log,wtoe_coins_log
datetime1h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-11-04 08:00:00+00:00,137247624,119047863.0,1,0.0,0.0,-0.606516,18.737297,7.601402
2024-11-04 14:00:00+00:00,-53625145,65422718.0,1,17.797529,6.665684,-0.606516,0.0,0.0
2024-11-05 16:00:00+00:00,-104684670,-39261952.0,1,18.466463,7.311886,-0.606516,0.0,0.0
2024-11-05 18:00:00+00:00,-81636854,-120898806.0,1,18.217791,7.065613,-0.606516,0.0,0.0
2024-11-05 23:00:00+00:00,52331577,-68567229.0,1,0.0,0.0,-0.606516,17.773111,6.620073


In [305]:
merged_log.fillna(0, inplace=True)
merged_log.columns

Index(['whale_net_usd', 'whale_net_usd_24h', 'whale_burst_flag',
       'etow_usd_log', 'etow_coins_log', 'whale_txn_count_log', 'wtoe_usd_log',
       'wtoe_coins_log'],
      dtype='object')

In [306]:
merged_log = merged_log.reset_index(drop=False)
merged_log 

Unnamed: 0,datetime1h,whale_net_usd,whale_net_usd_24h,whale_burst_flag,etow_usd_log,etow_coins_log,whale_txn_count_log,wtoe_usd_log,wtoe_coins_log
0,2024-11-04 08:00:00+00:00,137247624,119047863.0,1,0.000000,0.000000,-0.606516,18.737297,7.601402
1,2024-11-04 14:00:00+00:00,-53625145,65422718.0,1,17.797529,6.665684,-0.606516,0.000000,0.000000
2,2024-11-05 16:00:00+00:00,-104684670,-39261952.0,1,18.466463,7.311886,-0.606516,0.000000,0.000000
3,2024-11-05 18:00:00+00:00,-81636854,-120898806.0,1,18.217791,7.065613,-0.606516,0.000000,0.000000
4,2024-11-05 23:00:00+00:00,52331577,-68567229.0,1,0.000000,0.000000,-0.606516,17.773111,6.620073
...,...,...,...,...,...,...,...,...,...
1436,2025-10-11 10:00:00+00:00,-322545127,587325102.0,0,19.591754,7.966587,-0.606516,0.000000,0.000000
1437,2025-10-11 13:00:00+00:00,58528931,871759502.0,0,0.000000,0.000000,-0.606516,17.885032,6.259581
1438,2025-10-11 14:00:00+00:00,-190465731,462266889.0,0,19.064983,7.436617,-0.606516,0.000000,0.000000
1439,2025-10-12 10:00:00+00:00,55784256,579399936.0,0,0.000000,0.000000,-0.606516,17.837002,6.214608


In [307]:
merged_log.to_csv('../Data/whale_alert_btc.csv', index=False)