In [26]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [31]:
df = pd.read_csv('data2.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])


In [32]:
features = df[['wallet_id', 'timestamp', 'amount', 'counterparty_wallet', 'mixing_service_used']]
features

Unnamed: 0,wallet_id,timestamp,amount,counterparty_wallet,mixing_service_used
0,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,2023-01-01 08:00:00,1.5,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,0
1,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01 09:15:00,2.0,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,1
2,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2023-01-01 10:30:00,1.8,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,0
3,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01 12:00:00,3.2,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,0
4,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,2023-01-01 14:45:00,2.3,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,1
5,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01 16:30:00,1.6,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,0
6,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2023-01-01 18:15:00,2.5,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,1
7,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01 20:00:00,2.8,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,0
8,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,2023-01-01 21:45:00,1.9,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,0
9,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01 23:30:00,2.2,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,1


In [34]:
## Wallet-Level Aggregation
wallet_aggregated = features.groupby('wallet_id').agg({
    'timestamp': 'count',
    'amount': 'sum',
    'counterparty_wallet': pd.Series.nunique,
    'mixing_service_used': 'mean'  # Assuming mixing_service_used is binary (0 or 1)
}).reset_index()


wallet_aggregated

Unnamed: 0,wallet_id,timestamp,amount,counterparty_wallet,mixing_service_used
0,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2,4.3,1,0.5
1,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,3,5.7,1,0.333333
2,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,5,11.8,2,0.4


In [43]:
time_aggregated = features.groupby(['wallet_id', pd.Grouper(key='timestamp', freq='D')]).agg({
    'amount': 'sum',
}).reset_index()


In [46]:
aggregated_results = pd.merge(wallet_aggregated, time_aggregated, on='wallet_id', how='left', suffixes=('_total', '_daily'))
aggregated_results

Unnamed: 0,wallet_id,timestamp_total,amount_total,counterparty_wallet,mixing_service_used,timestamp_daily,amount_daily
0,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2,4.3,1,0.5,2023-01-01,4.3
1,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,3,5.7,1,0.333333,2023-01-01,5.7
2,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,5,11.8,2,0.4,2023-01-01,11.8


In [52]:
non_numeric_columns = ['wallet_id', 'timestamp_daily']
non_numeric_data = aggregated_results[non_numeric_columns]
non_numeric_data

Unnamed: 0,wallet_id,timestamp_daily
0,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2023-01-01
1,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,2023-01-01
2,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01


In [55]:
numeric_data = aggregated_results.drop(columns=non_numeric_columns)
numeric_data

Unnamed: 0,timestamp_total,amount_total,counterparty_wallet,mixing_service_used,amount_daily
0,2,4.3,1,0.5,4.3
1,3,5.7,1,0.333333,5.7
2,5,11.8,2,0.4,11.8


In [57]:
scaler = StandardScaler()
scaled_features_numeric = scaler.fit_transform(numeric_data)

In [60]:
scaled_features = pd.concat([non_numeric_data, pd.DataFrame(scaled_features_numeric, columns=numeric_data.columns)], axis=1)
scaled_features

Unnamed: 0,wallet_id,timestamp_daily,timestamp_total,amount_total,counterparty_wallet,mixing_service_used,amount_daily
0,1FvEiJoxPAAwbHqvAeCB1L2b8vPwotCiR,2023-01-01,-1.069045,-0.911109,-0.707107,1.297771,-0.911109
1,1HLoD9E4SDFFPDiYfNYnkBLQ85Y51J3Zb1,2023-01-01,-0.267261,-0.481147,-0.707107,-1.13555,-0.481147
2,3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64,2023-01-01,1.336306,1.392256,1.414214,-0.162221,1.392256


In [63]:
silhouette_avg = silhouette_score(scaled_features)
print(f"Silhouette Score for Clustering: {silhouette_avg}")

TypeError: missing a required argument: 'labels'