In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# K-means anomaly flagging.
# Rows are clustered on standardized numeric features plus one-hot encoded
# block-number quantile bins; the smaller of the clusters is flagged as
# anomalous and the labelled table is written back to disk.
file_path = r"D:\sai\new_nodes_with_creation_gas.txt"
output_path = r"D:\sai\kmeans_anomaly_binned_block.txt"

NUMERIC_FEATURES = ['timestamp', 'gasUsed', 'gasLimit']
REQUIRED_COLUMNS = NUMERIC_FEATURES + ['blockNumber']
N_BLOCK_BINS = 20   # quantile bins requested for blockNumber
N_CLUSTERS = 2
SAMPLE_ROWS = 50    # rows printed at the end of the cell

df = pd.read_csv(file_path, sep='\t')

# Coerce the model inputs to numbers; unparseable values become NaN.
for column in REQUIRED_COLUMNS:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop only rows that are missing a model input. A bare dropna() would also
# discard rows whose NaN sits in a column the model never looks at.
df = df.dropna(subset=REQUIRED_COLUMNS)

# Quantile-bin the block number; duplicates='drop' merges bins whose edges
# coincide, so fewer than N_BLOCK_BINS bins may be produced.
df['block_bin'] = pd.qcut(df['blockNumber'], q=N_BLOCK_BINS, duplicates='drop')
block_encoded = pd.get_dummies(df['block_bin'], prefix='block')

# Standardize the numeric features into a fresh frame (the one-hot columns are
# left as 0/1), keeping df's index so the concat aligns row for row.
scaler = StandardScaler()
scaled_numeric = pd.DataFrame(
    scaler.fit_transform(df[NUMERIC_FEATURES]),
    columns=NUMERIC_FEATURES,
    index=df.index,
)
X_combined = pd.concat([scaled_numeric, block_encoded], axis=1)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init='auto')
labels = kmeans.fit_predict(X_combined)

# The smallest cluster is treated as the anomalous one. minlength guarantees
# one count per cluster: if every row received the same label, the empty
# cluster wins the argmin and no row is flagged, instead of all of them.
cluster_sizes = np.bincount(labels, minlength=N_CLUSTERS)
df['anomaly'] = (labels == np.argmin(cluster_sizes)).astype(int)

print(f"Cluster 0: {(df['anomaly'] == 0).sum()} nodes (normal)")
print(f"Cluster 1: {(df['anomaly'] == 1).sum()} nodes (anomalous)")

df.to_csv(output_path, sep='\t', index=False)
print(f"\nSaved clustered data with anomalies to:\n{output_path}")

print("\nSample rows:")
print(df.head(SAMPLE_ROWS))


Cluster 0: 2933505 nodes (normal)
Cluster 1: 12313 nodes (anomalous)

Saved clustered data with anomalies to:
D:\sai\kmeans_anomaly_binned_block.txt

Sample rows:
    to_node   timestamp  blockNumber  from_address  gasUsed  gasLimit  \
0        14  1438919491        46220            13    21000     21000   
1        35  1438920178        46261            21    21000     21000   
2        49  1438920756        46295            16    21000     21000   
3        50  1438920830        46300            16    21000     21000   
4        79  1438922335        46390            78    21000     21000   
5       140  1438927371        46682            36    21000     21001   
6       163  1438929366        46804            68    21000     21000   
7       201  1438931097        46893           117    21000     21000   
8       204  1438931480        46913           117    21000     21000   
9       232  1438933203        47015           117    21000     21000   
10      241  1438933636        470

In [5]:
pip install hdbscan

Collecting hdbscan
  Obtaining dependency information for hdbscan from https://files.pythonhosted.org/packages/64/b1/96c347c7740efa1ac803be64155159284f92fafcff88c1077344e64eead5/hdbscan-0.8.40-cp311-cp311-win_amd64.whl.metadata
  Downloading hdbscan-0.8.40-cp311-cp311-win_amd64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp311-cp311-win_amd64.whl (732 kB)
   ---------------------------------------- 0.0/732.2 kB ? eta -:--:--
    --------------------------------------- 10.2/732.2 kB ? eta -:--:--
    --------------------------------------- 10.2/732.2 kB ? eta -:--:--
   - ------------------------------------- 30.7/732.2 kB 220.2 kB/s eta 0:00:04
   - ------------------------------------- 30.7/732.2 kB 220.2 kB/s eta 0:00:04
   ----- -------------------------------- 112.6/732.2 kB 504.4 kB/s eta 0:00:02
   ----- -------------------------------- 112.6/732.2 kB 504.4 kB/s eta 0:00:02
   -------------- ----------------------- 276.5/732.2 kB 853.3 kB/s eta 0:00:01
   ----------------- --

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Z-score outlier detection on the node_count column.
# A row is marked anomalous when its |z-score| exceeds ZSCORE_THRESHOLD; the
# annotated table is then written to output_path.
file_path = r"D:\sai\created_nouse_account\timestamp_node_count.txt"
output_path = r"D:\sai\zscore_outliers_marked.txt"
ZSCORE_THRESHOLD = 3

df = pd.read_csv(file_path, sep='\t')

# With scipy's default nan_policy='propagate', one missing node_count turns
# every z-score into NaN and the cell would silently report zero outliers.
# 'omit' computes mean/std over the present values; missing rows keep a NaN
# z-score and are therefore never flagged.
df['zscore'] = zscore(df['node_count'].to_numpy(dtype=float), nan_policy='omit')

df['anomaly'] = (df['zscore'].abs() > ZSCORE_THRESHOLD).astype(int)

outlier_count = df['anomaly'].sum()
total_count = len(df)

# Sum of node_count over the rows flagged as outliers.
nodes_in_outliers = df.loc[df['anomaly'] == 1, 'node_count'].sum()

print(f"Outliers detected (|zscore| > {ZSCORE_THRESHOLD}): {outlier_count} out of {total_count} rows")
print(f"Total number of nodes in outlier timestamps: {nodes_in_outliers}")

df.to_csv(output_path, sep='\t', index=False)
print(f"Z-score results saved to: {output_path}")


Outliers detected (|zscore| > 3): 30 out of 1244 rows
Total number of nodes in outlier timestamps: 489536
Z-score results saved to: D:\sai\zscore_outliers_marked.txt


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Plot per-day row counts from the K-means output file: rows with anomaly == 0
# as a blue line, rows with anomaly == 1 as red markers.
file_path = r"D:\sai\kmeans_anomaly_binned_block.txt"
df = pd.read_csv(file_path, sep='\t')

# Timestamps are read as seconds since the epoch; keep the calendar date
# of each row as the grouping key.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['day'] = df['timestamp'].dt.date

# One row per (day, anomaly flag) with the number of records in that group.
daily_counts = (
    df.groupby(['day', 'anomaly'])
    .size()
    .reset_index(name='node_count')
)

# Split by flag and turn the date objects back into datetimes for plotting.
normal_df = daily_counts.loc[daily_counts['anomaly'].eq(0)].assign(
    day=lambda frame: pd.to_datetime(frame['day'])
)
anomaly_df = daily_counts.loc[daily_counts['anomaly'].eq(1)].assign(
    day=lambda frame: pd.to_datetime(frame['day'])
)

normal_trace = go.Scatter(
    x=normal_df['day'],
    y=normal_df['node_count'],
    mode='lines',
    name='Normal',
    line=dict(color='blue'),
)
anomaly_trace = go.Scatter(
    x=anomaly_df['day'],
    y=anomaly_df['node_count'],
    mode='markers',
    name='Anomalous',
    marker=dict(color='red', size=2, symbol='circle'),
)

# Traces are drawn in list order: the normal line first, anomalies on top.
fig = go.Figure(data=[normal_trace, anomaly_trace])

# Quick-zoom buttons shown above the x axis.
range_buttons = [
    dict(count=7, label="1w", step="day", stepmode="backward"),
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(step="all"),
]

fig.update_layout(
    title='Daily Node Creation: Normal vs Anomalous',
    xaxis=dict(
        title='Date',
        rangeslider=dict(visible=True),
        rangeselector=dict(buttons=range_buttons),
    ),
    yaxis_title='Node Count',
    template="plotly_white",
    hovermode="x unified",
    dragmode="zoom",
)

fig.show()


In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.stats import zscore

# Plot node_count over time and mark the rows whose |z-score| exceeds
# ZSCORE_THRESHOLD with red triangles.
file_path = r"D:\sai\created_nouse_account\timestamp_node_count.txt"
ZSCORE_THRESHOLD = 3

df = pd.read_csv(file_path, sep='\t')

df['day'] = pd.to_datetime(df['day'])

# Sort chronologically so the line trace is drawn left to right.
df = df.sort_values('day')

# With scipy's default nan_policy='propagate', one missing node_count turns
# every z-score into NaN and no anomaly would ever be drawn. 'omit' computes
# mean/std over the present values; missing rows keep a NaN z-score and are
# never flagged. The ndarray result is assigned positionally, which matches
# the already-sorted row order.
df['zscore'] = zscore(df['node_count'].to_numpy(dtype=float), nan_policy='omit')
df['anomaly'] = (df['zscore'].abs() > ZSCORE_THRESHOLD).astype(int)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['day'],
    y=df['node_count'],
    mode='lines',
    name='Node Count',
    line=dict(color='blue')
))

anomalies = df[df['anomaly'] == 1]

# hoverinfo='text' makes the hover label show only the fixed "Anomaly" text.
fig.add_trace(go.Scatter(
    x=anomalies['day'],
    y=anomalies['node_count'],
    mode='markers',
    name='Anomaly',
    marker=dict(color='red', size=5, symbol='triangle-up'),
    hovertext=["Anomaly" for _ in anomalies['day']],
    hoverinfo='text'
))

fig.update_layout(
    title='Node Count with Anomaly (Red)',
    xaxis=dict(
        title='Date',
        rangeslider=dict(visible=True),
        rangeselector=dict(
            buttons=[
                dict(count=7, label="1w", step="day", stepmode="backward"),
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(step="all")
            ]
        )
    ),
    yaxis_title='Node Count',
    template='plotly_white',
    hovermode='x unified',
    dragmode='zoom'
)

fig.show()
