In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Read the raw feature table. header=None makes pandas treat the first line as
# data, and on_bad_lines='skip' drops any row that cannot be parsed.
features_path = "elliptic_txs_features.csv"
features = pd.read_csv(
    features_path,
    header=None,
    engine='python',
    on_bad_lines='skip',
)
n_feature_rows = features.shape[0]
print("Loaded rows:", n_feature_rows)


Loaded rows: 203769


In [None]:
# Label the columns: transaction id, time step, then the numbered features f_1..f_165.
numbered_feature_names = []
for feature_idx in range(1, 166):
    numbered_feature_names.append(f'f_{feature_idx}')
features.columns = ['tx_id', 'time_step', *numbered_feature_names]


In [None]:
# Sanity check for on_bad_lines='skip': compare the number of physical lines in
# the CSV with the number of rows pandas kept. Equal counts mean no row was
# dropped. Lines are counted by streaming over the file handle, so the file is
# never materialised in memory as a list of strings.
with open("elliptic_txs_features.csv", 'r') as f:
    total_lines = sum(1 for _ in f)

print("Total lines in file:", total_lines)
print("Rows loaded by pandas:", features.shape[0])  # should match the line count above


Total lines in file: 203769
Rows loaded by pandas: 203769


In [None]:
# Preview the first five rows of the feature table.
features.head(n=5)

Unnamed: 0,tx_id,time_step,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,...,f_156,f_157,f_158,f_159,f_160,f_161,f_162,f_163,f_164,f_165
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [None]:
# The classes file begins with a "txId,class" header line. header=0 consumes
# that line as the header instead of loading it as a bogus first data row
# (with header=None the frame had one extra row whose values were the literal
# strings "txId" and "class").
classes = pd.read_csv("/content/elliptic_txs_classes.csv", header=0, on_bad_lines='skip', engine='python')
print("Loaded rows:", classes.shape[0])


Loaded rows: 203770


In [None]:
# Preview the first five rows of the class-label table.
classes.head(n=5)

Unnamed: 0,0,1
0,txId,class
1,230425980,unknown
2,5530458,unknown
3,232022460,unknown
4,232438397,2


In [None]:
# Give the two columns the names the rest of the notebook uses for joining and filtering.
classes.columns = pd.Index(['tx_id', 'class'])

In [None]:
# Cast the join key to str in both frames so the merge compares values of the same type.
for frame in (features, classes):
    frame['tx_id'] = frame['tx_id'].astype(str)

In [None]:
# Left join: keep every feature row and attach its label by tx_id.
merged_df = features.merge(classes, how='left', on='tx_id')

In [None]:
# Preview the first five rows of the merged features + labels frame.
merged_df.head(n=5)

Unnamed: 0,tx_id,time_step,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,...,f_157,f_158,f_159,f_160,f_161,f_162,f_163,f_164,f_165,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,unknown
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,unknown
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,unknown
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,unknown


In [None]:
# (rows, columns) of the merged frame; the row count should equal the number of
# feature rows if the left join did not duplicate any tx_id.
merged_df.shape

(203769, 168)

In [None]:
# Number of rows per time_step, most frequent first.
merged_df['time_step'].value_counts()

Unnamed: 0_level_0,count
time_step,Unnamed: 1_level_1
1,7880
42,7140
5,6803
10,6727
3,6621
36,6393
7,6048
22,5894
4,5693
45,5598


In [None]:
# Label distribution: how many rows are 'unknown' versus labelled '1' or '2'.
merged_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
unknown,157205
2,42019
1,4545


In [None]:
# Step 1: Standard scaling of the 165 numbered feature columns.
scaler = StandardScaler()
feature_cols = [f'f_{i}' for i in range(1, 166)]
# Selecting a column list already yields a new frame, and fillna returns
# another, so merged_df is never modified and no inplace mutation is needed.
X = merged_df[feature_cols].fillna(0)
X_scaled = scaler.fit_transform(X)

# Step 2: Full PCA to find the smallest number of components whose cumulative
# explained variance reaches the target fraction.
VARIANCE_TARGET = 0.95
pca_full = PCA().fit(X_scaled)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
# argmax on a boolean array gives the index of the first True; +1 turns the
# zero-based index into a component count. int() yields a plain Python int.
ideal_components = int(np.argmax(cumulative >= VARIANCE_TARGET)) + 1
print(f"Ideal number of components to retain {VARIANCE_TARGET:.0%} variance: {ideal_components}")

# Step 3: Apply PCA using the chosen number of components.
# random_state pins the result when sklearn's automatic solver choice falls
# back to a randomized SVD, keeping the projection reproducible across runs.
pca = PCA(n_components=ideal_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Optional: inspect the explained variance of the retained components.
print(f"Top 10 individual variances: {pca.explained_variance_ratio_[:10]}")
print(f"Cumulative variance (first 20): {np.cumsum(pca.explained_variance_ratio_)[:20]}")
print(f"Total variance retained: {np.sum(pca.explained_variance_ratio_):.4f}")


Ideal number of components to retain 95% variance: 59
Top 10 individual variances: [0.10482862 0.06091381 0.05503429 0.04571852 0.0405349  0.03910661
 0.03511996 0.0282828  0.02709893 0.02642217]
Cumulative variance (first 20): [0.10482862 0.16574242 0.22077672 0.26649523 0.30703013 0.34613674
 0.3812567  0.4095395  0.43663843 0.4630606  0.48775051 0.50949631
 0.53018332 0.55034896 0.56801004 0.58523879 0.60188189 0.61749937
 0.63280787 0.64720703]
Total variance retained: 0.9524


In [None]:
# Refit the reduced PCA with the component count computed earlier rather than
# a hard-coded 59, so this cell cannot drift out of sync with the variance
# analysis. random_state keeps the fit reproducible if a randomized solver is used.
pca = PCA(n_components=ideal_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# One labelled column per retained principal component. The count is read from
# X_pca itself so the labels always match the array width; a hard-coded number
# would raise a shape mismatch if the component count ever changed.
pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
pca_df.head()


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC50,PC51,PC52,PC53,PC54,PC55,PC56,PC57,PC58,PC59
0,-0.901864,-3.756163,-1.734837,0.545077,0.115159,1.761448,-4.16668,0.339828,-0.271514,1.412007,...,1.928295,0.479539,-0.238521,0.104439,0.079943,0.35744,0.409291,0.249455,-0.055423,-1.041955
1,-0.804055,-3.557472,-2.176768,-2.087779,-0.254481,-1.156366,2.891613,-0.195913,0.206715,1.440352,...,-0.390808,-0.390893,0.138491,-0.134847,0.128413,-0.003742,-0.309139,-0.04298,-0.057803,-0.032952
2,-1.269576,-4.377144,-2.124023,-0.783335,0.342497,-5.481981,0.675491,-0.37086,0.331038,0.728002,...,-0.122269,0.188627,0.197744,-0.311393,-0.007686,-1.160918,1.463536,-0.510027,0.457832,1.115936
3,2.168197,-3.787118,-0.370527,4.168413,-0.483941,-1.295672,-1.309844,6.881549,8.999808,-4.067352,...,4.69455,0.35914,-0.556381,-0.511249,-1.320841,-0.139057,-0.15552,0.720744,-2.193073,0.543651
4,-1.489209,-4.356148,-0.399558,-1.344298,1.256397,-0.712556,-0.975835,2.512395,2.38401,-0.803795,...,-0.875862,-0.635899,-0.957029,0.073865,0.435096,-0.053528,0.083915,-0.262133,0.105493,-1.002839


In [None]:
# Confirm the projected matrix dimensions: (rows, retained components).
print(X_pca.shape)


(203769, 59)


In [None]:
# Re-attach the identifier, time step and label to the PCA frame. Using .values
# drops the source index, so assignment is purely positional; this relies on
# pca_df rows being in the same order as merged_df.
for carried_col in ('tx_id', 'time_step', 'class'):
    pca_df[carried_col] = merged_df[carried_col].values


In [None]:
# (rows, columns) of the PCA frame after adding tx_id, time_step and class.
pca_df.shape

(203769, 62)

In [None]:
# Split the dataset by label: rows labelled '1' or '2' are "known",
# rows labelled 'unknown' are kept separately. Each part is an independent copy.
labelled_mask = pca_df['class'].isin(['1', '2'])
unlabelled_mask = pca_df['class'] == 'unknown'
known_df = pca_df.loc[labelled_mask].copy()
unknown_df = pca_df.loc[unlabelled_mask].copy()

In [None]:
# Names of the principal-component columns. The count comes from the width of
# X_pca rather than a hard-coded 59, so the selection stays valid if the number
# of retained components changes.
pc_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]

# Model inputs: principal components only (tx_id, time_step and class are excluded).
X_known = known_df[pc_cols]
X_unknown = unknown_df[pc_cols]


In [None]:
from sklearn.ensemble import IsolationForest

# Fit an isolation forest on the labelled rows only; the fixed seed keeps the
# fitted trees reproducible between runs.
iso_forest = IsolationForest(
    random_state=42,
    contamination='auto',
    n_estimators=100,
)
iso_forest.fit(X_known)


In [None]:
# Score the unlabelled rows: predict() yields -1 for anomalies and 1 for normal points.
unknown_preds = iso_forest.predict(X_unknown)

# Store the prediction next to each unlabelled row.
unknown_df = unknown_df.assign(anomaly=unknown_preds)


In [None]:
# Tally the predictions on the unlabelled rows (-1 = anomaly, 1 = normal).
unknown_df['anomaly'].value_counts()


Unnamed: 0_level_0,count
anomaly,Unnamed: 1_level_1
1,156361
-1,844
