In [1]:
import numpy as np
import pandas as pd

# Data Cleaning: Crypto Crimes on the Elliptic Dataset

## Read data 

In [2]:
# Read the three dataframes: transaction classes, the edge list, and the features.
df_classes = pd.read_csv("data/elliptic_txs_classes.csv")
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")

# The features file is read without a header row. However, the final cell of this
# notebook writes the renamed frame back to this same path *with* a header row, so
# on a second run a plain `header=None` read would parse the column names as a data
# row and turn every column into strings. Peek at the first cell to decide: a string
# there means the header row has already been written.
features_path = "data/elliptic_txs_features.csv"
first_cell = pd.read_csv(features_path, header=None, nrows=1).iloc[0, 0]
df_features = pd.read_csv(
    features_path,
    header=0 if isinstance(first_cell, str) else None,
)

In [3]:
# Classes dataframe: print its column names, non-null counts, dtypes and memory use
df_classes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203769 entries, 0 to 203768
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   txId    203769 non-null  int64 
 1   class   203769 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.1+ MB


In [5]:
# Edges from starting transaction to destination transaction:
# print the edge list's column names, non-null counts, dtypes and memory use
df_edges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234355 entries, 0 to 234354
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   txId1   234355 non-null  int64
 1   txId2   234355 non-null  int64
dtypes: int64(2)
memory usage: 3.6 MB


In [6]:
# Features dataframe: print a summary of the frame as loaded.
# It was read with header=None, so its columns have not been given names yet.
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203769 entries, 0 to 203768
Columns: 167 entries, 0 to 166
dtypes: float64(165), int64(2)
memory usage: 259.6 MB


## Format Data

Our objective here isn't heavy formatting or feature engineering. For now we only apply the light cleaning that makes EDA easier.

The heavier formatting and cleaning can be applied later. 

### Rename columns in the features dataframe

In [43]:
# Build the position -> name mapping for the features columns:
#   column 0        -> txId
#   column 1        -> time_step
#   columns 2..94   -> local_feature_1 .. local_feature_93
#   columns 95..166 -> aggregate_feature_1 .. aggregate_feature_72
# The three partial mappings are keyed by the column position as a string;
# the merged mapping is re-keyed with integers in the final step.
colNames1 = dict(zip(('0', '1'), ('txId', 'time_step')))
colNames2 = {str(pos): f"local_feature_{pos - 1}" for pos in range(2, 95)}
colNames3 = {str(pos): f"aggregate_feature_{pos - 94}" for pos in range(95, 167)}

colNames = {}
for partial in (colNames1, colNames2, colNames3):
    for pos, name in partial.items():
        colNames[int(pos)] = name

In [44]:
# Apply the position -> name mapping to the column axis; labels that are not
# keys of the mapping are left unchanged by rename.
df_features = df_features.rename(colNames, axis="columns")
# Preview the first five rows with the new column names.
df_features.head(5)

Unnamed: 0,txId,Time step,local_feature_1,local_feature_2,local_feature_3,local_feature_4,local_feature_5,local_feature_6,local_feature_7,local_feature_8,...,aggregate_feature_63,aggregate_feature_64,aggregate_feature_65,aggregate_feature_66,aggregate_feature_67,aggregate_feature_68,aggregate_feature_69,aggregate_feature_70,aggregate_feature_71,aggregate_feature_72
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


### Rename the classes column

In [59]:
# This is for EDA only, will change later.
# Translate the raw class codes into readable labels: '2' -> licit, '1' -> illicit.
# `replace` leaves every value that is not a key untouched, so 'unknown' passes
# through and the cell is safe to re-run. A dict-based `map` would turn
# already-translated labels ('licit' / 'illicit') into NaN, which is what happens
# when the classes file overwritten by the save cell is loaded again.
class_labels = {'2': 'licit', '1': 'illicit'}
df_classes['class'] = df_classes['class'].replace(class_labels)

### Save new csvs

In [61]:
# Persist both cleaned frames as CSV without the row index.
# NOTE(review): these are the same paths the notebook reads from in its first
# data cell, so the raw input files are overwritten in place. Consider writing
# to separate output files so the notebook can be re-run from the raw data.
for frame, csv_path in (
    (df_classes, "data/elliptic_txs_classes.csv"),
    (df_features, "data/elliptic_txs_features.csv"),
):
    frame.to_csv(csv_path, index=False)