#  Anomaly Detection in a Blockchain - Preprocessing

## Step 1: Importing Libraries and Datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Raw_Blockchain.csv')

In [3]:
df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.500000,1,0,1,1.780427e+08,white


## Step 2: Handling Missing Values

Check for missing values in your dataset and decide how to handle them. You can choose to remove rows with missing values, impute missing values with a specific strategy (e.g., mean, median, mode), or consider more advanced imputation methods.

In [4]:
# Count the missing (NaN/None) entries in every column
missing_values = df.isnull().sum()
missing_values

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

In [5]:
# Show only the columns that actually contain missing values, with their counts
print(missing_values.loc[lambda counts: counts > 0])

Series([], dtype: int64)


In [6]:
# Drop any rows that contain missing values. The check above found none, so this
# acts as a safeguard. Rebinding `df` (rather than inplace=True) avoids mutating
# the object that earlier cells displayed.
df = df.dropna()

## Step 3: Data Cleaning

Perform data cleaning tasks such as removing duplicates, correcting inconsistent data, and standardizing data formats. For example, you might want to ensure consistency in date formats, text capitalization, and categorical values.

In [7]:
# Count rows that are exact repeats of an earlier row
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [8]:
# Drop fully duplicated rows, keeping the first occurrence of each.
# Rebinding `df` (rather than inplace=True) avoids mutating the object that
# earlier cells displayed.
df = df.drop_duplicates()

In [9]:
# Keep only the first column for each column name.
# This compares column *names* only; columns with identical values but
# different names are not removed.
df = df.loc[:, lambda frame: ~frame.columns.duplicated()]

In [10]:
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


## Step 4: Data Statistics

Perform data quality checks to verify that the dataset is clean and that its data types, label distribution, and value ranges look as expected:

In [11]:
# Check data types of columns
data_types = df.dtypes
print(data_types)

address       object
year           int64
day            int64
length         int64
weight       float64
count          int64
looped         int64
neighbors      int64
income       float64
label         object
dtype: object


In [12]:
df["label"].value_counts()

white                          2875284
paduaCryptoWall                  12390
montrealCryptoLocker              9315
princetonCerber                   9223
princetonLocky                    6625
montrealCryptXXX                  2419
montrealNoobCrypt                  483
montrealDMALockerv3                354
montrealDMALocker                  251
montrealSamSam                      62
montrealCryptoTorLocker2015         55
montrealGlobeImposter               55
montrealGlobev3                     34
montrealGlobe                       32
montrealWannaCry                    28
montrealRazy                        13
montrealAPT                         11
paduaKeRanger                       10
montrealFlyper                       9
montrealXTPLocker                    8
montrealXLockerv5.0                  7
montrealVenusLocker                  7
montrealCryptConsole                 7
montrealEDA2                         6
montrealJigSaw                       4
paduaJigsaw              

In [13]:
# Summary statistics
summary_stats = df.describe()
summary_stats

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income
count,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0
mean,2014.475,181.4572,45.00859,0.5455192,721.6446,238.5067,2.206516,4464889000.0
std,2.257398,104.0118,58.98236,3.674255,1689.676,966.3217,17.91877,162686000000.0
min,2011.0,1.0,0.0,3.606469e-94,1.0,0.0,1.0,30000000.0
25%,2013.0,92.0,2.0,0.02148438,1.0,0.0,1.0,74285590.0
50%,2014.0,181.0,8.0,0.25,1.0,0.0,2.0,199998500.0
75%,2016.0,271.0,108.0,0.8819482,56.0,0.0,2.0,994000000.0
max,2018.0,365.0,144.0,1943.749,14497.0,14496.0,12920.0,49964400000000.0


## Step 5: Data Preparation

In [14]:
# Drop the address, year and day columns, leaving length, weight, count,
# looped, neighbors, income and label.
# `columns=` already names the axis, so the redundant `axis=1` is removed.
# The result is rebound instead of using inplace=True, because `df` comes from
# a `.loc` selection and in-place edits on such frames can trigger
# SettingWithCopyWarning.
df = df.drop(columns=["address", "year", "day"])

In [15]:
# Iterates over every column except the last one (label).
# NOTE(review): each column is assigned back to itself, so this loop does not
# change any values. It looks like a placeholder for a per-feature transform
# that was never filled in — confirm the intent or remove the cell.
for col in df.columns[:-1]:
    df[col]=df[col]

In [16]:
# Split the frame into the target column and the remaining feature columns
y = df["label"]
X = df.drop(columns="label")

In [17]:
X.columns

Index(['length', 'weight', 'count', 'looped', 'neighbors', 'income'], dtype='object')

In [18]:
df.head()

Unnamed: 0,length,weight,count,looped,neighbors,income,label
0,18,0.008333,1,0,2,100050000.0,princetonCerber
1,44,0.000244,1,0,1,100000000.0,princetonLocky
2,0,1.0,1,0,2,200000000.0,princetonCerber
3,72,0.003906,1,0,2,71200000.0,princetonCerber
4,144,0.072848,456,0,1,200000000.0,princetonLocky


In [19]:
# Group the rows by label and start an empty frame to collect per-label aggregates
grouped = df.groupby(by="label")
new_df = pd.DataFrame()

In [20]:
# Store the number of rows in each label group.
# NOTE(review): new_df is not used again in the visible cells and is not
# written out — confirm whether this summary is still needed.
new_df["num_of_instances"]=grouped.size()

In [21]:
df

Unnamed: 0,length,weight,count,looped,neighbors,income,label
0,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...
2916692,0,0.111111,1,0,1,1.255809e+09,white
2916693,0,1.000000,1,0,1,4.409699e+07,white
2916694,2,12.000000,6,6,35,2.398267e+09,white
2916695,0,0.500000,1,0,1,1.780427e+08,white


## Conclusion

Having completed the data preprocessing steps, the data is now in a clean and analysis-ready state. The key preprocessing tasks, including handling missing values, data cleaning, duplicate removal, and reviewing summary statistics, have been successfully carried out. There are no missing values, and no significant data inconsistencies were detected. With this clean dataset, we are well-prepared to proceed with our analysis, modeling, or any other data-driven tasks with a high level of data quality assurance.

In [22]:
# Write the preprocessed frame to disk without the row index
df.to_csv(path_or_buf='Blockchain.csv', index=False)