In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

In [2]:
# Loading the CSV file into the dataframe
crypto_df = pd.read_csv('Resources/crypto_data.csv')
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
crypto_df.shape

(1252, 7)

In [4]:
crypto_df.value_counts()

Unnamed: 0  CoinName     Algorithm    IsTrading  ProofType  TotalCoinsMined  TotalCoinSupply
1337        EliteCoin    X13          True       PoW/PoS    2.927942e+10     314159265359       1
POKER       PokerCoin    Scrypt       False      PoS/PoW    0.000000e+00     466666667          1
PKB         ParkByte     SHA-256      True       PoW/PoS    0.000000e+00     25000000           1
PLNC        PLNCoin      Scrypt       True       PoW/PoS    1.708960e+07     38540000           1
PLTC        PlatinCoin   CryptoNight  True       PoW        8.430000e+04     600000518          1
                                                                                               ..
FAIR        FairCoin     Groestl      True       PoW/PoS    5.319383e+07     0                  1
FC2         Fuel2Coin    X11          True       PoS        0.000000e+00     100000000          1
FCN         FantomCoin   CryptoNight  True       PoW        0.000000e+00     18400000           1
FIBRE       FIBRE        

In [5]:
# Filtering for currencies that are currently being traded
crypto_df = crypto_df[crypto_df['IsTrading'] == True]
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [6]:
# Dropping 'IsTrading' Column
crypto_df = crypto_df.drop(columns ='IsTrading')

In [7]:
# Check null values
crypto_df.isnull().sum()

Unnamed: 0           0
CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [8]:
# Removing all rows with null values
crypto_df = crypto_df.dropna()

In [10]:
# Verifying that the null valued rows are dropped
crypto_df.isnull().sum()

Unnamed: 0         0
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [11]:
# Filtering for the crpytocurrencies that have been mined already
crypto_df = crypto_df[crypto_df["TotalCoinsMined"]> 0]
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [12]:
# Saving the 'CoinName' column for t-SNE
CoinName=pd.DataFrame(crypto_df['CoinName']).reset_index(drop=True)
CoinName

Unnamed: 0,CoinName
0,42 Coin
1,404Coin
2,EliteCoin
3,Bitcoin
4,Ethereum
...,...
527,ZEPHYR
528,Gapcoin
529,Beldex
530,Horizen


In [13]:
# Dropping the 'CoinName' column from the dataframe considering it has no value in the contribution to the data that we seek
crypto_df.drop(columns='CoinName', axis=1, inplace= True)
crypto_df

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,4.199995e+01,42
2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Equihash,PoW,7.296538e+06,21000000


In [14]:
# Dropping 'Unnamed' column
crypto_df = crypto_df.drop(columns=['Unnamed: 0'])
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [15]:
# Converting the algorithm and prooftype into numerical data
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Viewing all of the columnnames
X.columns

Index(['TotalCoinsMined', 'TotalCoinSupply',
       'Algorithm_1GB AES Pattern Search', 'Algorithm_536',
       'Algorithm_Argon2d', 'Algorithm_BLAKE256', 'Algorithm_Blake',
       'Algorithm_Blake2S', 'Algorithm_Blake2b', 'Algorithm_C11',
       'Algorithm_Cloverhash', 'Algorithm_Counterparty',
       'Algorithm_CryptoNight', 'Algorithm_CryptoNight Heavy',
       'Algorithm_CryptoNight-V7', 'Algorithm_Cryptonight-GPU',
       'Algorithm_DPoS', 'Algorithm_Dagger', 'Algorithm_Dagger-Hashimoto',
       'Algorithm_ECC 256K1', 'Algorithm_Equihash',
       'Algorithm_Equihash+Scrypt', 'Algorithm_Ethash', 'Algorithm_Exosis',
       'Algorithm_Green Protocol', 'Algorithm_Groestl', 'Algorithm_HMQ1725',
       'Algorithm_HybridScryptHash256', 'Algorithm_IMesh',
       'Algorithm_Jump Consistent Hash', 'Algorithm_Keccak',
       'Algorithm_Leased POS', 'Algorithm_Lyra2RE', 'Algorithm_Lyra2REv2',
       'Algorithm_Lyra2Z', 'Algorithm_M7 POW', 'Algorithm_Multiple',
       'Algorithm_NIST5', 'Algor

In [17]:
# Standarize dataset
scaler = StandardScaler()
crypto_scaled = scaler.fit_transform(X)

In [18]:
# Number of features in dataset
len(X.columns)

98

In [19]:
# Reducing dimensions from 98 to 4 using PCA
pca = PCA(n_components=4)

# Grabbing three principal components for the crypto data.
crypto_pca1 = pca.fit_transform(crypto_scaled)

In [20]:
# Creating a dataframe from the PCA data
df_crypto_pca1 = pd.DataFrame(data=crypto_pca1, 
                             columns=["PC 1", "PC 2", "PC 3", "PC4"])
df_crypto_pca1.head()

Unnamed: 0,PC 1,PC 2,PC 3,PC4
0,-0.32995,1.008941,-0.445273,-0.017887
1,-0.313266,1.008907,-0.445484,-0.017886
2,2.307833,1.57454,-0.561888,-0.026813
3,-0.154853,-1.349338,0.107709,0.005611
4,-0.149363,-1.985712,0.347129,0.049044


In [21]:
# Viewing the explained variance ratio
pca.explained_variance_ratio_

array([0.02793118, 0.02140418, 0.02047694, 0.02044505])

In [22]:
# Using PCA we preserve 90% of the 'explained variance' through dimension reduction
pca2 = PCA(n_components=.90)

# Grabbing three principal components for the crypto data.
crypto_pca = pca2.fit_transform(crypto_scaled)
crypto_pca

array([[-3.35098808e-01,  1.03218907e+00, -5.90713438e-01, ...,
         3.91305630e-15, -2.25435607e-15, -5.99024584e-16],
       [-3.18433948e-01,  1.03233143e+00, -5.91125551e-01, ...,
         3.75637712e-15, -2.19086589e-15, -4.41719317e-17],
       [ 2.30546811e+00,  1.65638302e+00, -6.83616670e-01, ...,
        -1.01639009e-13,  1.10178585e-15,  7.83077047e-14],
       ...,
       [ 3.24348535e-01, -2.31230764e+00,  4.21515366e-01, ...,
        -8.20759771e-15, -6.51792518e-14,  1.95265232e-14],
       [-1.49363652e-01, -2.04933553e+00,  4.12968257e-01, ...,
        -1.64202016e-15, -2.93110610e-15, -1.52187234e-15],
       [-2.89957500e-01,  8.21194511e-01, -2.76326847e-01, ...,
        -2.06754367e-15, -3.37415118e-15, -2.44300560e-15]])

In [23]:
# Creating a dataframe from the PCA data
transformed_crypto_pca = pd.DataFrame(data=crypto_pca)
transformed_crypto_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,-0.335099,1.032189,-0.590713,0.001397,-2.585454e-15,-2.164895e-14,-2.250875e-12,-0.007129,-1.319756,-0.051813,...,-1.071995e-15,-1.557977e-15,4.4776870000000006e-17,9.205529000000001e-17,7.560486000000001e-17,2.34557e-15,2.976201e-15,3.913056e-15,-2.254356e-15,-5.990246e-16
1,-0.318434,1.032331,-0.591126,0.001386,-2.388053e-15,-2.144965e-14,-2.23231e-12,-0.007739,-1.322514,-0.056674,...,4.058616e-16,-1.838374e-15,1.477054e-16,8.320883e-16,-9.87705e-16,2.182964e-15,3.49913e-15,3.756377e-15,-2.190866e-15,-4.417193e-17
2,2.305468,1.656383,-0.683617,0.004731,1.054695e-15,-3.522258e-14,-7.658698e-12,-0.054781,-1.542879,-0.943951,...,-1.728598e-14,-3.228766e-15,-6.302382e-14,3.503961e-14,1.911059e-14,-5.731383e-15,-5.557323e-14,-1.01639e-13,1.101786e-15,7.83077e-14
3,-0.145184,-1.320593,0.192813,-0.001229,-4.903154e-16,8.862953e-15,1.987214e-12,-0.002071,0.281463,0.251862,...,-1.111375e-15,-7.296071e-16,-1.05366e-14,7.339311e-15,-9.932598e-15,1.047256e-15,-1.609524e-14,-1.532224e-14,-1.548906e-14,6.528893e-15
4,-0.151768,-2.036192,0.396182,-0.001705,7.659398e-17,1.389176e-14,2.756466e-12,0.027735,0.519099,0.18882,...,-5.222553e-14,-4.887392e-14,-1.247253e-13,9.374215e-14,-1.759288e-14,3.24803e-15,-1.694098e-14,4.055445e-14,-1.588358e-13,2.634451e-14


In [24]:
# Fetch explained variance
pca.explained_variance_ratio_

array([0.02793118, 0.02140418, 0.02047694, 0.02044505])