In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw cryptocurrency dataset; the path is relative to the notebook's
# working directory.
crypto_df = pd.read_csv('Data/crypto_data.csv')
# Preview the first rows to check that the file parsed as expected.
crypto_df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
crypto_df.info()

In [None]:
# Summary statistics of the raw frame (numeric columns by default).
crypto_df.describe()

### Remove all rows that have at least one null value.

In [None]:
# Keep only the rows in which every column is populated.
crypto_df2 = crypto_df.dropna(axis="index", how="any")
# Confirm the remaining row count and that no nulls are left.
crypto_df2.info()

### Discard all cryptocurrencies that are not being traded. In other words, filter for currencies that are currently being traded. Once you have done this, drop the IsTrading column from the dataframe.

In [None]:
# Count how many rows carry each IsTrading value before filtering.
crypto_df2.IsTrading.value_counts()

In [None]:
# Boolean mask selecting the rows whose IsTrading flag equals True.
is_trading_mask = crypto_df2["IsTrading"].eq(True)
crypto_df3 = crypto_df2[is_trading_mask]
# Check the row count after the filter.
crypto_df3.info()

In [None]:
# Sanity check: after the filter only the True value should remain.
crypto_df3.IsTrading.value_counts()

In [None]:
# IsTrading is constant after the filter above, so remove the column.
crypto_df4 = crypto_df3.drop(columns=["IsTrading"])
crypto_df4.info()

### Filter for cryptocurrencies that have been mined. That is, the total coins mined should be greater than zero.

In [None]:
# Keep only the currencies with a strictly positive mined-coin total.
has_mined_coins = crypto_df4["TotalCoinsMined"].gt(0)
crypto_df5 = crypto_df4[has_mined_coins]
crypto_df5.info()

In [None]:
# Convert TotalCoinSupply to a numeric dtype.
# crypto_df5 was produced by boolean filtering, so assigning a column on it
# directly triggers pandas' SettingWithCopyWarning. `assign` builds a fresh
# frame instead, which is unambiguous and safe to re-run.
# NOTE(review): pd.to_numeric raises on values it cannot parse — confirm the
# column holds only numeric strings.
crypto_df5 = crypto_df5.assign(
    TotalCoinSupply=pd.to_numeric(crypto_df5["TotalCoinSupply"])
)
# Confirm the new dtype of TotalCoinSupply.
crypto_df5.info()

In [None]:
# Summary statistics after the numeric conversion of TotalCoinSupply.
crypto_df5.describe()

### In order for your dataset to be comprehensible to a machine learning algorithm, its data should be numeric. Since the coin names do not contribute to the analysis of the data, delete the CoinName from the original dataframe.

In [None]:
# Remove the two columns that are not used as features: 'Unnamed: 0' and the
# coin name.
non_feature_columns = ['Unnamed: 0', 'CoinName']
crypto_df6 = crypto_df5.drop(columns=non_feature_columns)
crypto_df6.head()

In [None]:
# List the distinct ProofType labels and how often each occurs; used to spot
# spelling variants of the same label.
crypto_df6.ProofType.value_counts()

In [None]:
# Spelling variants of ProofType labels mapped to one canonical label each.
proof_type_aliases = {
    "PoW/PoS ": "PoW/PoS",
    "PoW and PoS": "PoW/PoS",
    "Proof of Authority": "PoA",
    "Pos": "PoS",
    "Proof of Trust": "PoT",
}

# Apply the mapping to the ProofType column only. A frame-wide replace would
# also rewrite any matching value that happened to appear in another column.
crypto_df7 = crypto_df6.assign(
    ProofType=crypto_df6["ProofType"].replace(proof_type_aliases)
)
# Re-check the label counts after normalisation.
crypto_df7.ProofType.value_counts()

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicate_count = crypto_df7.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

### Your next step in data preparation is to convert the remaining features with text values, Algorithm and ProofType, into numerical data. To accomplish this task, use Pandas to create dummy variables. Examine the number of rows and columns of your dataset now. 

In [None]:
# One-hot encode the remaining text columns. No `columns=` argument is passed,
# so pandas chooses the columns to encode itself — presumably Algorithm and
# ProofType; verify that no other text column remains at this point.
# Note: this rebinds crypto_df7 to the encoded frame.
crypto_df7 = pd.get_dummies(crypto_df7)
crypto_df7.head()

## How did they change? The number of rows remained the same and the number of columns increased. 

# Scale

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling parameters from the two continuous columns only; the
# dummy columns are not passed to the scaler.
continuous_columns = ["TotalCoinsMined", "TotalCoinSupply"]
scaler = StandardScaler().fit(crypto_df7[continuous_columns])

In [None]:
# Columns transformed by the scaler fitted above.
scaled_columns = ["TotalCoinsMined", "TotalCoinSupply"]

# Work on a copy so the unscaled frame (crypto_df7) stays available.
final_crypto = crypto_df7.copy()

# Replace the two columns wholesale with their scaled (float) values.
# Whole-column assignment is used rather than `.loc[:, cols] = ...`, which
# writes into the existing columns and therefore depends on their current
# dtype (a float-into-integer write is deprecated in recent pandas).
final_crypto[scaled_columns] = scaler.transform(crypto_df7[scaled_columns])
final_crypto.head()

In [None]:
# Column dtypes and non-null counts of the scaled, encoded frame.
final_crypto.info()

In [None]:
# Summary statistics of the scaled, encoded frame.
final_crypto.describe()

# TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE is stochastic: without a fixed seed every run produces a different
# embedding. Seed it (same value as the KMeans cells) so the plot is
# reproducible under Restart & Run All.
tsne = TSNE(learning_rate=35, random_state=0)
# Reduce dimensions
tsne_features = tsne.fit_transform(final_crypto)
# Shape of the embedding: (n_samples, n_embedding_dimensions).
tsne_features.shape

In [None]:
# Visualize the clusters
plt.scatter(tsne_features[:,0], tsne_features[:,1])
# Title and axis labels so the figure stands alone, matching the elbow plot.
plt.title('t-SNE Embedding of Scaled Features')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.90).fit(final_crypto)

# Per-component share of explained variance, followed by the cumulative total.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(np.sum(variance_ratios))

In [None]:
# Explained-variance ratio of each retained component (x is the 0-based
# component index).
plt.plot(range(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)
# Title and axis labels so the figure stands alone, matching the elbow plot.
plt.title('PCA Explained Variance Ratio')
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Number of principal components PCA retained.
print(len(pca.explained_variance_ratio_))

In [None]:
# Project the scaled features onto the retained principal components.
pca_values = pca.transform(final_crypto)

# Column names PCA1..PCAn, one per retained component.
component_count = len(pca.explained_variance_ratio_)
pca_column_names = [f"PCA{x}" for x in range(1, component_count + 1)]

crypto_pca = pd.DataFrame(pca_values, columns=pca_column_names)
crypto_pca.head()

# TSNE w/PCA

In [None]:
# t-SNE is stochastic: without a fixed seed every run produces a different
# embedding. Seed it (same value as the KMeans cells) so the plot is
# reproducible under Restart & Run All.
# Note: this rebinds `tsne` and `tsne_features` to the PCA-based embedding.
tsne = TSNE(learning_rate=35, random_state=0)
# Reduce dimensions
tsne_features = tsne.fit_transform(crypto_pca)
# Shape of the embedding: (n_samples, n_embedding_dimensions).
tsne_features.shape

In [None]:
# Visualize the clusters
plt.scatter(tsne_features[:,0], tsne_features[:,1])
# Title and axis labels so the figure stands alone, matching the elbow plot.
plt.title('t-SNE Embedding of PCA Components')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

# KMeans w/PCA

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Finding the best value for k
# Candidate cluster counts: 1 through 19.
k = list(range(1, 20))

# Fit one seeded KMeans per candidate and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(crypto_pca).inertia_
    for n_clusters in k
]

# Creating the Elbow Curve
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow.k, df_elbow.inertia)
plt.xticks(list(range(20)))
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Predicting clusters with k=8

# Cluster on the PCA component columns only. This cell appends a "class"
# column to crypto_pca, so drop it first if it is already present; otherwise
# re-running the cell would feed the previous labels back in as a feature.
pca_features = crypto_pca.drop(columns="class", errors="ignore")

# Initialize the k-means model
model = KMeans(n_clusters=8, random_state=0)

# Fit the model and get the cluster label of every row in one step
predictions = model.fit_predict(pca_features)

# Add the predicted class column to the dataframe
crypto_pca["class"] = predictions
crypto_pca.head()

In [None]:
# Visualize the clusters
# Points are positioned by the t-SNE embedding and coloured by KMeans class.
plt.scatter(tsne_features[:,0], tsne_features[:,1], c=crypto_pca["class"])
# Title and axis labels so the figure stands alone, matching the elbow plot.
plt.title('KMeans Clusters on t-SNE Embedding')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

In [None]:
# Number of rows assigned to each KMeans cluster.
crypto_pca["class"].value_counts()

In [None]:
# Analysis

# One-hot encoding wasn't appropriate for the Algorithm column because it produced many columns