In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Read Resources/crypto_data.csv into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Resources/crypto_data.csv")
df.head(5)

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
df.dtypes

<h1>Data Preparation</h1>

In [None]:
# Keep only the rows whose IsTrading flag equals True, then discard the flag
# column itself since it is constant after the filter.
is_trading_mask = df['IsTrading'].eq(True)
trading = df.loc[is_trading_mask].drop(columns=['IsTrading'])
trading.shape

In [None]:
# Drop every row that contains at least one missing value.
trading = trading.dropna(axis=0, how='any')
trading.shape

In [None]:
# Keep only cryptocurrencies that have actually been mined (TotalCoinsMined > 0).
# The boolean mask is built from `trading` itself rather than from the raw `df`:
# `trading` has already lost rows (non-trading, null), so a mask taken from `df`
# has a different length and only works because .loc silently realigns it by
# index. Using the frame's own column keeps the filter correct even if the
# index is ever reset upstream.
mined = trading.loc[trading['TotalCoinsMined'] > 0]
mined.shape

In [None]:
# Build the modelling frame: a copy of `mined` without the CoinName column
# (`mined` itself is left untouched).
new_crypto_df = mined.drop(labels='CoinName', axis=1)
new_crypto_df.head(5)

In [None]:
# (rows, columns) remaining after filtering and dropping CoinName.
new_crypto_df.shape

<h3>One-Hot-Encoding</h3>

In [None]:
# One-hot encode the categorical columns. With no `columns=` argument,
# get_dummies encodes every object/string/category column and leaves numeric
# columns as they are; drop_first=True drops the first level of each encoded
# column.
# NOTE(review): confirm that no identifier-like or numeric-stored-as-text
# column is being encoded unintentionally (check df.dtypes above).
df_numerical = pd.get_dummies(data=new_crypto_df, drop_first=True)
df_numerical.head(5)

In [None]:
# (rows, columns) of the one-hot encoded frame.
df_numerical.shape

<h3>Standard Scaler</h3>

In [None]:
# Standardise every column of the encoded frame (zero mean, unit variance).
# with_mean / with_std are spelled out; both are the StandardScaler defaults.
scaler = StandardScaler(with_mean=True, with_std=True)
scaled_data = scaler.fit_transform(X=df_numerical)

<h1>Dimensionality Reduction</h1>

<h3>PCA Model</h3>

In [None]:
# Fit PCA on the scaled data. A float n_components between 0 and 1 asks PCA to
# keep as many components as needed to explain that fraction of the variance
# (here 90%).
pca = PCA(n_components=0.9)
crypto_pca = pca.fit_transform(X=scaled_data)

In [None]:
# Wrap the PCA output in a DataFrame (default integer column labels) and
# report its (rows, components) shape.
df_crypto_pca = pd.DataFrame(crypto_pca)
df_crypto_pca.shape

<h3>t-SNE Model</h3>

In [None]:
# Initialising t-SNE model.
# t-SNE is stochastic: without a fixed random_state every run produces a
# different embedding, which in turn changes the elbow curve and the K-Means
# clusters computed from it. The seed is pinned so that Restart & Run All
# reproduces the same figures.
tsne = TSNE(learning_rate=50, random_state=0)

In [None]:
# Embed the PCA-reduced data with t-SNE.
# The raw `crypto_pca` array is used instead of `df_crypto_pca` because a later
# cell adds 'x'/'y' columns to that DataFrame; re-running this cell afterwards
# would otherwise feed the previous embedding back in as extra features.
tsne_features = tsne.fit_transform(crypto_pca)

In [None]:
# (rows, embedding dimensions) of the t-SNE output.
tsne_features.shape

In [None]:
# Attach the first two t-SNE output columns to the PCA frame as 'x' and 'y'
# so they can be plotted by name.
for axis_name, feature_idx in (('x', 0), ('y', 1)):
    df_crypto_pca[axis_name] = tsne_features[:, feature_idx]

In [None]:
# Scatter plot of the two t-SNE features, used to eyeball whether the data
# forms visible groups. Title and axis labels are set so the figure can be
# read on its own.
plt.scatter(df_crypto_pca['x'], df_crypto_pca['y'])
plt.title('t-SNE projection of the PCA-reduced data')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.show()

<h1>Running K-Means</h1>

In [None]:
# Finding the best value for k
inertia = []
k = list(range(1, 11))

# Fit one K-Means model per candidate k on the t-SNE features and record its
# inertia (the value exposed by the fitted model as `inertia_`).
for i in k:
    # n_init is passed explicitly: its default changed between scikit-learn
    # releases, so leaving it implicit makes the curve depend on the installed
    # version (and triggers a FutureWarning on some of them).
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(tsne_features)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
# Tick only the k values that were actually evaluated (1..10); k=0 is not a
# valid number of clusters and only added empty space to the axis.
plt.xticks(k)
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Put the t-SNE output in its own DataFrame (default integer column labels)
# and preview the first rows.
df_tsne = pd.DataFrame(tsne_features)
df_tsne.head(5)

In [None]:
#Predicting clusters with k=3
# n_init is passed explicitly because its default differs between
# scikit-learn releases.
model = KMeans(n_clusters=3, n_init=10, random_state=1)

# Fit on the two t-SNE coordinate columns only. Selecting them by label keeps
# this cell re-runnable: once the "class" column has been added below, fitting
# on the whole frame would include the previous labels as a feature.
tsne_coords = df_tsne[[0, 1]].to_numpy()

# fit_predict fits the model and returns the training labels in one pass,
# replacing the separate fit + predict calls on the same data.
predictions = model.fit_predict(tsne_coords)
df_tsne["class"] = predictions
df_tsne.head()

In [None]:
# Scatter plot of the t-SNE features coloured by the K-Means cluster label.
# Columns 0 and 1 of df_tsne are t-SNE outputs, not principal components, so
# the axes are labelled accordingly.
plt.scatter(df_tsne[0], df_tsne[1], c=df_tsne['class'])
plt.title('K-Means clusters on the t-SNE features')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.show()

<h1>Conclusion and Recommendations</h1>

<p>summary</p>

<h1>Scaling only the numerical data, not the one-hot encoded categorical columns</h1>

In [None]:
# Standardise only the TotalCoinsMined column; the double brackets keep it as
# a one-column DataFrame, which is the 2-D input the scaler expects.
# NOTE(review): this treats TotalCoinsMined as the only numeric feature;
# confirm no other numeric column should be scaled as well.
scaler = StandardScaler(with_mean=True, with_std=True)
scaled_data = scaler.fit_transform(df_numerical.loc[:, ['TotalCoinsMined']])

In [None]:
 # Column labels of the one-hot encoded DataFrame (df_numerical)
df_numerical.columns

In [None]:
# Build the feature frame for this variant: TotalCoinsMined standardised, the
# one-hot encoded columns left exactly as they are in df_numerical.
#
# The frame starts as a copy of df_numerical so that every dummy column keeps
# its real values. (Filling the non-scaled columns with 0 would throw away all
# categorical information and leave PCA with a single informative column.)
new_crypto_df = df_numerical.copy()

# scaled_data holds one column, the standardised TotalCoinsMined. It is
# assigned by column name rather than by position in df_numerical.columns, and
# as a flat array so the values are written positionally regardless of the
# frame's row index.
new_crypto_df['TotalCoinsMined'] = scaled_data[:, 0]

new_crypto_df.head()

In [None]:
# Re-run PCA on the partially scaled frame, again keeping enough components to
# explain 90% of the variance. Note this rebinds `pca` and `crypto_pca` from
# the earlier section.
pca = PCA(n_components=0.9)
crypto_pca = pca.fit_transform(X=new_crypto_df)

In [None]:
 # Wrap the new PCA output in a DataFrame (rebinding df_crypto_pca) and preview it
df_crypto_pca = pd.DataFrame(crypto_pca)
df_crypto_pca.head(5)