In [126]:
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Artist Week

In [127]:
df_01 = pd.read_csv("../Resources/ArtistWeek.csv")
print(df_01.info())
df_01.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595 entries, 0 to 594
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      595 non-null    int64  
 1   index           595 non-null    int64  
 2   Artist          595 non-null    object 
 3   Followers       130 non-null    float64
 4   Genres          129 non-null    object 
 5   NumAlbums       130 non-null    float64
 6   YearFirstAlbum  130 non-null    float64
 7   Gender          126 non-null    object 
 8   Group.Solo      130 non-null    object 
 9   Features        108 non-null    object 
 10  Week            595 non-null    object 
 11  Total streams   595 non-null    float64
dtypes: float64(4), int64(2), object(6)
memory usage: 55.9+ KB
None


Unnamed: 0.1,Unnamed: 0,index,Artist,Followers,Genres,NumAlbums,YearFirstAlbum,Gender,Group.Solo,Features,Week,Total streams
0,0,0,ed sheeran,52698756.0,"pop,uk pop",8.0,2011.0,M,Solo,,2018-07-20,936155300.0
1,1,1,justin bieber,30711450.0,"canadian pop,dance pop,pop,post-teen pop",10.0,2009.0,M,Solo,,2018-12-14,45434490.0
2,2,2,jonas brothers,3069527.0,"boy band,dance pop,pop,post-teen pop",10.0,2006.0,M,Group,,2019-04-12,182558400.0
3,3,3,drake,41420478.0,"canadian hip hop,canadian pop,hip hop,pop rap,...",11.0,2010.0,M,Solo,,2018-07-20,3441947000.0
4,4,4,chris brown,9676862.0,"dance pop,pop,pop rap,r&b,rap",6.0,2005.0,M,Solo,,2017-11-10,138334200.0


In [128]:
df_01 = df_01.drop(columns=["Unnamed: 0","Artist","Genres","Week"])
df_01['Features'] = df_01['Features'].notna().astype(int)
df_01 = df_01.dropna()
df_01_dummies = pd.get_dummies(df_01)

In [129]:
df_01_dummies.columns

Index(['index', 'Followers', 'NumAlbums', 'YearFirstAlbum', 'Features',
       'Total streams', 'Gender_F', 'Gender_M', 'Group.Solo_Group',
       'Group.Solo_Solo'],
      dtype='object')

In [130]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file

df_01_scaled = StandardScaler().fit_transform(df_01_dummies[["Followers", "NumAlbums", "YearFirstAlbum", "Features", "Total streams", "Gender_F", "Gender_M","Group.Solo_Group","Group.Solo_Solo"]])
df_01_scaled[0:5]


array([[ 5.34671764, -0.41911065,  0.73528133, -0.4975186 ,  2.07035408,
        -0.45993311,  0.45993311, -0.43437224,  0.43437224],
       [ 2.76806818, -0.31232007,  0.62767919, -0.4975186 , -0.28493872,
        -0.45993311,  0.45993311, -0.43437224,  0.43437224],
       [-0.47374915, -0.31232007,  0.46627597, -0.4975186 ,  0.07765184,
        -0.45993311,  0.45993311,  2.30217289, -2.30217289],
       [ 4.02401238, -0.25892478,  0.68148026, -0.4975186 ,  8.69630685,
        -0.45993311,  0.45993311, -0.43437224,  0.43437224],
       [ 0.30115255, -0.52590123,  0.41247489, -0.4975186 , -0.03928831,
        -0.45993311,  0.45993311, -0.43437224,  0.43437224]])

In [131]:
# Create a DataFrame with the scaled data
df_01_scaled_df = pd.DataFrame(df_01_scaled, columns = ["Followers", "NumAlbums", "YearFirstAlbum", "Features", "Total streams", "Gender_F", "Gender_M","Group.Solo_Group","Group.Solo_Solo"])

# Copy the crypto names from the original data
df_01_scaled_df["Artist"] = df_01.index

# Set the coinid column as index
df_01_scaled_df.set_index("Artist", inplace = True)

# Display sample data
df_01_scaled_df.head()

Unnamed: 0_level_0,Followers,NumAlbums,YearFirstAlbum,Features,Total streams,Gender_F,Gender_M,Group.Solo_Group,Group.Solo_Solo
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,5.346718,-0.419111,0.735281,-0.497519,2.070354,-0.459933,0.459933,-0.434372,0.434372
1,2.768068,-0.31232,0.627679,-0.497519,-0.284939,-0.459933,0.459933,-0.434372,0.434372
2,-0.473749,-0.31232,0.466276,-0.497519,0.077652,-0.459933,0.459933,2.302173,-2.302173
3,4.024012,-0.258925,0.68148,-0.497519,8.696307,-0.459933,0.459933,-0.434372,0.434372
4,0.301153,-0.525901,0.412475,-0.497519,-0.039288,-0.459933,0.459933,-0.434372,0.434372


In [132]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1,11))


In [133]:
# Create an empty list to store the inertia values
inertia_values = []


# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for each_k in k_values:
    k_model_elbow = KMeans(n_clusters=each_k, random_state=1)
    k_model_elbow.fit(df_01_scaled_df)
    inertia_values.append(k_model_elbow.inertia_)

inertia_values

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[1134.0000000000002,
 868.4092139031537,
 630.1726652199843,
 464.97503701513153,
 359.0830686744978,
 282.7196235329007,
 242.36899817594633,
 217.5267851077425,
 184.20695861642253,
 177.52740759677866]

In [134]:
# Create a dictionary with the data to plot the Elbow curve
elbow_dict = {"k" : k_values, "inertia" : inertia_values}


# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame(elbow_dict, columns= ["k", "inertia"])
elbow_df.head()

Unnamed: 0,k,inertia
0,1,1134.0
1,2,868.409214
2,3,630.172665
3,4,464.975037
4,5,359.083069


In [135]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_df.hvplot.line(x = "k", y = "inertia")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [136]:
# Initialize the K-Means model using the best value for k
k_model_df_market_data_scaled_4c = KMeans(n_clusters=5, random_state=1)
# Fit the K-Means model using the scaled data
k_model_df_market_data_scaled_4c.fit(df_01_scaled_df)

# Predict the clusters to group the cryptocurrencies using the scaled data
k_model_df_market_data_scaled_4c_predictions = k_model_df_market_data_scaled_4c.predict(df_01_scaled_df)

# Print the resulting array of cluster values.
k_model_df_market_data_scaled_4c_predictions


  super()._check_params_vs_input(X, default_n_init=10)


array([4, 1, 2, 4, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1,
       1, 3, 1, 2, 1, 1, 1, 0, 2, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0,
       3, 0, 3, 3, 3, 2, 2, 3, 3, 3, 0, 0, 3, 3, 3, 1, 2, 1, 1, 1, 2, 0,
       1, 1, 1, 1, 1, 0, 2, 1, 4, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1,
       0, 2, 1, 2, 3, 1, 2, 1, 3, 2, 2, 3, 1, 3, 1, 3])

In [137]:
# Create a copy of the DataFrame
df_01_scaled_df_copy = df_01_scaled_df.copy()

In [138]:
# Add a new column to the DataFrame with the predicted clusters
df_01_scaled_df_copy["predictions"] = k_model_df_market_data_scaled_4c_predictions

# Display sample data
df_01_scaled_df_copy.head()

Unnamed: 0_level_0,Followers,NumAlbums,YearFirstAlbum,Features,Total streams,Gender_F,Gender_M,Group.Solo_Group,Group.Solo_Solo,predictions
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.346718,-0.419111,0.735281,-0.497519,2.070354,-0.459933,0.459933,-0.434372,0.434372,4
1,2.768068,-0.31232,0.627679,-0.497519,-0.284939,-0.459933,0.459933,-0.434372,0.434372,1
2,-0.473749,-0.31232,0.466276,-0.497519,0.077652,-0.459933,0.459933,2.302173,-2.302173,2
3,4.024012,-0.258925,0.68148,-0.497519,8.696307,-0.459933,0.459933,-0.434372,0.434372,4
4,0.301153,-0.525901,0.412475,-0.497519,-0.039288,-0.459933,0.459933,-0.434372,0.434372,1


In [139]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.


df_01_scaled_df_copy.hvplot.scatter(x = "Followers", y = "YearFirstAlbum", by = "predictions", hover_cols = "coin_id")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## attributesBillboard

In [140]:
df_02 = pd.read_csv("../Resources/attributesBillboard.csv")
print(df_02.info())
df_02.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4774 entries, 0 to 4773
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        4774 non-null   int64  
 1   Artist            4774 non-null   object 
 2   Name              4774 non-null   object 
 3   Weekly.rank       4774 non-null   int64  
 4   Peak.position     4229 non-null   float64
 5   Weeks.on.chart    4229 non-null   float64
 6   Week              4774 non-null   object 
 7   Date              3938 non-null   object 
 8   Genre             4774 non-null   object 
 9   Writing.Credits   4279 non-null   object 
 10  Lyrics            4774 non-null   object 
 11  Features          717 non-null    object 
 12  Acousticness      4774 non-null   float64
 13  Album             4774 non-null   object 
 14  Danceability      4774 non-null   float64
 15  Duration          4774 non-null   int64  
 16  Energy            4774 non-null   float64


Unnamed: 0.1,Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",...,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",...,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...",...,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",...,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...",...,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [141]:
df_02.columns

Index(['Unnamed: 0', 'Artist', 'Name', 'Weekly.rank', 'Peak.position',
       'Weeks.on.chart', 'Week', 'Date', 'Genre', 'Writing.Credits', 'Lyrics',
       'Features', 'Acousticness', 'Album', 'Danceability', 'Duration',
       'Energy', 'Explicit', 'Instrumentalness', 'Liveness', 'Loudness',
       'Mode', 'Popularity', 'Speechiness', 'Tempo', 'TimeSignature',
       'Valence'],
      dtype='object')

In [142]:
df_02 = df_02.drop(columns=["Unnamed: 0","Artist","Name","Week","Date","Genre","Writing.Credits","Lyrics","Album"])
df_02['Features'] = df_02['Features'].notna().astype(int)
df_02 = df_02.dropna()
df_02_dummies = pd.get_dummies(df_02)
df_02_dummies.head()

Unnamed: 0,Weekly.rank,Peak.position,Weeks.on.chart,Features,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,6,1.0,17.0,0,0.0427,0.842,181027,0.734,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,13,2.0,2.0,0,0.00929,0.771,171360,0.671,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,24,24.0,11.0,0,0.0137,0.577,169667,0.833,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,26,25.0,16.0,0,0.749,0.655,198440,0.56,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,32,4.0,47.0,0,0.193,0.579,190947,0.904,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [143]:
df_02_dummies.columns

Index(['Weekly.rank', 'Peak.position', 'Weeks.on.chart', 'Features',
       'Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence'],
      dtype='object')

In [144]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file

df_02_scaled = StandardScaler().fit_transform(df_02_dummies[['Weekly.rank', 'Peak.position', 'Weeks.on.chart', 'Features',
       'Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence']])
df_02_scaled[0:5]


array([[-3.20774353, -1.40911266, -0.0625221 , -0.40661383, -0.59948035,
         1.63326542, -1.24157136,  0.17088416, -0.46279928, -0.13565222,
        -0.53379455,  0.35738149, -1.61695081,  1.86707972, -0.29727765,
         0.50433154,  0.10700916,  1.91065833],
       [-2.88829648, -1.37264794, -1.57861123, -0.40661383, -0.76222976,
         1.13173892, -1.46352201, -0.19867304, -0.46279928, -0.13565222,
        -0.80111435,  0.11973485,  0.618448  ,  2.23168235, -0.33498676,
        -1.26812352,  0.10700916,  0.83126933],
       [-2.38630825, -0.5704241 , -0.66895775, -0.40661383, -0.74074742,
        -0.23862926, -1.50239264,  0.7516169 , -0.46279928, -0.13565222,
        -0.43900029,  1.10131881,  0.618448  ,  1.86707972, -0.18199552,
        -0.50097047,  0.10700916,  0.22354611],
       [-2.29503766, -0.53395938, -0.1635947 , -0.40661383,  2.84110344,
         0.31234351, -0.84177549, -0.84979763, -0.46279928, -0.13565222,
        -0.47691799, -0.41410905,  0.618448  ,  1.775

In [145]:
# Create a DataFrame with the scaled data
df_02_scaled_df = pd.DataFrame(df_02_scaled, columns = ['Weekly.rank', 'Peak.position', 'Weeks.on.chart', 'Features',
       'Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence'])

# Copy the crypto names from the original data
df_02_scaled_df["Index"] = df_02.index

# Set the coinid column as index
df_02_scaled_df.set_index("Index", inplace = True)

# Display sample data
df_02_scaled_df.head()

Unnamed: 0_level_0,Weekly.rank,Peak.position,Weeks.on.chart,Features,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,-3.207744,-1.409113,-0.062522,-0.406614,-0.59948,1.633265,-1.241571,0.170884,-0.462799,-0.135652,-0.533795,0.357381,-1.616951,1.86708,-0.297278,0.504332,0.107009,1.910658
1,-2.888296,-1.372648,-1.578611,-0.406614,-0.76223,1.131739,-1.463522,-0.198673,-0.462799,-0.135652,-0.801114,0.119735,0.618448,2.231682,-0.334987,-1.268124,0.107009,0.831269
2,-2.386308,-0.570424,-0.668958,-0.406614,-0.740747,-0.238629,-1.502393,0.751617,-0.462799,-0.135652,-0.439,1.101319,0.618448,1.86708,-0.181996,-0.50097,0.107009,0.223546
3,-2.295038,-0.533959,-0.163595,-0.406614,2.841103,0.312344,-0.841775,-0.849798,-0.462799,-0.135652,-0.476918,-0.414109,0.618448,1.775929,-0.407173,0.605357,0.107009,0.309716
4,-2.021226,-1.299718,2.969656,-0.406614,0.132673,-0.224502,-1.013812,1.168102,-0.462799,-0.135652,-0.799218,1.363075,0.618448,2.140532,-0.264956,-1.368982,0.107009,0.681606


In [146]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1,11))

# Create an empty list to store the inertia values
inertia_values = []


# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for each_k in k_values:
    k_model_elbow = KMeans(n_clusters=each_k, random_state=1)
    k_model_elbow.fit(df_02_scaled_df)
    inertia_values.append(k_model_elbow.inertia_)

inertia_values

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[76121.99999999993,
 68681.45919258884,
 62916.863726336975,
 57689.61296682198,
 54122.095103497806,
 52152.96934932962,
 50568.89441125114,
 48433.500873928766,
 47643.19119815676,
 46102.628604354766]

In [147]:
# Create a dictionary with the data to plot the Elbow curve
elbow_dict = {"k" : k_values, "inertia" : inertia_values}


# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame(elbow_dict, columns= ["k", "inertia"])
elbow_df.head()

Unnamed: 0,k,inertia
0,1,76122.0
1,2,68681.459193
2,3,62916.863726
3,4,57689.612967
4,5,54122.095103


In [148]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_df.hvplot.line(x = "k", y = "inertia")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [149]:
# Initialize the K-Means model using the best value for k
k_model_df_market_data_scaled_4c = KMeans(n_clusters=5, random_state=1)
# Fit the K-Means model using the scaled data
k_model_df_market_data_scaled_4c.fit(df_02_scaled_df)

# Predict the clusters to group the cryptocurrencies using the scaled data
k_model_df_market_data_scaled_4c_predictions = k_model_df_market_data_scaled_4c.predict(df_02_scaled_df)

# Print the resulting array of cluster values.
k_model_df_market_data_scaled_4c_predictions


  super()._check_params_vs_input(X, default_n_init=10)


array([4, 4, 4, ..., 3, 1, 0])

In [150]:
# Create a copy of the DataFrame
df_02_scaled_df_copy = df_02_scaled_df.copy()

In [151]:
# Add a new column to the DataFrame with the predicted clusters
df_02_scaled_df_copy["predictions"] = k_model_df_market_data_scaled_4c_predictions

# Display sample data
df_02_scaled_df_copy.head()

Unnamed: 0_level_0,Weekly.rank,Peak.position,Weeks.on.chart,Features,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence,predictions
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,-3.207744,-1.409113,-0.062522,-0.406614,-0.59948,1.633265,-1.241571,0.170884,-0.462799,-0.135652,-0.533795,0.357381,-1.616951,1.86708,-0.297278,0.504332,0.107009,1.910658,4
1,-2.888296,-1.372648,-1.578611,-0.406614,-0.76223,1.131739,-1.463522,-0.198673,-0.462799,-0.135652,-0.801114,0.119735,0.618448,2.231682,-0.334987,-1.268124,0.107009,0.831269,4
2,-2.386308,-0.570424,-0.668958,-0.406614,-0.740747,-0.238629,-1.502393,0.751617,-0.462799,-0.135652,-0.439,1.101319,0.618448,1.86708,-0.181996,-0.50097,0.107009,0.223546,4
3,-2.295038,-0.533959,-0.163595,-0.406614,2.841103,0.312344,-0.841775,-0.849798,-0.462799,-0.135652,-0.476918,-0.414109,0.618448,1.775929,-0.407173,0.605357,0.107009,0.309716,3
4,-2.021226,-1.299718,2.969656,-0.406614,0.132673,-0.224502,-1.013812,1.168102,-0.462799,-0.135652,-0.799218,1.363075,0.618448,2.140532,-0.264956,-1.368982,0.107009,0.681606,4


In [152]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.


df_02_scaled_df_copy.hvplot.scatter(x = "Popularity", y = "Duration", by = "predictions", hover_cols = "coin_id")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Attributes

In [153]:
df_03 = pd.read_csv("../Resources/songAttributes_1999-2019.csv", encoding="latin-1")
print(df_03.info())
df_03.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154931 entries, 0 to 154930
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   X                 154931 non-null  int64  
 1   Acousticness      154931 non-null  float64
 2   Album             154931 non-null  object 
 3   Artist            154931 non-null  object 
 4   Danceability      154931 non-null  float64
 5   Duration          154931 non-null  int64  
 6   Energy            154931 non-null  float64
 7   Explicit          154931 non-null  bool   
 8   Instrumentalness  154931 non-null  float64
 9   Liveness          154931 non-null  float64
 10  Loudness          154931 non-null  float64
 11  Mode              154931 non-null  int64  
 12  Name              154931 non-null  object 
 13  Popularity        154931 non-null  int64  
 14  Speechiness       154931 non-null  float64
 15  Tempo             154931 non-null  float64
 16  TimeSignature     15

Unnamed: 0,X,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365
1,1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408
2,2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37
3,3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183
4,4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666


In [154]:
df_03 = df_03.drop(columns=["X","Album","Artist","Name"])
df_03 = df_03.dropna()
df_03_dummies = pd.get_dummies(df_03)
df_03_dummies.head()

Unnamed: 0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,35,0.0309,106.022,4,0.365
1,0.0182,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,31,0.0282,120.027,4,0.408
2,0.000473,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,30,0.0559,144.061,4,0.37
3,0.00097,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,35,0.0254,111.975,4,0.183
4,3.6e-05,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,21,0.0318,92.721,4,0.666


In [155]:
df_03_dummies.columns

Index(['Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence'],
      dtype='object')

In [156]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file

df_03_scaled = StandardScaler().fit_transform(df_03_dummies[['Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence']])
df_03_scaled[0:5]


array([[-0.88383848, -0.3307916 ,  0.02076269,  1.14005597, -0.52116674,
        -0.26400845, -0.82273458,  0.72016176,  0.6812078 ,  0.89363594,
        -0.60203858, -0.42597394,  0.20464731, -0.55528788],
       [-0.82569678,  0.03245497,  0.05914778,  0.30212519, -0.52116674,
        -0.31227077, -0.3460696 ,  0.74923584,  0.6812078 ,  0.6513086 ,
        -0.61979453,  0.02618422,  0.20464731, -0.3763936 ],
       [-0.88468705, -0.02113878, -0.28249278,  1.20021511, -0.52116674,
        -0.31343776, -0.6749081 ,  1.13008226, -1.46798083,  0.59072676,
        -0.43763165,  0.80213348,  0.20464731, -0.53448622],
       [-0.88303317,  0.1217779 , -0.00822942,  0.0958653 , -0.52116674,
        -0.31542916, -0.608968  ,  0.71511585,  0.6812078 ,  0.89363594,
        -0.63820811, -0.23377847,  0.20464731, -1.31246832],
       [-0.88614192, -0.3307916 , -0.08235263,  0.7275362 , -0.52116674,
        -0.31554866, -0.75119173,  0.83477611, -1.46798083,  0.04549024,
        -0.59611993, -0.85

In [157]:
# Create a DataFrame with the scaled data
df_03_scaled_df = pd.DataFrame(df_03_scaled, columns = ['Acousticness', 'Danceability', 'Duration', 'Energy', 'Explicit',
       'Instrumentalness', 'Liveness', 'Loudness', 'Mode', 'Popularity',
       'Speechiness', 'Tempo', 'TimeSignature', 'Valence'])

# Copy the crypto names from the original data
df_03_scaled_df["Artist"] = df_03.index

# Set the coinid column as index
df_03_scaled_df.set_index("Artist", inplace = True)

# Display sample data
df_03_scaled_df.head()

Unnamed: 0_level_0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-0.883838,-0.330792,0.020763,1.140056,-0.521167,-0.264008,-0.822735,0.720162,0.681208,0.893636,-0.602039,-0.425974,0.204647,-0.555288
1,-0.825697,0.032455,0.059148,0.302125,-0.521167,-0.312271,-0.34607,0.749236,0.681208,0.651309,-0.619795,0.026184,0.204647,-0.376394
2,-0.884687,-0.021139,-0.282493,1.200215,-0.521167,-0.313438,-0.674908,1.130082,-1.467981,0.590727,-0.437632,0.802133,0.204647,-0.534486
3,-0.883033,0.121778,-0.008229,0.095865,-0.521167,-0.315429,-0.608968,0.715116,0.681208,0.893636,-0.638208,-0.233778,0.204647,-1.312468
4,-0.886142,-0.330792,-0.082353,0.727536,-0.521167,-0.315549,-0.751192,0.834776,-1.467981,0.04549,-0.59612,-0.855403,0.204647,0.696972


In [158]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1,11))

# Create an empty list to store the inertia values
inertia_values = []


# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for each_k in k_values:
    k_model_elbow = KMeans(n_clusters=each_k, random_state=1)
    k_model_elbow.fit(df_03_scaled_df)
    inertia_values.append(k_model_elbow.inertia_)

inertia_values


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[2169034.0000000037,
 1855508.961996036,
 1675008.3921113396,
 1560928.2975670607,
 1475802.1300795567,
 1397792.2685760504,
 1322954.7510230015,
 1251378.2868384842,
 1204390.3189250687,
 1137178.6664083349]

In [159]:
# Create a dictionary with the data to plot the Elbow curve
elbow_dict = {"k" : k_values, "inertia" : inertia_values}


# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame(elbow_dict, columns= ["k", "inertia"])
elbow_df.head()

Unnamed: 0,k,inertia
0,1,2169034.0
1,2,1855509.0
2,3,1675008.0
3,4,1560928.0
4,5,1475802.0


In [160]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_df.hvplot.line(x = "k", y = "inertia")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [161]:
# Initialize the K-Means model using the best value for k
k_model_df_market_data_scaled_4c = KMeans(n_clusters=5, random_state=1)
# Fit the K-Means model using the scaled data
k_model_df_market_data_scaled_4c.fit(df_03_scaled_df)

# Predict the clusters to group the cryptocurrencies using the scaled data
k_model_df_market_data_scaled_4c_predictions = k_model_df_market_data_scaled_4c.predict(df_03_scaled_df)

# Print the resulting array of cluster values.
k_model_df_market_data_scaled_4c_predictions


  super()._check_params_vs_input(X, default_n_init=10)


array([2, 2, 2, ..., 4, 2, 2])

In [162]:
# Create a copy of the DataFrame
df_03_scaled_df_copy = df_03_scaled_df.copy()

# Add a new column to the DataFrame with the predicted clusters
df_03_scaled_df_copy["predictions"] = k_model_df_market_data_scaled_4c_predictions

# Display sample data
df_03_scaled_df_copy.head()

Unnamed: 0_level_0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence,predictions
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-0.883838,-0.330792,0.020763,1.140056,-0.521167,-0.264008,-0.822735,0.720162,0.681208,0.893636,-0.602039,-0.425974,0.204647,-0.555288,2
1,-0.825697,0.032455,0.059148,0.302125,-0.521167,-0.312271,-0.34607,0.749236,0.681208,0.651309,-0.619795,0.026184,0.204647,-0.376394,2
2,-0.884687,-0.021139,-0.282493,1.200215,-0.521167,-0.313438,-0.674908,1.130082,-1.467981,0.590727,-0.437632,0.802133,0.204647,-0.534486,2
3,-0.883033,0.121778,-0.008229,0.095865,-0.521167,-0.315429,-0.608968,0.715116,0.681208,0.893636,-0.638208,-0.233778,0.204647,-1.312468,2
4,-0.886142,-0.330792,-0.082353,0.727536,-0.521167,-0.315549,-0.751192,0.834776,-1.467981,0.04549,-0.59612,-0.855403,0.204647,0.696972,4


In [163]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.


df_03_scaled_df_copy.hvplot.scatter(x = "Popularity", y = "Valence", by = "predictions", hover_cols = "coin_id")

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
