# Aula 3 - Clusterização por gênero

## Aula 3.1 PCA e StandardScaler

In [28]:
# Register the remote per-genre CSV with Spark, then read the local copy that
# SparkFiles exposes, taking column names from the header row and letting
# Spark infer the column types.
url_dados_generos = 'https://github.com/IgorNascAlves/dados/blob/main/dados_musicas_genero.csv?raw=true'
sessao_spark.sparkContext.addFile(url_dados_generos)

path_dados_file = f"file://{SparkFiles.get('dados_musicas_genero.csv')}"

dados_generos = (
    sessao_spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(path_dados_file)
)

In [30]:
# Print a preview of the loaded table to check the column names and parsed values.
dados_generos.show()

+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+
|mode|              genres|       acousticness|       danceability|       duration_ms|             energy|    instrumentalness|           liveness|           loudness|         speechiness|             tempo|            valence|        popularity|key|
+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+
|   1|21st century clas...| 0.9793333333333332|0.16288333333333335|160297.66666666663|0.07131666666666665|          0.60683367|             0.3616|-31.514333333333337| 0.04056666666666667|           75.3365|0.10378333333333334| 27.83333333333333| 

In [31]:
# Total number of rows in the dataset.
dados_generos.count()

2973

In [33]:
# Number of distinct genre names; if it equals the row count, every genre
# appears exactly once (one row per genre).
dados_generos.select('genres').distinct().count()

2973

In [34]:
# Number of columns in the dataset (this count includes the 'genres' label).
len(dados_generos.columns)

14

In [35]:
from pyspark.ml.feature import VectorAssembler

In [36]:
# List the column names to decide which ones go into the feature vector.
dados_generos.columns

['mode',
 'genres',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence',
 'popularity',
 'key']

In [37]:
# Feature columns: every column except the 'genres' label, in their original order.
X = [coluna for coluna in dados_generos.columns if coluna != 'genres']
X

['mode',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence',
 'popularity',
 'key']

In [38]:
# Pack the columns listed in X into a single 'features' vector column and keep
# only that vector together with the genre label.
assembler = VectorAssembler(inputCols=X, outputCol='features')
dados_generos_vector = assembler.transform(dados_generos).select('features', 'genres')

In [40]:
# Show the first 5 assembled rows without truncating the vector contents.
dados_generos_vector.show(truncate=False, n=5)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|features                                                                                                                                                                                                                   |genres                |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|[1.0,0.9793333333333332,0.16288333333333335,160297.66666666663,0.07131666666666665,0.60683367,0.3616,-31.514333333333337,0.04056666666666667,75.3365,0.10378333333333334,27.83333333333333,6.0]                            |21st century classical|
|[1.0,0.49478,0.2993

In [41]:
from pyspark.ml.feature import StandardScaler

In [42]:
# Scale the assembled feature vector into 'scaled_features'.
# With the scaler's default settings the values are rescaled but not
# mean-centred: in the output a 0.0 input stays 0.0.
scaler = StandardScaler().setInputCol('features').setOutputCol('scaled_features')

# Fit on the full dataset, then apply the fitted model to that same dataset.
scaler_model = scaler.fit(dados_generos_vector)
dados_generos_scaler = scaler_model.transform(dados_generos_vector)

In [43]:
# Compare the raw and the scaled feature vectors side by side.
dados_generos_scaler.show()

+--------------------+--------------------+--------------------+
|            features|              genres|     scaled_features|
+--------------------+--------------------+--------------------+
|[1.0,0.9793333333...|21st century clas...|[2.68174831000279...|
|[1.0,0.49478,0.29...|               432hz|[2.68174831000279...|
|[1.0,0.762,0.7120...|               8-bit|[2.68174831000279...|
|[1.0,0.6514170195...|                  []|[2.68174831000279...|
|[1.0,0.6765573049...|          a cappella|[2.68174831000279...|
|[1.0,0.45921,0.51...|            abstract|[2.68174831000279...|
|[1.0,0.3421466666...|      abstract beats|[2.68174831000279...|
|[1.0,0.2438540633...|    abstract hip hop|[2.68174831000279...|
|[0.0,0.3229999999...|           accordeon|[0.0,1.0101313736...|
|[1.0,0.446125,0.6...|           accordion|[2.68174831000279...|
|[0.0,0.0679505384...|          acid house|[0.0,0.2125045534...|
|[1.0,0.2569145079...|           acid rock|[2.68174831000279...|
|[1.0,0.00683,0.66...|   

## Aula 3.2 PCA

In [44]:
from pyspark.ml.feature import PCA

In [45]:
# Reduce the scaled feature vector to 2 principal components, written to
# 'pca_features'.
pca = PCA().setK(2).setInputCol('scaled_features').setOutputCol('pca_features')

# Fit the projection on the scaled data and apply it to the same rows.
model_pca = pca.fit(dados_generos_scaler)
dados_generos_pca = model_pca.transform(dados_generos_scaler)

In [47]:
# Inspect the 2-component PCA projection produced by the step-by-step approach.
dados_generos_pca.select('pca_features').show(truncate=False)

+-----------------------------------------+
|pca_features                             |
+-----------------------------------------+
|[2.5070953668885667,0.43816913737697943] |
|[-0.5969679056633488,4.981612052751348]  |
|[-4.158460276223561,-0.8366525081079943] |
|[-2.387344878512217,-0.4877989015663404] |
|[-2.6501218371679083,-0.5756819768820474]|
|[-1.496509120336763,1.8644183183717797]  |
|[-3.923520772157324,0.2851835002352836]  |
|[-4.611011109831114,-0.6783790472312378] |
|[-2.837690063084229,-0.5712993716580518] |
|[-2.706690139892783,-1.25937880797083]   |
|[-4.6983313839242875,1.2765569680619446] |
|[-3.375987496679868,0.7560741064307471]  |
|[-5.608998877066021,1.0427311644393213]  |
|[0.2954946352117687,-0.2763864586236301] |
|[-2.5725591062870428,-1.3169815431109795]|
|[-3.4008228020493454,0.5073029625781897] |
|[-4.366720316263419,-0.3364827059771091] |
|[-2.7254698167724003,0.5058604987046365] |
|[-4.958112358381605,1.2627579957290729]  |
|[-3.6934951846422712,1.38227620

In [48]:
from pyspark.ml import Pipeline

In [49]:
# Same three steps as the manual version (assemble -> scale -> PCA), declared
# as ordered pipeline stages so they can be fitted and applied in one call.
estagios_pca = [
    VectorAssembler(inputCols=X, outputCol='features'),
    StandardScaler(inputCol='features', outputCol='scaled_features'),
    PCA(k=2, inputCol='scaled_features', outputCol='pca_features'),
]
pca_pipeline = Pipeline(stages=estagios_pca)

In [50]:
# Fit every pipeline stage in order, starting from the raw (unassembled) DataFrame.
pca_pipeline_model = pca_pipeline.fit(dados_generos)

In [51]:
# Apply the fitted pipeline to the raw DataFrame.
# NOTE: this rebinds dados_generos_pca, replacing the result of the manual
# step-by-step PCA; the pipeline output also keeps all original columns.
dados_generos_pca = pca_pipeline_model.transform(dados_generos)

In [52]:
# Preview the pipeline output: the original columns followed by the
# features, scaled_features and pca_features columns.
dados_generos_pca.show()

+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+--------------------+--------------------+--------------------+
|mode|              genres|       acousticness|       danceability|       duration_ms|             energy|    instrumentalness|           liveness|           loudness|         speechiness|             tempo|            valence|        popularity|key|            features|     scaled_features|        pca_features|
+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+--------------------+--------------------+--------------------+
|   1|21st century clas...| 0.9793333333333332|0.162883333

In [53]:
# Sanity check: the pipeline's projection should match the values produced by
# the manual assembler -> scaler -> PCA steps.
dados_generos_pca.select('pca_features').show(truncate=False)

+-----------------------------------------+
|pca_features                             |
+-----------------------------------------+
|[2.5070953668885667,0.43816913737697943] |
|[-0.5969679056633488,4.981612052751348]  |
|[-4.158460276223561,-0.8366525081079943] |
|[-2.387344878512217,-0.4877989015663404] |
|[-2.6501218371679083,-0.5756819768820474]|
|[-1.496509120336763,1.8644183183717797]  |
|[-3.923520772157324,0.2851835002352836]  |
|[-4.611011109831114,-0.6783790472312378] |
|[-2.837690063084229,-0.5712993716580518] |
|[-2.706690139892783,-1.25937880797083]   |
|[-4.6983313839242875,1.2765569680619446] |
|[-3.375987496679868,0.7560741064307471]  |
|[-5.608998877066021,1.0427311644393213]  |
|[0.2954946352117687,-0.2763864586236301] |
|[-2.5725591062870428,-1.3169815431109795]|
|[-3.4008228020493454,0.5073029625781897] |
|[-4.366720316263419,-0.3364827059771091] |
|[-2.7254698167724003,0.5058604987046365] |
|[-4.958112358381605,1.2627579957290729]  |
|[-3.6934951846422712,1.38227620

## Aula 3.3 K-Means

https://user.ceng.metu.edu.tr/~akifakkus/courses/ceng574/k-means/

In [54]:
from pyspark.ml.clustering import KMeans

In [55]:
# Fixed random seed passed to K-Means so that repeated runs are reproducible.
SEED = 1224

In [56]:
# K-Means with 5 clusters over the PCA coordinates; the assigned cluster index
# is written to 'cluster_pca'. The seed is fixed for reproducibility.
kmeans = KMeans(
    k=5,
    seed=SEED,
    featuresCol='pca_features',
    predictionCol='cluster_pca',
)

In [57]:
# Fit K-Means on the PCA-projected genres (uses the 'pca_features' column).
model_kmeans = kmeans.fit(dados_generos_pca)

In [59]:
# Add the 'cluster_pca' column holding the cluster index assigned to each genre.
# NOTE(review): "prections" looks like a typo for "predictions"; the name is
# kept because later cells reference it.
prections_kmeans = model_kmeans.transform(dados_generos_pca)

In [63]:
# Show each projected point next to the cluster it was assigned to.
prections_kmeans.select('pca_features', 'cluster_pca').show(truncate=False)

+-----------------------------------------+-----------+
|pca_features                             |cluster_pca|
+-----------------------------------------+-----------+
|[2.5070953668885667,0.43816913737697943] |2          |
|[-0.5969679056633488,4.981612052751348]  |2          |
|[-4.158460276223561,-0.8366525081079943] |4          |
|[-2.387344878512217,-0.4877989015663404] |0          |
|[-2.6501218371679083,-0.5756819768820474]|0          |
|[-1.496509120336763,1.8644183183717797]  |2          |
|[-3.923520772157324,0.2851835002352836]  |4          |
|[-4.611011109831114,-0.6783790472312378] |1          |
|[-2.837690063084229,-0.5712993716580518] |4          |
|[-2.706690139892783,-1.25937880797083]   |0          |
|[-4.6983313839242875,1.2765569680619446] |3          |
|[-3.375987496679868,0.7560741064307471]  |4          |
|[-5.608998877066021,1.0427311644393213]  |1          |
|[0.2954946352117687,-0.2763864586236301] |2          |
|[-2.5725591062870428,-1.3169815431109795]|0    

## Aula 3.4 Plotando a clusterização

In [64]:
from pyspark.ml.functions import vector_to_array

In [66]:
# Convert the PCA vector to an array column once, then expose its two
# components as plain 'x' / 'y' columns next to the cluster id and genre name.
pca_array = vector_to_array('pca_features')
pca_features_xy = prections_kmeans.select(
    pca_array[0].alias('x'),
    pca_array[1].alias('y'),
    'cluster_pca',
    'genres',
)

In [67]:
# Preview the flattened table that feeds the scatter plot.
pca_features_xy.show()

+-------------------+-------------------+-----------+--------------------+
|                  x|                  y|cluster_pca|              genres|
+-------------------+-------------------+-----------+--------------------+
| 2.5070953668885667|0.43816913737697943|          2|21st century clas...|
|-0.5969679056633488|  4.981612052751348|          2|               432hz|
| -4.158460276223561|-0.8366525081079943|          4|               8-bit|
| -2.387344878512217|-0.4877989015663404|          0|                  []|
|-2.6501218371679083|-0.5756819768820474|          0|          a cappella|
| -1.496509120336763| 1.8644183183717797|          2|            abstract|
| -3.923520772157324| 0.2851835002352836|          4|      abstract beats|
| -4.611011109831114|-0.6783790472312378|          1|    abstract hip hop|
| -2.837690063084229|-0.5712993716580518|          4|           accordeon|
| -2.706690139892783|  -1.25937880797083|          0|           accordion|
|-4.6983313839242875| 1.2

In [68]:
# Collect the plotting table to pandas for Plotly.
dados_plot = pca_features_xy.toPandas()

# Cluster ids are labels, not magnitudes. Left as integers, Plotly maps them to
# a continuous colour scale; cast to strings, each cluster gets its own
# discrete colour and a legend entry.
dados_plot['cluster_pca'] = dados_plot['cluster_pca'].astype(str)

fig = px.scatter(
    dados_plot,
    x='x',
    y='y',
    color='cluster_pca',
    hover_data=['x', 'y', 'genres'],
    # Keep the legend in numeric cluster order rather than order of first appearance.
    category_orders={'cluster_pca': sorted(dados_plot['cluster_pca'].unique(), key=int)},
    title='Gêneros musicais agrupados por K-Means (projeção PCA)',
    labels={'x': 'Componente principal 1', 'y': 'Componente principal 2', 'cluster_pca': 'Cluster'},
)
fig.show()

In [71]:
# stages[2] is the fitted PCA model (third stage of pca_pipeline).
# explainedVariance reports the share of variance captured by each principal
# component; the two values shown in the output add up to roughly 0.42, so the
# 2-D projection used for clustering and plotting keeps well under half of the
# variance.
pca_pipeline_model.stages[2].explainedVariance

DenseVector([0.2975, 0.1212])