In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import GaussianMixture
import numpy as np

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [20]:
# creating spark app
spark = SparkSession.builder.appName("K-means Implementation").getOrCreate()

#reading in the CSV file 
# df = spark.read.csv('numpy_array_for_modeling.csv', header=True, inferSchema=True)
df = spark.read.csv('numpy_array_for_modeling_with_cathegorical_columns.csv',
                    header=True, inferSchema = True)

df.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [21]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(title='El Taxi', artist='"Pitbull, Sensato, Osmani Garcia ""La Voz"""', album='Dale', af_danceability=0.869, af_energy=0.691, af_loudness=-6.872, af_speechiness=0.192, af_acousticness=0.12, af_instrumentalness=0.0, af_valence=0.875, af_tempo=95.019, language_id=1.0)


In [22]:
columns_to_clean = ['title','artist', 'album']
# we are removing quotations from the titles
df = df.select(
    *[
        F.regexp_replace(c, r"[\"']", "").alias(c)
        if c in columns_to_clean else c
        for c in df.columns
    ]
)

In [23]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(title='El Taxi', artist='Pitbull, Sensato, Osmani Garcia La Voz', album='Dale', af_danceability=0.869, af_energy=0.691, af_loudness=-6.872, af_speechiness=0.192, af_acousticness=0.12, af_instrumentalness=0.0, af_valence=0.875, af_tempo=95.019, language_id=1.0)


In [24]:
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [25]:
input_columns =  [
    "af_danceability",
    "af_energy",
    "af_loudness",
    "af_speechiness",
    "af_acousticness",
    "af_instrumentalness",
    "af_valence",
    "af_tempo",
    "language_id"
]
vector_assembler = VectorAssembler(inputCols = input_columns, 
                            outputCol = 'vectorized_features')
vector_dataframe = vector_assembler.transform(df) # df is our data without being vectorized
transformed_vectors = vector_dataframe.select('vectorized_features') # selecting the vectorized features so we can use them in a model
transformed_vectors.show(5)

+--------------------+
| vectorized_features|
+--------------------+
|[0.818,0.803,-4.2...|
|[0.849,0.759,-6.2...|
|[0.731,0.863,-5.3...|
|[0.745,0.875,-4.2...|
|[0.724,0.654,-8.3...|
+--------------------+
only showing top 5 rows



In [26]:
vector_dataframe.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)



In [27]:
transformed_vectors.printSchema()

root
 |-- vectorized_features: vector (nullable = true)



In [28]:
#scaling the data for k-means since it's a distance based algorithm 
scaler = StandardScaler(inputCol = 'vectorized_features',
                       outputCol = 'scaled_vectorized_features',
                       withStd = True,
                       withMean = False)
#check the summary statistics of our resutls by fitting the standard scaler
scalerModel = scaler.fit(vector_dataframe)

#normalizing each feature to have unit standard deviation
vector_dataframe_scaled = scalerModel.transform(vector_dataframe)

#showing the scaled features
vector_dataframe_scaled.select('scaled_vectorized_features').show(5)

+--------------------------+
|scaled_vectorized_features|
+--------------------------+
|      [8.97331532253304...|
|      [9.31337983964615...|
|      [8.01894070998979...|
|      [8.17251823384733...|
|      [7.94215194806103...|
+--------------------------+
only showing top 5 rows



In [29]:
vector_dataframe_scaled.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)



| Silhouette Score | Interpretation                                |
| ---------------- | --------------------------------------------- |
| **0.7 – 1.0**    | Excellent clusters — dense and well separated |
| **0.5 – 0.7**    | Good — meaningful structure                   |
| **0.25 – 0.5**   | Weak — clusters overlap somewhat              |
| **0.0 – 0.25**   | Very weak — clusters probably not meaningful  |
| **Negative**     | Bad — points assigned to the wrong cluster    |


In [34]:
#using squared eucledian
sil_score_squaredEuclidean = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette',
                               distanceMeasure = 'squaredEuclidean' 
                               # distanceMeasure = 'cosine' #cosine gives us better results
                               )
for i in range(2,15):
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_squaredEuclidean.append(score)
    print("Silhouette Score with Euclidian for K = ", i , 'is', score)


Silhouette Score for K =  2 is 0.22353895051776462
Silhouette Score for K =  3 is 0.2500682067035294
Silhouette Score for K =  4 is 0.29563365375859507
Silhouette Score for K =  5 is 0.23115307206521082
Silhouette Score for K =  6 is 0.279244472939588
Silhouette Score for K =  7 is 0.2261657664611322
Silhouette Score for K =  8 is 0.28317650357154045
Silhouette Score for K =  9 is 0.22850796609771618
Silhouette Score for K =  10 is 0.2678315796058076
Silhouette Score for K =  11 is 0.19569639948044257
Silhouette Score for K =  12 is 0.3124989305190608
Silhouette Score for K =  13 is 0.2011263922381937
Silhouette Score for K =  14 is 0.2739895986001434


In [35]:
#using cosine similarity
sil_score_cosine = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette', 
                               distanceMeasure = 'cosine'
                               )
for i in range(2,15):
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_cosine.append(score)
    print("Silhouette Score with Cosine for K = ", i , 'is', score)


Silhouette Score for K =  2 is 0.24238690930991852
Silhouette Score for K =  3 is 0.26606653994483337
Silhouette Score for K =  4 is 0.3166084352904843
Silhouette Score for K =  5 is 0.24261747646722825
Silhouette Score for K =  6 is 0.25271560887434374
Silhouette Score for K =  7 is 0.21484177189426215
Silhouette Score for K =  8 is 0.27516396644730096
Silhouette Score for K =  9 is 0.22642796768809426
Silhouette Score for K =  10 is 0.2714856518703668
Silhouette Score for K =  11 is 0.22197027233295988
Silhouette Score for K =  12 is 0.30719147876791963
Silhouette Score for K =  13 is 0.1928410053521714
Silhouette Score for K =  14 is 0.2744126014466831
