In [14]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import GaussianMixture
import numpy as np

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import random

In [2]:
# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("K-means Implementation")
    .getOrCreate()
)

# Load the modeling table: the first CSV line supplies the column names and
# Spark infers each column's type from the data.
# (An earlier variant of this cell read 'numpy_array_for_modeling.csv'.)
modeling_csv_path = 'numpy_array_for_modeling_with_cathegorical_columns.csv'
df = spark.read.csv(modeling_csv_path, header=True, inferSchema=True)

df.printSchema()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 17:04:45 WARN Utils: Your hostname, Bens-MacBook-Air-7.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.76 instead (on interface en0)
25/12/02 17:04:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 17:04:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [3]:
# Spot-check a single record for malformed fields: sort by title, pull the
# first 52 rows to the driver, and print the last one of them.
df = df.orderBy('Title')
first_rows = df.limit(52).collect()
# NOTE(review): despite its name this is the 52nd row of the sorted frame.
row_53 = first_rows[-1]
print(row_53)

Row(title='El Taxi', artist='"Pitbull, Sensato, Osmani Garcia ""La Voz"""', album='Dale', af_danceability=0.869, af_energy=0.691, af_loudness=-6.872, af_speechiness=0.192, af_acousticness=0.12, af_instrumentalness=0.0, af_valence=0.875, af_tempo=95.019, language_id=1.0)


In [4]:
# Text columns whose values may carry stray quote characters.
columns_to_clean = ['title', 'artist', 'album']

# Build the projection column by column: the text columns get every double
# quote AND every single quote/apostrophe removed (so "Don't" becomes "Dont"),
# all other columns pass through untouched. Column order is preserved.
cleaned_projection = []
for c in df.columns:
    if c in columns_to_clean:
        cleaned_projection.append(F.regexp_replace(c, r"[\"']", "").alias(c))
    else:
        cleaned_projection.append(c)

df = df.select(*cleaned_projection)

In [5]:
# Repeat the single-record spot check (52nd row when sorted by title) so the
# printed Row can be compared with the one printed before the quote clean-up.
df = df.orderBy('Title')
rows_after_cleaning = df.limit(52).collect()
# NOTE(review): despite its name this is the 52nd row of the sorted frame.
row_53 = rows_after_cleaning[-1]
print(row_53)

Row(title='El Taxi', artist='Pitbull, Sensato, Osmani Garcia La Voz', album='Dale', af_danceability=0.869, af_energy=0.691, af_loudness=-6.872, af_speechiness=0.192, af_acousticness=0.12, af_instrumentalness=0.0, af_valence=0.875, af_tempo=95.019, language_id=1.0)


In [6]:
# Print df's column names and types to confirm the schema at this point.
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [7]:
# Columns that are packed into each song's feature vector: the eight audio
# features followed by the numeric language id.
audio_feature_columns = [
    "af_danceability", "af_energy", "af_loudness", "af_speechiness",
    "af_acousticness", "af_instrumentalness", "af_valence", "af_tempo",
]
input_columns = audio_feature_columns + ["language_id"]

# Spark ML estimators consume a single vector column, so assemble one.
vector_assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol='vectorized_features',
)

# `df` holds the raw per-column values; the result keeps every original
# column and appends 'vectorized_features'.
vector_dataframe = vector_assembler.transform(df)

# Vector column on its own, previewed to check the assembly worked.
transformed_vectors = vector_dataframe.select('vectorized_features')
transformed_vectors.show(5)

+--------------------+
| vectorized_features|
+--------------------+
|[0.818,0.803,-4.2...|
|[0.849,0.759,-6.2...|
|[0.731,0.863,-5.3...|
|[0.745,0.875,-4.2...|
|[0.724,0.654,-8.3...|
+--------------------+
only showing top 5 rows


In [8]:
# Print the schema of the assembled frame (the frame returned by vector_assembler.transform).
vector_dataframe.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)



In [9]:
# Print the schema of the vector-only selection taken from vector_dataframe.
transformed_vectors.printSchema()

root
 |-- vectorized_features: vector (nullable = true)



In [10]:
# k-means is distance based, so features on large scales would dominate the
# clustering. Rescale each feature to unit standard deviation; withMean=False
# means the features are NOT centered.
scaler = StandardScaler(
    inputCol='vectorized_features',
    outputCol='scaled_vectorized_features',
    withMean=False,
    withStd=True,
)

# Fitting produces the scaler model used for the transform below.
scalerModel = scaler.fit(vector_dataframe)

# Append the 'scaled_vectorized_features' column to the assembled frame.
vector_dataframe_scaled = scalerModel.transform(vector_dataframe)

# Preview the scaled vectors.
scaled_preview = vector_dataframe_scaled.select('scaled_vectorized_features')
scaled_preview.show(5)

+--------------------------+
|scaled_vectorized_features|
+--------------------------+
|      [8.97331532253304...|
|      [9.31337983964615...|
|      [8.01894070998979...|
|      [8.17251823384733...|
|      [7.94215194806103...|
+--------------------------+
only showing top 5 rows


In [11]:
# Print the schema of the scaled frame (the frame returned by scalerModel.transform).
vector_dataframe_scaled.printSchema()

root
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)



| Silhouette Score | Interpretation                                |
| ---------------- | --------------------------------------------- |
| **0.7 – 1.0**    | Excellent clusters — dense and well separated |
| **0.5 – 0.7**    | Good — meaningful structure                   |
| **0.25 – 0.5**   | Weak — clusters overlap somewhat              |
| **0.0 – 0.25**   | Very weak — clusters probably not meaningful  |
| **Negative**     | Bad — points assigned to the wrong cluster    |


In [16]:
# Sweep k = 2..14: fit k-means on the scaled features for each k and score the
# resulting clustering with the squared-Euclidean silhouette.
sil_score_squaredEuclidean = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette',
                               distanceMeasure = 'squaredEuclidean'
                               # alternative tried by the author: distanceMeasure = 'cosine'
                               )

# Draw the per-k seeds from a dedicated generator with a fixed seed so that
# Restart & Run All reproduces exactly the same sweep (the global `random`
# module is unseeded and would give different seeds on every run).
seed_rng = random.Random(42)

# Best run seen so far. The score starts at -inf because a silhouette can be
# negative; starting at 0 would leave the seed unset if no k scored above 0.
top_model_seed = 0
top_model_k = None
top_model_score = float('-inf')
for i in range(2,15):
    seed = seed_rng.randint(1,10000)
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=seed)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_squaredEuclidean.append(score)
    print("Silhouette Score with Euclidian for K = ", i , 'is', score)
    if top_model_score < score:
        # Record k together with the seed: the seed only reproduces this
        # score when it is used with the same k.
        top_model_score = score
        top_model_seed = seed
        top_model_k = i

Silhouette Score with Euclidian for K =  2 is 0.3096624988387972
Silhouette Score with Euclidian for K =  3 is 0.3282423864114304
Silhouette Score with Euclidian for K =  4 is 0.2685224102343196
Silhouette Score with Euclidian for K =  5 is 0.2204557405761854
Silhouette Score with Euclidian for K =  6 is 0.1963264112439814
Silhouette Score with Euclidian for K =  7 is 0.2605665517161189
Silhouette Score with Euclidian for K =  8 is 0.1535948767638622
Silhouette Score with Euclidian for K =  9 is 0.26109691179071376
Silhouette Score with Euclidian for K =  10 is 0.26656374546078015
Silhouette Score with Euclidian for K =  11 is 0.16548211955352823
Silhouette Score with Euclidian for K =  12 is 0.265549341972945
Silhouette Score with Euclidian for K =  13 is 0.23497141733738333
Silhouette Score with Euclidian for K =  14 is 0.23100245363397637


In [None]:
# Fit the k-means model that gets persisted, then save it next to the fitted
# scaler so new data can be scaled the same way before prediction.
# NOTE(review): top_model_seed is the seed of the best-scoring run of the k
# sweep, whatever k that run used; it is not specifically a seed tried with
# k = 8. Confirm that k = 8 is the intended choice for the saved model.
kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = 8, seed=top_model_seed)
model = kmeans.fit(vector_dataframe_scaled)

# write().overwrite() keeps this cell re-runnable: a plain save() raises an
# error when the target path already exists from an earlier run.
model.write().overwrite().save('kmeans_model')
scalerModel.write().overwrite().save('scaler_model_kmeans')

In [None]:
# Sweep k = 2..14 again, this time scoring each clustering with the cosine
# silhouette.
# NOTE(review): only the evaluator is switched to cosine here. KMeans is built
# without a distanceMeasure argument, so the clustering itself uses the
# estimator's default distance. Confirm whether the fit should also use
# distanceMeasure='cosine'.
sil_score_cosine = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette', 
                               distanceMeasure = 'cosine'
                               )

# Fixed-seed generator for the per-k seeds, so the printed scores can be
# reproduced on a fresh kernel (the unseeded global `random` cannot do that).
cosine_seed_rng = random.Random(42)
for i in range(2,15):
    cosine_seed = cosine_seed_rng.randint(1,10000)
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=cosine_seed)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_cosine.append(score)
    print("Silhouette Score with Cosine for K = ", i , 'is', score)


Silhouette Score with Cosine for K =  2 is 0.290325719630199
Silhouette Score with Cosine for K =  3 is 0.33653742996942393
Silhouette Score with Cosine for K =  4 is 0.21336792884566838
Silhouette Score with Cosine for K =  5 is 0.1803008220887235
Silhouette Score with Cosine for K =  6 is 0.3019975258138113
Silhouette Score with Cosine for K =  7 is 0.2971146223921529
Silhouette Score with Cosine for K =  8 is 0.2680694460564235
Silhouette Score with Cosine for K =  9 is 0.21002383558357204
Silhouette Score with Cosine for K =  10 is 0.2721058279811262
Silhouette Score with Cosine for K =  11 is 0.1794429043718618
Silhouette Score with Cosine for K =  12 is 0.1995899186874501
Silhouette Score with Cosine for K =  13 is 0.2073136812158296
Silhouette Score with Cosine for K =  14 is 0.2525237795959874
