In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import GaussianMixture
import numpy as np

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import random

In [3]:
# creating spark app
spark = SparkSession.builder.appName("K-means Implementation").getOrCreate()

#reading in the CSV file 
# df = spark.read.csv('numpy_array_for_modeling.csv', header=True, inferSchema=True)
file_read = 'gs://term-project-fall-202511162025/running_big_data/big_data_ready_for_modeling.csv'
df = spark.read.csv(file_read,
                    header=True, inferSchema = True)

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [4]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=3105490, title='"Alus Dziesma (""Vella Kalpi"")"', artist='Raimonds Pauls, Vīru Vokālais Ansamblis Edgara Račevska Vadībā', album='Teic, Kur Zeme Tā', af_danceability=0.684, af_energy=0.419, af_loudness=-7.714, af_speechiness=0.0296, af_acousticness=0.88, af_instrumentalness=0.756, af_valence=0.956, af_tempo=96.745, language_id=-2.0)


In [5]:
columns_to_clean = ['title','artist', 'album']
# we are removing quotations from the titles
df = df.select(
    *[
        F.regexp_replace(c, r"[\"']", "").alias(c)
        if c in columns_to_clean else c
        for c in df.columns
    ]
)

In [6]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=1508866, title='#Summerbody', artist='DJ Muscleboy, Sverrir Bergmann', album='#Summerbody', af_danceability=0.414, af_energy=0.97, af_loudness=-2.954, af_speechiness=0.387, af_acousticness=0.0402, af_instrumentalness=0.0, af_valence=0.217, af_tempo=160.026, language_id=-2.0)


In [7]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [8]:
input_columns =  [
    "af_danceability",
    "af_energy",
    "af_loudness",
    "af_speechiness",
    "af_acousticness",
    "af_instrumentalness",
    "af_valence",
    "af_tempo",
    "language_id"
]
vector_assembler = VectorAssembler(inputCols = input_columns, 
                            outputCol = 'vectorized_features')
vector_dataframe = vector_assembler.transform(df) # df is our data without being vectorized
transformed_vectors = vector_dataframe.select('vectorized_features') # selecting the vectorized features so we can use them in a model
transformed_vectors.show(5)

+--------------------+
| vectorized_features|
+--------------------+
|[0.561,0.798,-7.0...|
|[0.66,0.593,-7.96...|
|[0.688,0.479,-7.4...|
|[0.72,0.601,-10.1...|
|[0.752,0.638,-8.4...|
+--------------------+
only showing top 5 rows



In [9]:
vector_dataframe.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)



In [10]:
transformed_vectors.printSchema()

root
 |-- vectorized_features: vector (nullable = true)



In [11]:
#scaling the data for k-means since it's a distance based algorithm 
scaler = StandardScaler(inputCol = 'vectorized_features',
                       outputCol = 'scaled_vectorized_features',
                       withStd = True,
                       withMean = False)
#check the summary statistics of our resutls by fitting the standard scaler
scalerModel = scaler.fit(vector_dataframe)

#normalizing each feature to have unit standard deviation
vector_dataframe_scaled = scalerModel.transform(vector_dataframe)

#showing the scaled features
vector_dataframe_scaled.select('scaled_vectorized_features').show(5)

+--------------------------+
|scaled_vectorized_features|
+--------------------------+
|      [3.64958367206017...|
|      [4.29362784948256...|
|      [4.47578175824848...|
|      [4.68395765398097...|
|      [4.89213354971346...|
+--------------------------+
only showing top 5 rows



In [12]:
vector_dataframe_scaled.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)



| Silhouette Score | Interpretation                                |
| ---------------- | --------------------------------------------- |
| **0.7 – 1.0**    | Excellent clusters — dense and well separated |
| **0.5 – 0.7**    | Good — meaningful structure                   |
| **0.25 – 0.5**   | Weak — clusters overlap somewhat              |
| **0.0 – 0.25**   | Very weak — clusters probably not meaningful  |
| **Negative**     | Bad — points assigned to the wrong cluster    |


In [13]:
#using squared eucledian
sil_score_squaredEuclidean = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette',
                               distanceMeasure = 'squaredEuclidean' 
                               # distanceMeasure = 'cosine' #cosine gives us better results
                               )

top_model_seed = 0
top_model_score = 0
for i in range(2,15):
    seed = random.randint(1,10000)
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=seed)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_squaredEuclidean.append(score)
    print("Silhouette Score with Euclidian for K = ", i , 'is', score)
    if top_model_score < score:
        top_model_score = score
        top_model_seed = seed

Silhouette Score with Euclidian for K =  2 is 0.39942939907340136
Silhouette Score with Euclidian for K =  3 is 0.22429778141502657
Silhouette Score with Euclidian for K =  4 is 0.349248523984464
Silhouette Score with Euclidian for K =  5 is 0.20730141726415233
Silhouette Score with Euclidian for K =  6 is 0.2715562973802937
Silhouette Score with Euclidian for K =  7 is 0.26438983879245337
Silhouette Score with Euclidian for K =  8 is 0.26886196741862173
Silhouette Score with Euclidian for K =  9 is 0.2598426131544815
Silhouette Score with Euclidian for K =  10 is 0.24740655222946892
Silhouette Score with Euclidian for K =  11 is 0.22962987141405136
Silhouette Score with Euclidian for K =  12 is 0.25276568507859765
Silhouette Score with Euclidian for K =  13 is 0.2605782453622631
Silhouette Score with Euclidian for K =  14 is 0.21885541710836356


In [15]:
kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = 8, seed=top_model_seed)
model = kmeans.fit(vector_dataframe_scaled)
model.save('kmeans_model_big_data')
scalerModel.save('scaler_model_kmeans_big_data')

In [None]:
#using cosine similarity
sil_score_cosine = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette', 
                               distanceMeasure = 'cosine'
                               )
for i in range(2,15):
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=random.randint(1,10000))
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_cosine.append(score)
    print("Silhouette Score with Cosine for K = ", i , 'is', score)


Silhouette Score with Cosine for K =  2 is 0.290325719630199
Silhouette Score with Cosine for K =  3 is 0.33653742996942393
Silhouette Score with Cosine for K =  4 is 0.21336792884566838
Silhouette Score with Cosine for K =  5 is 0.1803008220887235
Silhouette Score with Cosine for K =  6 is 0.3019975258138113
Silhouette Score with Cosine for K =  7 is 0.2971146223921529
Silhouette Score with Cosine for K =  8 is 0.2680694460564235
Silhouette Score with Cosine for K =  9 is 0.21002383558357204
Silhouette Score with Cosine for K =  10 is 0.2721058279811262
Silhouette Score with Cosine for K =  11 is 0.1794429043718618
Silhouette Score with Cosine for K =  12 is 0.1995899186874501
Silhouette Score with Cosine for K =  13 is 0.2073136812158296
Silhouette Score with Cosine for K =  14 is 0.2525237795959874
