In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import GaussianMixture
import numpy as np

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.functions import vector_to_array

import random

In [2]:
# creating spark app
spark = SparkSession.builder.appName("K-means Implementation").getOrCreate()

#reading in the CSV file 
df = spark.read.csv('big_data_ready_for_modeling.csv', header=True, inferSchema=True)
# file_read = 'gs://term-project-fall-202511162025/running_big_data/big_data_ready_for_modeling.csv'
# df = spark.read.csv(file_read,
#                     header=True, inferSchema = True)

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [3]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=3105490, title='"Alus Dziesma (""Vella Kalpi"")"', artist='Raimonds Pauls, Vīru Vokālais Ansamblis Edgara Račevska Vadībā', album='Teic, Kur Zeme Tā', af_danceability=0.684, af_energy=0.419, af_loudness=-7.714, af_speechiness=0.0296, af_acousticness=0.88, af_instrumentalness=0.756, af_valence=0.956, af_tempo=96.745, language_id=-2.0)


In [4]:
columns_to_clean = ['title','artist', 'album']
# we are removing quotations from the titles
df = df.select(
    *[
        F.regexp_replace(c, r"[\"']", "").alias(c)
        if c in columns_to_clean else c
        for c in df.columns
    ]
)

In [5]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=1508866, title='#Summerbody', artist='DJ Muscleboy, Sverrir Bergmann', album='#Summerbody', af_danceability=0.414, af_energy=0.97, af_loudness=-2.954, af_speechiness=0.387, af_acousticness=0.0402, af_instrumentalness=0.0, af_valence=0.217, af_tempo=160.026, language_id=-2.0)


In [6]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [7]:
input_columns =  [
    "af_danceability",
    "af_energy",
    "af_loudness",
    "af_speechiness",
    "af_acousticness",
    "af_instrumentalness",
    "af_valence",
    "af_tempo",
    "language_id"
]
vector_assembler = VectorAssembler(inputCols = input_columns, 
                            outputCol = 'vectorized_features')
vector_dataframe = vector_assembler.transform(df) # df is our data without being vectorized
transformed_vectors = vector_dataframe.select('vectorized_features') # selecting the vectorized features so we can use them in a model
transformed_vectors.show(5)

+--------------------+
| vectorized_features|
+--------------------+
|[0.561,0.798,-7.0...|
|[0.66,0.593,-7.96...|
|[0.688,0.479,-7.4...|
|[0.72,0.601,-10.1...|
|[0.752,0.638,-8.4...|
+--------------------+
only showing top 5 rows



In [8]:
vector_dataframe.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)



In [9]:
transformed_vectors.printSchema()

root
 |-- vectorized_features: vector (nullable = true)



In [10]:
#scaling the data for k-means since it's a distance based algorithm 
scaler = StandardScaler(inputCol = 'vectorized_features',
                       outputCol = 'scaled_vectorized_features',
                       withStd = True,
                       withMean = False)
#check the summary statistics of our resutls by fitting the standard scaler
scalerModel = scaler.fit(vector_dataframe)

#normalizing each feature to have unit standard deviation
vector_dataframe_scaled = scalerModel.transform(vector_dataframe)

#showing the scaled features
vector_dataframe_scaled.select('scaled_vectorized_features').show(5)

+--------------------------+
|scaled_vectorized_features|
+--------------------------+
|      [3.64958367206017...|
|      [4.29362784948256...|
|      [4.47578175824848...|
|      [4.68395765398097...|
|      [4.89213354971346...|
+--------------------------+
only showing top 5 rows



In [11]:
vector_dataframe_scaled.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)



| Silhouette Score | Interpretation                                |
| ---------------- | --------------------------------------------- |
| **0.7 – 1.0**    | Excellent clusters — dense and well separated |
| **0.5 – 0.7**    | Good — meaningful structure                   |
| **0.25 – 0.5**   | Weak — clusters overlap somewhat              |
| **0.0 – 0.25**   | Very weak — clusters probably not meaningful  |
| **Negative**     | Bad — points assigned to the wrong cluster    |


In [12]:
#using squared eucledian
sil_score_squaredEuclidean = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette',
                               distanceMeasure = 'squaredEuclidean' 
                               # distanceMeasure = 'cosine' #cosine gives us better results
                               )

top_model_seed = 0
top_model_score = 0
for i in range(2,15):
    seed = random.randint(1,10000)
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=seed)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_squaredEuclidean.append(score)
    print("Silhouette Score with Euclidian for K = ", i , 'is', score)
    if top_model_score < score:
        top_model_score = score
        top_model_seed = seed

Silhouette Score with Euclidian for K =  2 is 0.399443770829862
Silhouette Score with Euclidian for K =  3 is 0.37216769171451497
Silhouette Score with Euclidian for K =  4 is 0.242148771132395
Silhouette Score with Euclidian for K =  5 is 0.263713717507036
Silhouette Score with Euclidian for K =  6 is 0.23694962316202306
Silhouette Score with Euclidian for K =  7 is 0.2913089501411729
Silhouette Score with Euclidian for K =  8 is 0.26354607876409264
Silhouette Score with Euclidian for K =  9 is 0.2653593328172245
Silhouette Score with Euclidian for K =  10 is 0.2584189095256405
Silhouette Score with Euclidian for K =  11 is 0.2510620785155972
Silhouette Score with Euclidian for K =  12 is 0.21654304203557256
Silhouette Score with Euclidian for K =  13 is 0.25690377122378927
Silhouette Score with Euclidian for K =  14 is 0.249145886313278


In [19]:
kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = 13, seed=top_model_seed)
model = kmeans.fit(vector_dataframe_scaled)
# model.save('kmeans_model_big_data')
# scalerModel.save('scaler_model_kmeans_big_data')

In [20]:
df_with_KMeans_predictions = model.transform(vector_dataframe_scaled)
df_with_KMeans_predictions.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [21]:
# check the distribution of clusters
df_with_KMeans_predictions.groupBy('prediction').count().orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0|12715|
|         1| 8478|
|         2| 6667|
|         3| 7638|
|         4| 9558|
|         5| 2094|
|         6| 7246|
|         7| 7881|
|         8| 7305|
|         9| 9514|
|        10|19696|
|        11| 9128|
|        12| 4851|
+----------+-----+



In [23]:
cluster_sizes = model.summary.clusterSizes
print("Cluster Sizes: ",cluster_sizes)

Cluster Sizes:  [12715, 8478, 6667, 7638, 9558, 2094, 7246, 7881, 7305, 9514, 19696, 9128, 4851]


In [24]:
total_points = df_with_KMeans_predictions.count()
cluster_distribution_percentage = [size / total_points * 100 for size in cluster_sizes]
print("Cluster Distribution (Percentage):", cluster_distribution_percentage)

Cluster Distribution (Percentage): [11.275061851007795, 7.517890237738426, 5.911980917079745, 6.773017885803974, 8.47558326165415, 1.8568603630365963, 6.425410788234563, 6.988498816185013, 6.477729203429961, 8.436566138457582, 17.465483147263036, 8.094279557687702, 4.301637832421456]


In [49]:
# saving each cluster as a separate csv

# start by figuring out how many unique predictions there are
unique_predictions = predictions.select('prediction').distinct().collect()
for row in unique_predictions:
    cluster_number = row['prediction']
    cluster_df = predictions.filter(predictions['prediction'] == cluster_number)
    output_path = f'big_data_clusters/cluster_{cluster_number}_big_data.csv'

    # convert the scaled_vectorized_features into strings so they can be added to the csv
    df_out = cluster_df.withColumn(
        "scaled_vectorized_features_str",
        F.array_join(vector_to_array("scaled_vectorized_features"), ",")
    )

    df_out.select('title', 'artist', 'album', 'language_id', 'scaled_vectorized_features_str').coalesce(1).write.mode("overwrite").csv(output_path, header=True)

+--------+--------------------+--------------------+--------------------+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+--------------------+--------------------------+----------+
|     _c0|               title|              artist|               album|af_danceability|af_energy|af_loudness|af_speechiness|af_acousticness|af_instrumentalness|af_valence|af_tempo|language_id| vectorized_features|scaled_vectorized_features|prediction|
+--------+--------------------+--------------------+--------------------+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+--------------------+--------------------------+----------+
|  485174|                   !|            O.S.T.R.|W drodze po szczę...|          0.561|    0.798|     -7.064|          0.32|          0.307|                0.0|      0.14| 140.837|        9.0|[0.561,0.798,-7.0...|      [3.64958367206017

In [34]:
#using cosine similarity
sil_score_cosine = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette', 
                               distanceMeasure = 'cosine'
                               )
for i in range(2,15):
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=random.randint(1,10000))
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_cosine.append(score)
    print("Silhouette Score with Cosine for K = ", i , 'is', score)


Silhouette Score with Cosine for K =  2 is 0.4238623727664529
Silhouette Score with Cosine for K =  3 is 0.4016587915720338
Silhouette Score with Cosine for K =  4 is 0.24742673237517163
Silhouette Score with Cosine for K =  5 is 0.26559821358679087
Silhouette Score with Cosine for K =  6 is 0.22567116675989374
Silhouette Score with Cosine for K =  7 is 0.24131017443656583
Silhouette Score with Cosine for K =  8 is 0.23831258644450318
Silhouette Score with Cosine for K =  9 is 0.23440292046149092
Silhouette Score with Cosine for K =  10 is 0.2622376563000166
Silhouette Score with Cosine for K =  11 is 0.23625565949301608
Silhouette Score with Cosine for K =  12 is 0.24215659796099806
Silhouette Score with Cosine for K =  13 is 0.22694024559017276
Silhouette Score with Cosine for K =  14 is 0.2264450660041378
