In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import GaussianMixture
import numpy as np

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.functions import vector_to_array

import random

In [25]:
# creating spark app
spark = SparkSession.builder.appName("K-means Implementation").getOrCreate()

#reading in the CSV file 
df = spark.read.csv('big_data_ready_for_modeling.csv', header=True, inferSchema=True)
# file_read = 'gs://term-project-fall-202511162025/running_big_data/big_data_ready_for_modeling.csv'
# df = spark.read.csv(file_read,
#                     header=True, inferSchema = True)

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [26]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=3105490, title='"Alus Dziesma (""Vella Kalpi"")"', artist='Raimonds Pauls, Vīru Vokālais Ansamblis Edgara Račevska Vadībā', album='Teic, Kur Zeme Tā', af_danceability=0.684, af_energy=0.419, af_loudness=-7.714, af_speechiness=0.0296, af_acousticness=0.88, af_instrumentalness=0.756, af_valence=0.956, af_tempo=96.745, language_id=-2.0)


In [27]:
columns_to_clean = ['title','artist', 'album']
# we are removing quotations from the titles
df = df.select(
    *[
        F.regexp_replace(c, r"[\"']", "").alias(c)
        if c in columns_to_clean else c
        for c in df.columns
    ]
)

In [28]:
#checking if there are any columns that we need to delete if they break the mold
df = df.orderBy('Title')
row_53 = df.limit(52).collect()[-1]
print(row_53)

Row(_c0=1508866, title='#Summerbody', artist='DJ Muscleboy, Sverrir Bergmann', album='#Summerbody', af_danceability=0.414, af_energy=0.97, af_loudness=-2.954, af_speechiness=0.387, af_acousticness=0.0402, af_instrumentalness=0.0, af_valence=0.217, af_tempo=160.026, language_id=-2.0)


In [29]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)



In [30]:
input_columns =  [
    # "af_danceability",
    "af_energy",
    "af_loudness",
    # "af_speechiness",
    "af_acousticness",
    # "af_instrumentalness",
    "af_valence",
    "af_tempo",
    "language_id"
]
vector_assembler = VectorAssembler(inputCols = input_columns, 
                            outputCol = 'vectorized_features')
vector_dataframe = vector_assembler.transform(df) # df is our data without being vectorized
transformed_vectors = vector_dataframe.select('vectorized_features') # selecting the vectorized features so we can use them in a model
transformed_vectors.show(5)

+--------------------+
| vectorized_features|
+--------------------+
|[0.798,-7.064,0.3...|
|[0.593,-7.96,0.09...|
|[0.479,-7.476,0.5...|
|[0.601,-10.197,0....|
|[0.638,-8.496,0.5...|
+--------------------+
only showing top 5 rows



In [31]:
vector_dataframe.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)



In [32]:
transformed_vectors.printSchema()

root
 |-- vectorized_features: vector (nullable = true)



In [33]:
#scaling the data for k-means since it's a distance based algorithm 
scaler = StandardScaler(inputCol = 'vectorized_features',
                       outputCol = 'scaled_vectorized_features',
                       withStd = True,
                       withMean = False)
#check the summary statistics of our resutls by fitting the standard scaler
scalerModel = scaler.fit(vector_dataframe)

#normalizing each feature to have unit standard deviation
vector_dataframe_scaled = scalerModel.transform(vector_dataframe)

#showing the scaled features
vector_dataframe_scaled.select('scaled_vectorized_features').show(5)

+--------------------------+
|scaled_vectorized_features|
+--------------------------+
|      [3.85309309078550...|
|      [2.86326341207494...|
|      [2.31282154196272...|
|      [2.90189091173194...|
|      [3.08054309764555...|
+--------------------------+
only showing top 5 rows



In [34]:
vector_dataframe_scaled.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)



| Silhouette Score | Interpretation                                |
| ---------------- | --------------------------------------------- |
| **0.7 – 1.0**    | Excellent clusters — dense and well separated |
| **0.5 – 0.7**    | Good — meaningful structure                   |
| **0.25 – 0.5**   | Weak — clusters overlap somewhat              |
| **0.0 – 0.25**   | Very weak — clusters probably not meaningful  |
| **Negative**     | Bad — points assigned to the wrong cluster    |


In [35]:
#using squared eucledian
sil_score_squaredEuclidean = []
evaluator = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette',
                               distanceMeasure = 'squaredEuclidean' 
                               # distanceMeasure = 'cosine' #cosine gives us better results
                               )

top_model_seed = 0
top_model_score = 0
for i in range(2,15):
    seed = random.randint(1,10000)
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=seed)
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator.evaluate(predictions)
    sil_score_squaredEuclidean.append(score)
    print("Silhouette Score with Euclidian for K = ", i , 'is', score)
    if top_model_score < score:
        top_model_score = score
        top_model_seed = seed

Silhouette Score with Euclidian for K =  2 is 0.4627006935332518
Silhouette Score with Euclidian for K =  3 is 0.42153588525621283
Silhouette Score with Euclidian for K =  4 is 0.3055032192852368
Silhouette Score with Euclidian for K =  5 is 0.30768537450051214
Silhouette Score with Euclidian for K =  6 is 0.31406900556715195
Silhouette Score with Euclidian for K =  7 is 0.2835289367024554
Silhouette Score with Euclidian for K =  8 is 0.30907340242625575
Silhouette Score with Euclidian for K =  9 is 0.30244830860231553
Silhouette Score with Euclidian for K =  10 is 0.2663279886094102
Silhouette Score with Euclidian for K =  11 is 0.29370952254270966
Silhouette Score with Euclidian for K =  12 is 0.287657997124122
Silhouette Score with Euclidian for K =  13 is 0.2801632620223955
Silhouette Score with Euclidian for K =  14 is 0.26880271536044986


In [36]:
kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = 9, seed=top_model_seed)
model = kmeans.fit(vector_dataframe_scaled)
# model.save('kmeans_model_big_data')
# scalerModel.save('scaler_model_kmeans_big_data')


In [37]:
df_with_KMeans_predictions = model.transform(vector_dataframe_scaled)
df_with_KMeans_predictions.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [38]:
# check the distribution of clusters
df_with_KMeans_predictions.groupBy('prediction').count().orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0| 5397|
|         1|17326|
|         2| 5274|
|         3| 6902|
|         4|21576|
|         5|14510|
|         6|26783|
|         7| 6741|
|         8| 8262|
+----------+-----+



In [39]:
cluster_sizes = model.summary.clusterSizes
print("Cluster Sizes: ",cluster_sizes)

Cluster Sizes:  [5397, 17326, 5274, 6902, 21576, 14510, 26783, 6741, 8262]


In [40]:
total_points = df_with_KMeans_predictions.count()
cluster_distribution_percentage = [size / total_points * 100 for size in cluster_sizes]
print("Cluster Distribution (Percentage):", cluster_distribution_percentage)

Cluster Distribution (Percentage): [4.78580486117885, 15.363879011447978, 4.676734266788447, 6.1203678250614075, 19.132578411116334, 12.866783126867723, 23.74990024031001, 5.977600624273971, 7.326351632955282]


In [41]:
# saving each cluster as a separate csv

# start by figuring out how many unique predictions there are
unique_predictions = predictions.select('prediction').distinct().collect()
for row in unique_predictions:
    cluster_number = row['prediction']
    cluster_df = predictions.filter(predictions['prediction'] == cluster_number)
    output_path = f'big_data_clusters/cluster_{cluster_number}_big_data.csv'

    # convert the scaled_vectorized_features into strings so they can be added to the csv
    df_out = cluster_df.withColumn(
        "scaled_vectorized_features_str",
        F.array_join(vector_to_array("scaled_vectorized_features"), ",")
    )

    df_out.select('title', 'artist', 'album', 'language_id', 'scaled_vectorized_features_str').coalesce(1).write.mode("overwrite").csv(output_path, header=True)

In [42]:
#using cosine similarity
sil_score_cosine = []
evaluator_cosine = ClusteringEvaluator(predictionCol = 'prediction',
                               featuresCol = 'scaled_vectorized_features',
                               metricName = 'silhouette', 
                               distanceMeasure = 'cosine'
                               )
for i in range(2,15):
    kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = i, seed=random.randint(1,10000))
    model = kmeans.fit(vector_dataframe_scaled)
    predictions = model.transform(vector_dataframe_scaled)
    score = evaluator_cosine.evaluate(predictions)
    sil_score_cosine.append(score)
    print("Silhouette Score with Cosine for K = ", i , 'is', score)


Silhouette Score with Cosine for K =  2 is 0.5040122312208837
Silhouette Score with Cosine for K =  3 is 0.26831738849364006
Silhouette Score with Cosine for K =  4 is 0.19787288158965943
Silhouette Score with Cosine for K =  5 is 0.26488662844321575
Silhouette Score with Cosine for K =  6 is 0.2666941424412123
Silhouette Score with Cosine for K =  7 is 0.24533923524835868
Silhouette Score with Cosine for K =  8 is 0.27925662845329363
Silhouette Score with Cosine for K =  9 is 0.25187492062247874
Silhouette Score with Cosine for K =  10 is 0.2535813624890983
Silhouette Score with Cosine for K =  11 is 0.22892614721228643
Silhouette Score with Cosine for K =  12 is 0.23973219929887646
Silhouette Score with Cosine for K =  13 is 0.23774279367383278
Silhouette Score with Cosine for K =  14 is 0.2166420990084379


In [43]:
kmeans = KMeans(featuresCol = 'scaled_vectorized_features', k = 2, seed=top_model_seed)
model = kmeans.fit(vector_dataframe_scaled)

# model.save('kmeans_model_big_data')
# scalerModel.save('scaler_model_kmeans_big_data')
df_with_KMeans_predictions_cosine = model.transform(vector_dataframe_scaled)
df_with_KMeans_predictions_cosine.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- album: string (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- vectorized_features: vector (nullable = true)
 |-- scaled_vectorized_features: vector (nullable = true)
 |-- prediction: integer (nullable = false)



In [44]:
df_with_KMeans_predictions_cosine.groupBy('prediction').count().orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0|33921|
|         1|78850|
+----------+-----+

