In [34]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeansModel
from pyspark.ml.feature import VectorAssembler, StandardScalerModel
from pyspark.sql import functions as F
from pyspark.ml.functions import array_to_vector

In [35]:
# Obtain the Spark session for this notebook: an existing session is reused if
# one is already running, otherwise a new one named "kmeans_inference" is started.
session_builder = SparkSession.builder.appName("kmeans_inference")
spark = session_builder.getOrCreate()

In [36]:
# Load the two previously saved models from their on-disk directories:
# the feature scaler first, then the KMeans clustering model.
scalerModel = StandardScalerModel.load('scaler_model_kmeans_big_data')
kmeans_model = KMeansModel.load('kmeans_model_big_data')

In [37]:
# Column names of the query row: eight `af_*` feature columns followed by
# `language_id`. The order here is the order of the assembled vector.
input_columns = [
    f"af_{feature}"
    for feature in (
        "danceability",
        "energy",
        "loudness",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "valence",
        "tempo",
    )
] + ["language_id"]

# The query row, one value per entry of `input_columns` (same order).
# Alternative rows kept for reference; assign one to `example_data` to query it:
#   [0.723, 0.777, -3.503, 0.108, 0.0426, 3.68E-6, 0.961, 91.017, 0.0]
#   [0.666, 0.83, -5.715, 0.0751, 0.0123, 0.0, 0.702, 113.03, 0.0]
#   [0.627, 0.416, -14.174, 0.908, 0.289, 0.0, 0.542, 94.791, 4.0]
example_data = [0.561, 0.798, -7.064, 0.32, 0.307, 0.0, 0.14, 140.837, 9.0]

# Wrap the single row in a one-row Spark DataFrame and display it.
example_df = spark.createDataFrame([example_data], schema=input_columns)
example_df.show()

# Pack the individual feature columns into one vector column.
vector_assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol='vectorized_features',
)
vector_dataframe = vector_assembler.transform(example_df)

# Keep only the assembled vector column; that is what the scaler is fed below.
transformed_vectors = vector_dataframe.select('vectorized_features')
transformed_vectors.show(n=5)

# Scale the assembled vector with the loaded scaler model (the same one that
# was used at training time), so the query is in the feature space the
# clustering model expects.
input_scaled = scalerModel.transform(transformed_vectors)
input_scaled.show(n=1)

+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+
|af_danceability|af_energy|af_loudness|af_speechiness|af_acousticness|af_instrumentalness|af_valence|af_tempo|language_id|
+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+
|          0.561|    0.798|     -7.064|          0.32|          0.307|                0.0|      0.14| 140.837|        9.0|
+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+

+--------------------+
| vectorized_features|
+--------------------+
|[0.561,0.798,-7.0...|
+--------------------+

+--------------------+--------------------------+
| vectorized_features|scaled_vectorized_features|
+--------------------+--------------------------+
|[0.561,0.798,-7.0...|      [3.64958367206017...|
+--------------------+--------------------------+



In [43]:
# Run the loaded KMeans model on the scaled query row, then display only the
# resulting `prediction` column (shown untruncated).
preds = kmeans_model.transform(input_scaled)
prediction_only = preds.select("prediction")
prediction_only.show(truncate=False)

+----------+
|prediction|
+----------+
|7         |
+----------+



In [39]:
# With the cluster known, the query song can be compared (by cosine similarity)
# against the other songs of that cluster. First step: fetch that cluster.

# `preds` holds the single query row; its `prediction` value is the cluster number.
predicted_row = preds.select("prediction").collect()[0]
cluster_number = predicted_row[0]
print(f"Cluster number: {cluster_number}")

# Read the CSV belonging to that cluster number (first line is the header,
# column types are inferred from the data).
cluster_csv_path = f'big_data_clusters/cluster_{cluster_number}_big_data.csv'
cluster_df = spark.read.csv(cluster_csv_path, header=True, inferSchema=True)

# The CSV carries each feature vector as one comma-separated string; split it
# on "," and cast the pieces to doubles to get a numeric array column.
parsed_features = F.split("scaled_vectorized_features_str", ",").cast("array<double>")
df = cluster_df.withColumn("scaled_vectorized_features", parsed_features)
df.show(5)


Cluster number: 7
+------------------+--------------------+--------------------+-----------+------------------------------+--------------------------+
|             title|              artist|               album|language_id|scaled_vectorized_features_str|scaled_vectorized_features|
+------------------+--------------------+--------------------+-----------+------------------------------+--------------------------+
|         #BabyBaby|             MONSTAR|Những Bản Hits Do...|       18.0|          4.670946660497695...|      [4.67094666049769...|
|#BrooklynBloodPop!|                SyKo|  #BrooklynBloodPop!|        9.0|          4.495298248473408...|      [4.49529824847340...|
|   #COMOTODOSFAZEM|                 NTS|     #COMOTODOSFAZEM|       21.0|          3.831737580826103...|      [3.83173758082610...|
|           #DDDing|The Darkraver, Th...|             #DDDing|       25.0|          3.909803541725786...|      [3.90980354172578...|
|          #Lávkóma|           Kis Grófo|          

In [44]:
# Rank the songs of the predicted cluster by cosine similarity to the query
# song and show the 20 closest ones that share the query's language.
# (`np` is imported in the notebook's top import cell.)

# Bring the cluster to the driver as pandas; each row's feature array becomes
# one row of the 2-D matrix X.
pdf = df.toPandas()
X = np.vstack(pdf.scaled_vectorized_features.values)

# Scaled feature vector of the query song, taken from the first row of `preds`.
v = np.array(preds.select("scaled_vectorized_features").collect()[0][0])

# cos(x, v) = (x . v) / (|x| * |v|), computed for every row x of X at once.
# A zero-length vector has no direction, so a plain division would produce
# NaN/inf plus a RuntimeWarning; such rows get a similarity of 0.0 instead.
norm_products = np.linalg.norm(X, axis=1) * np.linalg.norm(v)
cos = np.divide(
    X @ v,
    norm_products,
    out=np.zeros(X.shape[0], dtype=float),
    where=norm_products > 0,
)

pdf["cosine_similarity"] = cos

# Only recommend songs in the query song's own language. The id is read from
# the query row rather than hard-coded, so switching `example_data` to another
# song keeps this filter in sync with it.
query_language_id = example_data[input_columns.index("language_id")]
lang_filtered_pdf = pdf[pdf["language_id"] == query_language_id]
lang_filtered_pdf.sort_values("cosine_similarity", ascending=False).head(20)

Unnamed: 0,title,artist,album,language_id,scaled_vectorized_features_str,scaled_vectorized_features,cosine_similarity
2492,Oh My Buddha,Quebonafide,Egzotyka,9.0,"4.3586828168989635,4.519417459868718,-1.309135...","[4.3586828168989635, 4.519417459868718, -1.309...",0.955679
1052,Elektryczny,"Smolik, Brodka, Dawid Podsiadło",Elektryczny,9.0,"4.196045398357958,3.9303480900995043,-1.215941...","[4.196045398357958, 3.9303480900995043, -1.215...",0.947273
370,BDSM,Quebonafide,Dla fanów eklektyki,9.0,"4.339166326674043,3.375077782530164,-1.8445148...","[4.339166326674043, 3.375077782530164, -1.8445...",0.94603
3762,W Drodze Po Szczęście,O.S.T.R.,W drodze po szczęście,9.0,"4.378199307123885,4.529074334782966,-1.5665931...","[4.378199307123885, 4.529074334782966, -1.5665...",0.941764
948,Dla sióstr,"SB Maffija, Bedoes, Białas, White 2115",Hotel Maffija,9.0,"4.638419176789494,4.002774651956375,-1.5549785...","[4.638419176789494, 4.002774651956375, -1.5549...",0.937809
393,Bajki robotów,"Kartky, Przyłu",Outside Of Society,9.0,"4.313144339707482,3.713068404528893,-2.0732125...","[4.313144339707482, 3.713068404528893, -2.0732...",0.936723
3783,Wyb?,"Parker Polhill, Abby Rhodes",Wyb?,9.0,"5.145847922637434,3.3605924701587897,-1.271526...","[5.145847922637434, 3.3605924701587897, -1.271...",0.935681
1710,Każdy Nowy Dzień,Young Igi,Każdy Nowy Dzień,9.0,"4.6839576539809755,3.7999802787571375,-1.39071...","[4.6839576539809755, 3.7999802787571375, -1.39...",0.935083
1334,Gorzka woda (prod. Auer) - Remix,"Pezet, Paluch, KęKę, Sokół, Ten Typ Mes",Muzyka współczesna,9.0,"4.3326608299324025,4.147627775670116,-1.341213...","[4.3326608299324025, 4.147627775670116, -1.341...",0.932655
1,#BrooklynBloodPop!,SyKo,#BrooklynBloodPop!,9.0,"4.495298248473408,3.9303480900995043,-1.047529...","[4.495298248473408, 3.9303480900995043, -1.047...",0.931597
