In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeansModel
from pyspark.ml.feature import VectorAssembler, StandardScalerModel
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session for this notebook (an already-running one is reused
# by getOrCreate); the application is registered as "kmeans_inference".
spark = (
    SparkSession.builder
    .appName("kmeans_inference")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 17:16:12 WARN Utils: Your hostname, Bens-MacBook-Air-7.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.76 instead (on interface en0)
25/12/02 17:16:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 17:16:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/02 17:16:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Restore the persisted artifacts from paths relative to the working directory:
# the fitted KMeans model and the fitted standard scaler.
kmeans_model = KMeansModel.load(path="kmeans_model")
scalerModel = StandardScalerModel.load(path="scaler_model_kmeans")

In [4]:
# Feature columns, in the order they are later packed into the feature vector.
input_columns = [
    "af_danceability", "af_energy", "af_loudness",
    "af_speechiness", "af_acousticness", "af_instrumentalness",
    "af_valence", "af_tempo", "language_id",
]

# One hand-written sample row; values line up positionally with input_columns.
# Alternate sample row that can be swapped in for a second manual check:
# example_data = [0.723,0.777,-3.503,0.108,0.0426,3.68E-6,0.961,91.017,0.0]
example_data = [
    0.666,    # af_danceability
    0.83,     # af_energy
    -5.715,   # af_loudness
    0.0751,   # af_speechiness
    0.0123,   # af_acousticness
    0.0,      # af_instrumentalness
    0.702,    # af_valence
    113.03,   # af_tempo
    0.0,      # language_id
]

# Wrap the sample in a single-row DataFrame whose columns are named by input_columns.
example_df = spark.createDataFrame(data=[example_data], schema=input_columns)
example_df.show()

# Pack the individual feature columns of example_df into one vector column
# named "vectorized_features".
vector_assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol="vectorized_features",
)
vector_dataframe = vector_assembler.transform(example_df)

# Keep only the assembled vector column; the raw feature columns are dropped
# before the data is handed to the scaler.
transformed_vectors = vector_dataframe.select("vectorized_features")
transformed_vectors.show(n=5)

# Apply the scaler restored from 'scaler_model_kmeans' (per the original note,
# the same scaler used in training) to the assembled vectors. The result keeps
# the input column and adds the scaled vector column next to it.
input_scaled = scalerModel.transform(transformed_vectors)
input_scaled.show(n=1)

+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+
|af_danceability|af_energy|af_loudness|af_speechiness|af_acousticness|af_instrumentalness|af_valence|af_tempo|language_id|
+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+
|          0.666|     0.83|     -5.715|        0.0751|         0.0123|                0.0|     0.702|  113.03|        0.0|
+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+-----------+

+--------------------+
| vectorized_features|
+--------------------+
|[0.666,0.83,-5.71...|
+--------------------+

+--------------------+--------------------------+
| vectorized_features|scaled_vectorized_features|
+--------------------+--------------------------+
|[0.666,0.83,-5.71...|      [7.30590220636553...|
+--------------------+--------------------------+



In [5]:
# Run the loaded KMeans model on the scaled sample; the transform adds a
# "prediction" column to the DataFrame.
preds = kmeans_model.transform(input_scaled)

# Display only the prediction column, without truncating cell contents.
(
    preds
    .select("prediction")
    .show(truncate=False)
)

+----------+
|prediction|
+----------+
|1         |
+----------+

