In [79]:
pip uninstall pyspark_hnsw -y

Note: you may need to restart the kernel to use updated packages.




In [80]:
# pip install git+https://github.com/Jelmerro/hnswlib-spark.git

In [81]:
from pyspark_hnsw.knn import HnswSimilarity, BruteForceSimilarity
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.sql import functions as F
from pyspark.ml import Pipeline

spark = SparkSession.builder \
    .config("spark.jars.packages", "com.github.jelmerk:hnswlib-spark_3_5_2.12:2.0.0-beta.2") \
    .config("spark.ui.showConsoleProgress", "false") \
    .getOrCreate()


In [82]:
df = spark.read.csv('numpy_array_for_modeling.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- rank: double (nullable = true)
 |-- trend: double (nullable = true)
 |-- streams: double (nullable = true)
 |-- popularity: double (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_key: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_mode: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_liveness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- af_time_signature: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- date: double (nullable = true)



In [83]:
items = df

In [84]:
df.printSchema()
df.show(5)

root
 |-- rank: double (nullable = true)
 |-- trend: double (nullable = true)
 |-- streams: double (nullable = true)
 |-- popularity: double (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_key: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_mode: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_liveness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- af_time_signature: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- date: double (nullable = true)

+----+-----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+------------

In [85]:
# creating an ID column because the algorithm needs it
df = df.withColumn('id', F.monotonically_increasing_id())

In [86]:
feature_columns = [
    "rank", "trend", "streams", "popularity", "duration_ms",
    "af_danceability", "af_energy", "af_key", "af_loudness",
    "af_mode", "af_speechiness", "af_acousticness",
    "af_instrumentalness", "af_liveness", "af_valence",
    "af_tempo", "af_time_signature", "language_id", "date"]

In [87]:
assembler = VectorAssembler(
    inputCols = feature_columns,
    outputCol = 'featuresAsVectors'
)
df = assembler.transform(df)

In [88]:
items = df 

In [89]:
df.createOrReplaceTempView('items')
items = spark.table('items')

In [90]:
normalizer = Normalizer(inputCol = 'features',
                        outputCol = 'normalizedFeatures')

In [91]:
hnsw = HnswSimilarity(
    identifierCol='id',
    featuresCol='normalizedFeatures',
    numPartitions=1,
    numThreads=2,
    k=4,
    distanceFunction='cosine',
    predictionCol='approximate' # the approximate predictions will be stored in this column
)

bruteForce = BruteForceSimilarity(
    identifierCol= 'id',
    featuresCol="normalizedFeatures",
    numPartitions=1,
    numThreads=2,
    k=4,
    distanceFunction="inner-product",
    predictionCol='exact' # the exact predictions will be stored in this column
)

In [92]:
# # ------------------------------
# # 8. Build the complete pipeline
# # ------------------------------
# from pyspark.ml import Pipeline

# pipeline = Pipeline(stages=[
#     normalizer,
#     hnsw,
#     bruteForce
# ])


# # ------------------------------
# # 9. Fit the HNSW + exact models
# # ------------------------------
# items = df

# model = pipeline.fit(items)


# # ------------------------------
# # 10. Sample queries (exact kNN is expensive)
# # ------------------------------
# queries = items.sample(0.02)   # take 2% of rows


# # ------------------------------
# # 11. Run approximate + exact search
# # ------------------------------
# output = model.transform(queries)


# # ------------------------------
# # 12. Evaluate accuracy
# # ------------------------------
# from pyspark_hnsw.knn import KnnSimilarityEvaluator

# evaluator = KnnSimilarityEvaluator(
#     approximateNeighborsCol="approximate",
#     exactNeighborsCol="exact"
# )

# accuracy = evaluator.evaluate(output)

# print("Approximate kNN Accuracy:", accuracy)

In [93]:
# ------------------------------
# 1. Load CSV
# ------------------------------
df = spark.read.csv(
    'numpy_array_for_modeling.csv',
    header=True,
    inferSchema=True
)

df = df.na.drop()   # optional: drop rows with nulls

df.printSchema()


# ------------------------------
# 2. Add a unique ID column
# ------------------------------
from pyspark.sql import functions as F

df = df.withColumn("id", F.monotonically_increasing_id())


# ------------------------------
# 3. Assemble all feature columns into ONE vector
# ------------------------------
from pyspark.ml.feature import VectorAssembler

feature_cols = [
    "rank", "trend", "streams", "popularity", "duration_ms",
    "af_danceability", "af_energy", "af_key", "af_loudness",
    "af_mode", "af_speechiness", "af_acousticness",
    "af_instrumentalness", "af_liveness", "af_valence",
    "af_tempo", "af_time_signature", "language_id", "date"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="featuresAsMlLibVector"
)

df = assembler.transform(df)


# ------------------------------
# 4. Convert MLlib vector â†’ array<float>
# ------------------------------
from pyspark_hnsw.conversion import VectorConverter

converter = VectorConverter(
    inputCol="featuresAsMlLibVector",
    outputCol="features",
    outputType="array<float>"
)


# ------------------------------
# 5. Normalize the feature vector
# ------------------------------
from pyspark_hnsw.knn import Normalizer

normalizer = Normalizer(
    inputCol="features",
    outputCol="normalizedFeatures",
    p=2  # L2 normalization (default)
)


# ------------------------------
# 6. HNSW Approximate kNN
# ------------------------------
from pyspark_hnsw.knn import HnswSimilarity

hnsw = HnswSimilarity(
    identifierCol="id",
    featuresCol="normalizedFeatures",
    numPartitions=1,
    numThreads=4,
    k=10,
    distanceFunction="inner-product",
    predictionCol="approximate",
    m=48,
    efConstruction=200
)


# ------------------------------
# 7. Exact Brute-Force kNN
# ------------------------------
from pyspark_hnsw.knn import BruteForceSimilarity

bruteForce = BruteForceSimilarity(
    identifierCol="id",
    featuresCol="normalizedFeatures",
    numPartitions=1,
    numThreads=4,
    k=10,
    distanceFunction="inner-product",
    predictionCol="exact"
)


# ------------------------------
# 8. Build the complete pipeline
# ------------------------------
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    converter,
    normalizer,
    hnsw,
    bruteForce
])


# ------------------------------
# 9. Fit the HNSW + exact models
# ------------------------------
items = df

model = pipeline.fit(items)


# ------------------------------
# 10. Sample queries (exact kNN is expensive)
# ------------------------------
queries = items.sample(0.02)   # take 2% of rows


# ------------------------------
# 11. Run approximate + exact search
# ------------------------------
output = model.transform(queries)


# ------------------------------
# 12. Evaluate accuracy
# ------------------------------
from pyspark_hnsw.knn import KnnSimilarityEvaluator

evaluator = KnnSimilarityEvaluator(
    approximateNeighborsCol="approximate",
    exactNeighborsCol="exact"
)

accuracy = evaluator.evaluate(output)

print("Approximate kNN Accuracy:", accuracy)


root
 |-- rank: double (nullable = true)
 |-- trend: double (nullable = true)
 |-- streams: double (nullable = true)
 |-- popularity: double (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- af_danceability: double (nullable = true)
 |-- af_energy: double (nullable = true)
 |-- af_key: double (nullable = true)
 |-- af_loudness: double (nullable = true)
 |-- af_mode: double (nullable = true)
 |-- af_speechiness: double (nullable = true)
 |-- af_acousticness: double (nullable = true)
 |-- af_instrumentalness: double (nullable = true)
 |-- af_liveness: double (nullable = true)
 |-- af_valence: double (nullable = true)
 |-- af_tempo: double (nullable = true)
 |-- af_time_signature: double (nullable = true)
 |-- language_id: double (nullable = true)
 |-- date: double (nullable = true)



ImportError: cannot import name 'Normalizer' from 'pyspark_hnsw.knn' (C:\Users\st114\anaconda3\Lib\site-packages\pyspark_hnsw\knn.py)