# Jupyter hnswlib example

This notebook demonstrates how to use hnswlib with PySpark in a Jupyter notebook.

## Download data 

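The first line of the fastText `.vec` file is a header (word count and vector dimension) rather than a data row, so it is dropped to make the file readable as CSV input.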

In [None]:
!curl "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz" | gunzip | tail -n +2 | gzip > data.gz 

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark_hnsw.conversion import VectorConverter
from pyspark_hnsw.knn import *
from pyspark_hnsw.linalg import Normalizer

Read the data into a Spark DataFrame.

In [3]:
# Load the space-delimited file and let Spark infer the column types.
# The quote character is set to NUL, presumably so that quote characters
# occurring inside words are read literally rather than as CSV quoting.
# The first column (_c0) is renamed to 'id'.
words_df = (
    spark.read
    .option('delimiter', ' ')
    .option('inferSchema', 'true')
    .option("quote", "\u0000")
    .csv('data.gz')
    .withColumnRenamed('_c0', 'id')
)

Inspect the schema

In [4]:
# Print the inferred column names and types of words_df as a tree.
words_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: double (nullable = true)
 |-- _c14: double (nullable = true)
 |-- _c15: double (nullable = true)
 |-- _c16: double (nullable = true)
 |-- _c17: double (nullable = true)
 |-- _c18: double (nullable = true)
 |-- _c19: double (nullable = true)
 |-- _c20: double (nullable = true)
 |-- _c21: double (nullable = true)
 |-- _c22: double (nullable = true)
 |-- _c23: double (nullable = true)
 |-- _c24: double (nullable = true)
 |-- _c25: double (nullable = true)
 |-- _c26: double (nullable = true)
 |-- _c27: double (nullable = true

## Fit the model

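Once all vectors are normalized to unit norm, the inner product of two vectors equals their cosine similarity, so the index is built with the `inner-product` distance function on the normalized vectors. This is faster than calculating the cosine distance directly.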

In [5]:
# Stage 1: combine every column except the first ('id') into a single vector column.
vector_assembler = VectorAssembler(inputCols=words_df.columns[1:], outputCol='features_as_vector')

# Stage 2: convert the assembled vector column into the 'features' column.
# VectorConverter works on a single column, so the parameter is `inputCol`
# (singular), matching Normalizer below; `inputCols` is not accepted.
converter = VectorConverter(inputCol='features_as_vector', outputCol='features')

# Stage 3: normalize the vectors so the inner product can stand in for cosine similarity.
normalizer = Normalizer(inputCol='features', outputCol='normalized_features')

# Stage 4: the HNSW index itself. Rows are identified by 'id', the index is built on
# the normalized vectors, and k=10 neighbours are requested per query row.
# excludeSelf=True is intended to keep a word from being returned as its own neighbour.
hnsw = HnswSimilarity(identifierCol='id', queryIdentifierCol='id', featuresCol='normalized_features',
                      distanceFunction='inner-product', m=48, ef=5, k=10, efConstruction=200, numPartitions=2,
                      excludeSelf=True, predictionCol='approximate', outputFormat='minimal')

pipeline = Pipeline(stages=[vector_assembler, converter, normalizer, hnsw])

# Fitting runs all stages on the full data set and builds the index.
model = pipeline.fit(words_df)


## Transform the data

Show the most similar words for a 1% sample

In [6]:
# Draw a ~1% sample without replacement. The seed is fixed so that re-running
# the notebook on the same input selects the same rows instead of a new random subset.
words_df_sample = words_df.sample(fraction=0.01, seed=42)

# Show up to 100 result rows without truncating long cell values.
model.transform(words_df_sample).show(100, truncate=False)

+--------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Save the model

In [7]:
# Plain model.save() raises if the target path already exists, which breaks a
# second "Restart & Run All"; overwrite() replaces any previously saved model.
model.write().overwrite().save("/tmp/model")