In [101]:
# Load the Properati listings CSV into a Spark DataFrame via the pyspark_csv helper module.
# NOTE(review): `sc` and `sqlCtx` are not defined in this cell; presumably they are
# provided by the PySpark kernel/session — confirm before running on a fresh kernel.
import pyspark_csv as pycsv
# Register the helper file with the SparkContext so tasks on the executors can import it too.
sc.addPyFile('pyspark_csv.py')
# Read the CSV as an RDD of raw text lines (path is relative to the working directory).
plaintext_rdd = sc.textFile('datos/properati_final.csv')
# Turn the raw lines into a DataFrame; header/type handling is delegated to pyspark_csv.
dataframe = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd)
# Keep an RDD view of the same DataFrame for RDD-style processing.
data = dataframe.rdd

In [2]:
# Widen the notebook's main container to 97% of the browser window so wide
# tables are easier to read. `display` and `HTML` are imported from the public
# `IPython.display` module; importing `display` from `IPython.core.display`
# is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

In [102]:
from pyspark.ml.feature import VectorAssembler

# Pack the price and total-surface columns into a single vector column,
# "price_sup", which is the input expected by the LSH model further down.
assembler = VectorAssembler(inputCols=["price","surface_total_in_m2"], outputCol="price_sup")
# `dataframe` is overwritten with the transformed frame, so on a second run of
# this cell the output column would already exist and `transform` would fail.
# Dropping it first keeps the cell re-runnable; `drop` is a no-op when the
# column is not present (i.e. on the first run).
dataframe = assembler.transform(dataframe.drop("price_sup"))

In [103]:
# Preview the assembled (price, surface) vectors; `show()` prints the first 20 rows.
price_sup_preview = dataframe.select('price_sup')
price_sup_preview.show()

+----------------+
|       price_sup|
+----------------+
|  [71000.0,29.0]|
|  [140000.0,0.0]|
|  [165000.0,0.0]|
|  [148000.0,0.0]|
|[132000.0,112.0]|
| [190000.0,96.0]|
|  [250000.0,0.0]|
| [119000.0,95.0]|
|  [98000.0,43.0]|
| [110000.0,53.0]|
|[190000.0,103.0]|
|  [92000.0,45.0]|
|  [159000.0,0.0]|
|   [69500.0,0.0]|
| [110000.0,54.0]|
|  [71000.0,30.0]|
|  [96000.0,47.0]|
|  [86000.0,47.0]|
|  [95000.0,50.0]|
|  [87000.0,68.0]|
+----------------+
only showing top 20 rows



In [104]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# Query point, laid out like the "price_sup" vectors: [price, surface_total_in_m2].
key = Vectors.dense([70150.0,50.0])
# How many neighbours to retrieve; defined once so the log message and the query agree.
num_neighbors = 20

# Random-projection LSH over the "price_sup" vectors; the hash values are
# written to the "bucket" column.
# NOTE(review): the two features are on very different scales (price in the
# tens of thousands, surface in the tens), so the distance is dominated by
# price — consider scaling the features first if surface should matter.
brp = BucketedRandomProjectionLSH(inputCol="price_sup", outputCol="bucket", bucketLength=500.0,
                                  numHashTables=2)
model = brp.fit(dataframe)

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in an already-transformed dataset, e.g.
# `model.approxNearestNeighbors(model.transform(dataframe), key, num_neighbors)`
# A single formatted string is printed (instead of `print(a, b)`) so the message
# renders as text rather than as a tuple repr when run under Python 2.
print("Approximately searching dataframe for {} nearest neighbors of {}".format(num_neighbors, key))
# The distance from each returned row to `key` is stored in the 'distancia' column.
neighbors = model.approxNearestNeighbors(dataframe, key, num_neighbors, 'distancia')
neighbors.select('place_name','price_sup','bucket','distancia').show()


('Approximately searching dfA for 20 nearest neighbors of', DenseVector([70150.0, 50.0]))
+----------------+--------------+-----------------+------------------+
|      place_name|     price_sup|           bucket|         distancia|
+----------------+--------------+-----------------+------------------+
|         Almagro|[70140.0,33.0]|[[83.0], [139.0]]| 19.72308292331602|
| Lomas de Zamora|[70200.0,32.0]|[[83.0], [139.0]]| 53.14132102234569|
|        Saavedra|[70205.0,37.0]|[[83.0], [139.0]]|56.515484603779164|
|          Bernal|[70210.0,32.0]|[[83.0], [139.0]]|  62.6418390534633|
|          Bernal|[70210.0,32.0]|[[83.0], [139.0]]|  62.6418390534633|
|       San Telmo|[70066.0,37.0]|[[83.0], [139.0]]|              85.0|
|     Villa Bosch|[70000.0,50.0]|[[83.0], [139.0]]|             150.0|
|      La Matanza|[70000.0,50.0]|[[83.0], [139.0]]|             150.0|
|      San Miguel|[70000.0,50.0]|[[83.0], [139.0]]|             150.0|
|        AdroguÃ©|[70000.0,50.0]|[[83.0], [139.0]]|       