In [86]:
from pyspark.sql import SparkSession
my_spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/mydatabase.tweets_test") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/mydatabase.tweets_test") \
    .config("spark.io.compression.codec", "snappy").getOrCreate() #this line's config is for solving lz4 error
dataFrame=my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dataFrame.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- followers_count: string (nullable = true)
 |-- language: string (nullable = true)
 |-- loc_lat: double (nullable = true)
 |-- loc_long: double (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_timezone: string (nullable = true)



In [67]:
from pyspark.sql.functions import udf, col,split
from pyspark.ml.clustering import KMeans
#split = udf(lambda x: x.split(','))
#df.withColumn("user_location", split_udf(col("user_location"))).show()

df=dataFrame.withColumn(
    "user_location",
    split(col("user_location"), ",\s*").cast("array<float>").alias("user_location")
)
df_loc = df.select('user_location')

In [90]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
df = dataFrame.select('loc_lat','loc_long')
assembler = VectorAssembler(
    inputCols=['loc_lat','loc_long'],
    outputCol='features')
trainingData = assembler.transform(df)
#trainingData.show()

+-----------+-----------------+--------------------+
|    loc_lat|         loc_long|            features|
+-----------+-----------------+--------------------+
| 19.4326009|      -99.1333416|[19.4326009,-99.1...|
| -1.2832533|       36.8172449|[-1.2832533,36.81...|
| 13.2904027|      108.4265113|[13.2904027,108.4...|
| 33.3841541|     -111.8540448|[33.3841541,-111....|
| 61.0666922|     -107.9917071|[61.0666922,-107....|
| 39.9524152|      -75.1635755|[39.9524152,-75.1...|
|  50.000678|       -86.000977|[50.000678,-86.00...|
| 64.4989922|-165.398799443163|[64.4989922,-165....|
| 12.9791198|       77.5912997|[12.9791198,77.59...|
| 36.7014631|     -118.7559974|[36.7014631,-118....|
|-24.6423816|       25.9131082|[-24.6423816,25.9...|
|  50.285829|      -107.800598|[50.285829,-107.8...|
|-13.2687204|       33.9301963|[-13.2687204,33.9...|
| 27.7567667|      -81.4639835|[27.7567667,-81.4...|
| 40.0757384|      -74.4041622|[40.0757384,-74.4...|
| 34.0536834|     -118.2427669|[34.0536834,-11

In [94]:
kmeans = KMeans(k=3,seed=1)
model = kmeans.fit(trainingData)

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 1.65985157 46.61919159]
[  42.75016289 -118.0784741 ]
[ 35.00724347 -81.71916936]


In [95]:
cluster_ind = model.transform(trainingData)
cluster_ind.collect()

[Row(loc_lat=19.4326009, loc_long=-99.1333416, features=DenseVector([19.4326, -99.1333]), prediction=2),
 Row(loc_lat=-1.2832533, loc_long=36.8172449, features=DenseVector([-1.2833, 36.8172]), prediction=0),
 Row(loc_lat=13.2904027, loc_long=108.4265113, features=DenseVector([13.2904, 108.4265]), prediction=0),
 Row(loc_lat=33.3841541, loc_long=-111.8540448, features=DenseVector([33.3842, -111.854]), prediction=1),
 Row(loc_lat=61.0666922, loc_long=-107.9917071, features=DenseVector([61.0667, -107.9917]), prediction=1),
 Row(loc_lat=39.9524152, loc_long=-75.1635755, features=DenseVector([39.9524, -75.1636]), prediction=2),
 Row(loc_lat=50.000678, loc_long=-86.000977, features=DenseVector([50.0007, -86.001]), prediction=2),
 Row(loc_lat=64.4989922, loc_long=-165.398799443163, features=DenseVector([64.499, -165.3988]), prediction=1),
 Row(loc_lat=12.9791198, loc_long=77.5912997, features=DenseVector([12.9791, 77.5913]), prediction=0),
 Row(loc_lat=36.7014631, loc_long=-118.7559974, featu

In [100]:
import numpy as np
np.array(cluster_ind.select('features'))

array(DataFrame[features: vector], dtype=object)

In [97]:
from gmplot import gmplot

# Place map
gmap = gmplot.GoogleMapPlotter(37.766956, -122.438481, 13)

# Polygon
golden_gate_park_lats, golden_gate_park_lons = zip(*[
    (37.771269, -122.511015),
    (37.773495, -122.464830),
    (37.774797, -122.454538),
    (37.771988, -122.454018),
    (37.773646, -122.440979),
    (37.772742, -122.440797),
    (37.771096, -122.453889),
    (37.768669, -122.453518),
    (37.766227, -122.460213),
    (37.764028, -122.510347),
    (37.771269, -122.511015)
    ])
gmap.plot(golden_gate_park_lats, golden_gate_park_lons, 'cornflowerblue', edge_width=10)

# Scatter points
top_attraction_lats, top_attraction_lons = zip(*[
    (37.769901, -122.498331),
    (37.768645, -122.475328),
    (37.771478, -122.468677),
    (37.769867, -122.466102),
    (37.767187, -122.467496),
    (37.770104, -122.470436)
    ])
gmap.scatter(top_attraction_lats, top_attraction_lons, '#3B0B39', size=40, marker=False)

# Marker
hidden_gem_lat, hidden_gem_lon = 37.770776, -122.461689
gmap.marker(hidden_gem_lat, hidden_gem_lon, 'cornflowerblue')

# Draw
gmap.draw("my_map.html")