In [2]:
from pyspark.sql.functions import udf, col,split
from pyspark.ml.clustering import KMeans
import json
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [3]:
from pyspark.sql import SparkSession
my_spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/mydatabase.tweets_test") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/mydatabase.tweets_test") \
    .config("spark.io.compression.codec", "snappy").getOrCreate() #this line's config is for solving lz4 error
dataFrame=my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dataFrame.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- loc_lat: double (nullable = true)
 |-- loc_long: double (nullable = true)
 |-- text: string (nullable = true)



In [4]:
#split = udf(lambda x: x.split(','))
#df.withColumn("user_location", split_udf(col("user_location"))).show()
#df=dataFrame.withColumn("user_location",
   # split(col("user_location"), ",\s*").cast("array<float>").alias("user_location")
#)
#df_loc = df.select('user_location')

In [5]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=['loc_lat','loc_long'],
    outputCol='features')
trainingData = assembler.transform(dataFrame)
#trainingData.show()

In [6]:
kmeans = KMeans(k=3,seed=1)
model = kmeans.fit(trainingData)
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[54.7023545 -3.2765753]
[43.4773251 65.9795093]
[ 34.86553795 -85.3075054 ]


In [7]:
cluster_ind = model.transform(trainingData)
dataToKibana = cluster_ind.toPandas().to_dict('record')
for item in dataToKibana:
    item['created_at'] =item['created_at'][:10]+'T'+item['created_at'][11:]+'Z'
    
for item in dataToKibana:
    item['loc_lat'] =round(item['loc_lat'],2)
    item['loc_long'] =round(item['loc_long'],2)

In [8]:
dataToKibana

[{'_id': Row(oid='5c9b0b44566b181e0f151227'),
  'created_at': '2019-03-27T05:33:33Z',
  'features': DenseVector([57.255, 59.4294]),
  'loc_lat': 57.25,
  'loc_long': 59.43,
  'prediction': 1,
  'text': 'my whole life i’ve been scared to fully be me bc i always felt i had to be humble or that i had too big of a person… https://t.co/V2na5uK00V'},
 {'_id': Row(oid='5c9b0b44566b181e0f151228'),
  'created_at': '2019-03-27T05:33:33Z',
  'features': DenseVector([40.1353, -79.8955]),
  'loc_lat': 40.14,
  'loc_long': -79.9,
  'prediction': 2,
  'text': '@TheBat2019 @Tully_of_rivia I literally had a vcr in a closet for years and years and years until I finally threw it out maybe 5 years ago'},
 {'_id': Row(oid='5c9b0b44566b181e0f151229'),
  'created_at': '2019-03-27T05:33:33Z',
  'features': DenseVector([29.5958, -90.7195]),
  'loc_lat': 29.6,
  'loc_long': -90.72,
  'prediction': 2,
  'text': 'RT @kenliedavis: nice teeth are (and i cannot stress this enough) EVERYTHING'},
 {'_id': Row(oid='5c9

In [14]:
es = Elasticsearch(hosts='http://localhost',port=9200)
actions = []
mappings = {
    "mappings":{
        "tweet": {
            "properties": {
                "text": { "type": "text"  },
                "timestamp": { "type": "date" },
                "location": {"type": "geo_point"},
                "prediction": {"type": "integer"}
                }
            }
    }
}
es.indices.create(index="loc10", body=mappings)
for msg in dataToKibana:
    print(msg["text"])
    print("-------------")
    action = {
            "index": "loc6",
            "type": "tweet",
            "source": {
                'text' : msg["text"],
                'timestamp': msg["created_at"],
                'location': {"lat": msg["loc_long"],"lon": msg["loc_lat"]},
                'prediction': msg["prediction"]
                }
            }
    actions.append(json.dumps(action))
helpers.bulk(es, actions, index='loc10', doc_type='tweet')
    

my whole life i’ve been scared to fully be me bc i always felt i had to be humble or that i had too big of a person… https://t.co/V2na5uK00V
-------------
@TheBat2019 @Tully_of_rivia I literally had a vcr in a closet for years and years and years until I finally threw it out maybe 5 years ago
-------------
RT @kenliedavis: nice teeth are (and i cannot stress this enough) EVERYTHING
-------------
RT @hyunjinphotos: 190109

HE CUT HIS HAIR AND IT LOOKS SO GOOD

©️in the flames https://t.co/j3inVXidM0
-------------
In #Doha today with @SafeToNet and our @ooredoo friends for the #OoredooInnovate roadshow. Here to keep @qatar chil… https://t.co/ut4mgyJVQB
-------------
my whole life i’ve been scared to fully be me bc i always felt i had to be humble or that i had too big of a person… https://t.co/V2na5uK00V
-------------
@TheBat2019 @Tully_of_rivia I literally had a vcr in a closet for years and years and years until I finally threw it out maybe 5 years ago
-------------
RT @kenliedavis: ni

(10, [])