In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) the Spark session for this notebook. The MongoDB input and
# output URIs and the connector package are supplied as session config.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t7")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [2]:
# Load through the MongoDB connector (no explicit source is given here, so it
# presumably reads the collection named by spark.mongodb.input.uri), cap the
# frame at 100000 rows, and preview it.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Find the average number of emojis used per sentence.

In [3]:
# Keep only the 'emoji' column; the per-row counting below reads nothing else.
emojis = df.select('emoji')
def cal_count(line):
    """Return the number of emoji entries recorded for one row.

    Args:
        line: A row-like object exposing an ``emoji`` attribute. Judging by
            the sample output, the value is a string of space-separated
            entries such as ``":red_heart:,18"`` -- TODO confirm the format.

    Returns:
        int: The number of non-empty space-separated entries, or ``0`` when
        the field is ``None`` or an empty string.
    """
    emoji_field = line.emoji
    if not emoji_field:
        # None or "" means no emoji were recorded; "".split(" ") would
        # otherwise yield [''] and be miscounted as one entry.
        return 0
    # Leading, trailing, or repeated spaces make split(" ") emit empty
    # tokens; skip them so only real entries are counted.
    return sum(1 for token in emoji_field.split(" ") if token)

# Per-row entry counts, each paired with a 1 so equal counts can be tallied.
emojis_count = emojis.rdd.map(cal_count)
emojis_mapped = emojis_count.map(lambda n: (n, 1))
# Sum the 1s per distinct count, then order the pairs by that count.
total_count = (
    emojis_mapped
    .reduceByKey(add)
    .sortByKey()
)
total_count.take(10)

[(1, 82117),
 (2, 12203),
 (3, 3395),
 (4, 1265),
 (5, 427),
 (6, 237),
 (7, 110),
 (8, 66),
 (9, 53),
 (10, 30)]

In [4]:
# Turn the (count, frequency) pairs into a DataFrame, rename the default
# _1/_2 columns to num/counts, and preview it.
result = (
    total_count
    .toDF()
    .selectExpr("_1 as num", "_2 as counts")
)
result.show()

+---+------+
|num|counts|
+---+------+
|  1| 82117|
|  2| 12203|
|  3|  3395|
|  4|  1265|
|  5|   427|
|  6|   237|
|  7|   110|
|  8|    66|
|  9|    53|
| 10|    30|
| 11|    15|
| 12|    17|
| 13|    14|
| 14|     8|
| 15|     6|
| 16|     4|
| 17|     7|
| 18|    14|
| 19|     3|
| 20|     1|
+---+------+
only showing top 20 rows



In [5]:
# Persist the table through the MongoDB connector. The save mode is "append",
# so rows are added to whatever the output target already holds -- note that
# re-running this cell will therefore write the same rows again.
(
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)