In [4]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

In [5]:
# Build (or reuse) the session. The MongoDB connector package is pulled in via
# spark.jars.packages; reads come from bigdata.raw and writes go to bigdata.t1.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t1")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [6]:
# Read from the configured MongoDB input collection and cap the working set
# at 100000 rows.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Count how many times each emoji appears.

In [7]:
# Only the emoji column is needed for the frequency count.
emojis = df.select("emoji")
def split_str(line):
    """Reduce one row's emoji field to a space-separated string of emoji names.

    The field is read as whitespace-separated tokens. For each token, everything
    from the first comma onward is discarded, so ``":red_heart:,18"`` becomes
    ``":red_heart:"``.

    Args:
        line: A row-like object exposing an ``emoji`` attribute (a string, or
            ``None`` when the field is missing).

    Returns:
        The emoji names joined by single spaces, or ``""`` when the field is
        missing, blank, or contains no names.
    """
    # A missing field would otherwise raise AttributeError inside the Spark task.
    raw = line.emoji or ""
    # split() with no argument skips runs of whitespace, so doubled or trailing
    # spaces do not yield empty tokens that would later be counted as an emoji.
    names = (token.split(",")[0] for token in raw.split())
    # Drop names that are empty because the token started with a comma.
    return " ".join(name for name in names if name)

# Classic word count over emoji names: emit (name, 1) per occurrence, sum the
# ones per name, then order by count from highest to lowest.
result = (
    emojis.rdd
    .map(split_str)
    .flatMap(lambda names: names.split(" "))
    .map(lambda name: (name, 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)

result.take(10)

[(':face_with_tears_of_joy:', 19559),
 (':red_heart:', 11474),
 (':loudly_crying_face:', 7697),
 (':fire:', 6155),
 (':smiling_face_with_heart-eyes:', 5732),
 (':female_sign:', 4648),
 (':clapping_hands:', 3631),
 (':folded_hands:', 3438),
 (':male_sign:', 3400),
 (':backhand_index_pointing_right:', 2706)]

In [9]:
# Turn the (name, count) pairs into a DataFrame with named columns.
# NOTE(review): this rebinds `result` from an RDD to a DataFrame, so the
# counting cell has to be re-run before this cell can be run again.
result = result.toDF(["emoji", "fre"])
result.show()

+--------------------+-----+
|               emoji|  fre|
+--------------------+-----+
|:face_with_tears_...|19559|
|         :red_heart:|11474|
|:loudly_crying_face:| 7697|
|              :fire:| 6155|
|:smiling_face_wit...| 5732|
|       :female_sign:| 4648|
|    :clapping_hands:| 3631|
|      :folded_hands:| 3438|
|         :male_sign:| 3400|
|:backhand_index_p...| 2706|
|        :weary_face:| 2688|
|  :person_shrugging:| 2598|
|        :two_hearts:| 2530|
|:smiling_face_wit...| 2397|
|          :sparkles:| 2351|
|     :raising_hands:| 2252|
|:person_facepalming:| 2235|
|    :hundred_points:| 2076|
|     :thinking_face:| 2000|
|:double_exclamati...| 1781|
+--------------------+-----+
only showing top 20 rows



In [10]:
# Persist the frequency table through the MongoDB connector.
# NOTE(review): "append" adds the rows again on every run — confirm that
# duplicates in the output collection are acceptable.
(
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)