In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t9") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')\
    .getOrCreate()

In [2]:
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
df = df.limit(100000)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## Analyze the relation between the length of sentence and the number of emoji used in the sentence.

In [3]:
def length_relation(line):
    
    sentence = line.sentence.split(' ')
    emojis = line.emoji.split(' ')
    
    sentence_length = len(sentence)
    emojis_length = len(emojis)
    return (sentence_length, emojis_length)

length = df.rdd.map(length_relation)
length_mapped = length.map(lambda x: (x, 1))
relation = length_mapped.reduceByKey(add).sortByKey() \
                        .map(lambda x: (x[0][0], x[0][1], x[1]))

relation.take(10)

[(5, 1, 7058),
 (6, 1, 6929),
 (6, 2, 383),
 (7, 1, 6319),
 (7, 2, 497),
 (7, 3, 75),
 (8, 1, 5976),
 (8, 2, 554),
 (8, 3, 139),
 (8, 4, 13)]

In [4]:
result = relation.toDF()
result = result.selectExpr("_1 as sent_len", "_2 as emoji_len", "_3 as count")
result.show()

+--------+---------+-----+
|sent_len|emoji_len|count|
+--------+---------+-----+
|       5|        1| 7058|
|       6|        1| 6929|
|       6|        2|  383|
|       7|        1| 6319|
|       7|        2|  497|
|       7|        3|   75|
|       8|        1| 5976|
|       8|        2|  554|
|       8|        3|  139|
|       8|        4|   13|
|       9|        1| 5537|
|       9|        2|  633|
|       9|        3|  129|
|       9|        4|   20|
|      10|        1| 5170|
|      10|        2|  666|
|      10|        3|  157|
|      10|        4|   33|
|      10|        5|    6|
|      11|        1| 4859|
+--------+---------+-----+
only showing top 20 rows



In [5]:
result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()