In [1]:
import os
import collections

from pyspark.sql import SparkSession
from operator import add

# Session options applied below: the two MongoDB URIs and the connector
# package coordinates passed through spark.jars.packages.
session_options = (
    ("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw"),
    ("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.t3"),
    ("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1"),
)

session_builder = SparkSession.builder.appName("myApp")
for option_key, option_value in session_options:
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

In [2]:
# Load through the MongoDB connector source and keep at most 100000 rows
# for the rest of the notebook.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)
df.show()

+--------------------+--------------------+--------------------+
|                 _id|               emoji|            sentence|
+--------------------+--------------------+--------------------+
|[5ea4eddb59a37f98...|      :red_heart:,18|No object is so b...|
|[5ea4eddb59a37f98...|:person_shrugging...|Cant expect diffe...|
|[5ea4eddb59a37f98...|:face_with_tears_...|“ Lets go Marcus ...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Asahd really is a...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Yoongi Tweet Hell...|
|[5ea4eddb59a37f98...|:backhand_index_p...|we cannot afford ...|
|[5ea4eddb59a37f98...|:party_popper:,8 ...|ranks 6th in Janu...|
|[5ea4eddb59a37f98...|:person_facepalmi...|Ok people are rea...|
|[5ea4eddb59a37f98...|:smiling_face_wit...|Cant wait to meet...|
|[5ea4eddb59a37f98...| :clapping_hands:,11|Congratulations M...|
|[5ea4eddb59a37f98...|:face_with_tears_...|Met orlando brown...|
|[5ea4eddb59a37f98...|      :weary_face:,4|Im goin to bed :w...|
|[5ea4eddb59a37f98...|  :

## For every emoji, determine whether it is used more often with words that begin with a lowercase letter or with words that begin with an uppercase letter.

In [3]:
def check_case(line):
    """Classify, for each emoji in a row, the case of its associated word.

    ``line.emoji`` is a space-separated list of ``name,place`` tokens and
    ``line.sentence`` is a space-separated sentence. For every token the word
    at index ``place - 1`` of the split sentence is inspected.

    Args:
        line: object exposing ``emoji`` and ``sentence`` attributes (strings,
            or None for a missing field).

    Returns:
        A list with one ``(name, (upper, lower))`` tuple per non-empty emoji
        token. The pair is ``(1, 0)`` when the word starts with an uppercase
        character, ``(0, 1)`` when it starts with a lowercase character, and
        ``(0, 0)`` when the first character is neither or when there is no
        word at that position (empty word, or place outside the sentence).

    Raises:
        ValueError: if a non-empty token has no comma or its place is not an
            integer.
    """
    # Keep split(' ') (not split()) so word indices stay aligned with `place`.
    words = (line.sentence or '').split(' ')
    res = []

    for token in (line.emoji or '').split(' '):
        # Consecutive spaces (or an empty field) yield empty tokens; skip them.
        if not token:
            continue

        # Split on the last comma only, so the place is always the final part.
        e, place = token.rsplit(',', 1)
        idx = int(place) - 1

        # Guard both ends: a negative index would silently wrap around to the
        # end of the sentence, and a large one would raise IndexError.
        word = words[idx] if 0 <= idx < len(words) else ''

        # Slicing tolerates empty words produced by consecutive spaces.
        first = word[:1]
        if first.isupper():
            res.append((e, (1, 0)))
        elif first.islower():
            res.append((e, (0, 1)))
        else:
            res.append((e, (0, 0)))
    return res


# One (emoji, (upper, lower)) pair per emoji occurrence; check_case returns a
# list per row, so flatMap flattens those lists directly.
case_pairs = df.rdd.flatMap(check_case)

# Sum both counters per emoji in a single reduceByKey, so the pairs are
# aggregated once and no join is needed to bring the two totals together.
case_totals = case_pairs.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Flatten to (emoji, upper, lower) tuples for the DataFrame conversion.
result = case_totals.map(lambda kv: (kv[0], kv[1][0], kv[1][1]))

result.take(10)

[(':red_heart:', 2315, 5469),
 (':person_shrugging:', 226, 2081),
 (':female_sign:', 119, 61),
 (':face_with_tears_of_joy:', 2809, 15592),
 (':backhand_index_pointing_down:', 306, 607),
 (':party_popper:', 771, 532),
 (':person_facepalming:', 231, 1627),
 (':smiling_face_with_heart-eyes:', 1216, 4118),
 (':clapping_hands:', 827, 1289),
 (':weary_face:', 322, 2248)]

In [4]:
# Turn the (emoji, upper, lower) tuples into a DataFrame. The converted frame
# has columns _1, _2 and _3, which are renamed to readable names here.
result = (
    result.toDF()
    .withColumnRenamed("_1", "emoji")
    .withColumnRenamed("_2", "upper")
    .withColumnRenamed("_3", "lower")
)
result.show()

+--------------------+-----+-----+
|               emoji|upper|lower|
+--------------------+-----+-----+
|         :red_heart:| 2315| 5469|
|  :person_shrugging:|  226| 2081|
|       :female_sign:|  119|   61|
|:face_with_tears_...| 2809|15592|
|:backhand_index_p...|  306|  607|
|      :party_popper:|  771|  532|
|:person_facepalming:|  231| 1627|
|:smiling_face_wit...| 1216| 4118|
|    :clapping_hands:|  827| 1289|
|        :weary_face:|  322| 2248|
|         :male_sign:|  121|   47|
|:loudly_crying_face:| 1399| 5868|
|      :folded_hands:|  734| 1977|
|    :hundred_points:|  464| 1246|
|:rolling_on_the_f...|  257| 1369|
|     :flexed_biceps:|  336|  925|
|:backhand_index_p...| 1039| 1051|
|       :crying_face:|  154|  701|
|      :purple_heart:|  388|  966|
|      :yellow_heart:|  183|  509|
+--------------------+-----+-----+
only showing top 20 rows



In [5]:
# Persist the per-emoji counts through the MongoDB connector source.
# NOTE(review): the target is presumably the collection named by the
# spark.mongodb.output.uri session option — confirm. Because the mode is
# "append", re-running this cell likely writes the same rows again.
result.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()