In [1]:
from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session. The MongoDB connector is requested via
# spark.jars.packages; the input URI points at bigdata.raw and the output URI
# at bigdata.top10Word on the local MongoDB instance.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.top10Word")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [3]:
# Read from the configured MongoDB input URI and cap the working set at
# 100,000 rows.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
    .limit(100000)
)

In [4]:
# Keep only the two columns used by the rest of the notebook.
df = df.select(F.col("emoji"), F.col("sentence"))

### Preprocessing

In [5]:
def divide_emoji(x):
    """Split a space-separated emoji field into its individual entries.

    Args:
        x: String of entries separated by spaces (the sample output in this
            notebook shows entries such as ':red_heart:,18'), or None.

    Returns:
        List of the non-empty entries in their original order. A missing
        (None) or empty field yields an empty list.
    """
    if not x:
        return []
    # Repeated, leading or trailing spaces make split(" ") emit empty strings;
    # drop them so later per-entry parsing never receives an empty entry.
    return [entry for entry in x.split(" ") if entry]

def formed_(x):
    """Pair every emoji entry with the sentence it belongs to.

    Args:
        x: Indexable pair where ``x[0]`` is an iterable of emoji entries and
            ``x[1]`` is the sentence.

    Returns:
        A list holding one ``[emoji_entry, sentence]`` list per emoji entry.
    """
    return [[entry, x[1]] for entry in x[0]]

In [6]:
# Turn each row into a list, split its emoji field into entries, and emit one
# [emoji_entry, sentence] record per entry.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
)
rdd.take(5)

[[':red_heart:,18',
  'No object is so beautiful that under certain conditions it will not look ugly Oscar Wilde ↺ RT :red_heart: …'],
 [':person_shrugging:,13',
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'],
 [':female_sign:,15',
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'],
 [':face_with_tears_of_joy:,14',
  '“ Lets go Marcus ” “ Shiiit where we goin Home ” Marcus Peters :face_with_tears_of_joy:'],
 [':face_with_tears_of_joy:,14',
  'Asahd really is a grown man in the body of a 1 year old :face_with_tears_of_joy:']]

In [7]:
def separate_emoji(x):
    """Return the part of an entry that precedes its first comma.

    For an entry such as ':red_heart:,18' this is ':red_heart:'. An entry
    without a comma is returned unchanged.
    """
    name, _, _ = x.partition(',')
    return name

def separate_position(x):
    """Return the second comma-separated field of an entry, as a string.

    For an entry such as ':red_heart:,18' this is '18'.

    Raises:
        IndexError: If the entry contains no comma.
    """
    fields = x.split(',')
    position = fields[1]
    return position

In [8]:
# Expand each [entry, sentence] record into (emoji_name, position, sentence),
# converting the position to an int.
rdd = rdd.map(
    lambda record: (
        separate_emoji(record[0]),
        int(separate_position(record[0])),
        record[1],
    )
)

### For every emoji, find the top 10 words that appear most often when the emoji is used.

In [9]:
def count_words(sentence, num_top_word=10):
    """Return the most frequent words of a text, most frequent first.

    The text is split on single spaces. Every non-alphabetic character is
    removed from each token and the remainder is lower-cased; tokens that end
    up empty are ignored.

    NOTE: because only non-letters are stripped, a token such as
    ':female_sign:' is counted as the word 'femalesign'.

    Args:
        sentence: Text to analyse.
        num_top_word: Maximum number of words to return. Defaults to 10, the
            value that was previously hard-coded. Values <= 0 give an empty
            list.

    Returns:
        List of at most ``num_top_word`` words ordered by descending
        frequency; words with equal counts keep first-occurrence order.
    """
    words_frequency = {}
    for token in sentence.split(' '):
        word = ''.join(ch for ch in token if ch.isalpha()).lower()
        if word:
            words_frequency[word] = words_frequency.get(word, 0) + 1
    # sorted() is stable (also with reverse=True), so equally frequent words
    # stay in the order in which they were first seen.
    ranked = sorted(words_frequency.items(), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:max(num_top_word, 0)]]

In [10]:
def combine_string(x):
    """Merge the sentences of one emoji group into a single text without the emoji.

    Args:
        x: Iterable of items where ``item[0]`` is the emoji string and
            ``item[1]`` is a sentence.

    Returns:
        One string built from every fragment obtained by splitting each
        sentence on its emoji, with a single space appended after each
        fragment. An empty iterable yields an empty string.
    """
    pieces = []
    for item in x:
        # str.split never returns the separator itself as a fragment, so every
        # fragment is kept.
        for fragment in item[1].split(item[0]):
            pieces.append(fragment)
            pieces.append(" ")
    # Joining once avoids the quadratic cost of growing a string in a loop.
    return "".join(pieces)

In [11]:
# Keep (emoji, sentence), group the pairs by emoji, merge each group into one
# text with combine_string, then rank that text's words with count_words.
temp_rdd = rdd.map(lambda record: (record[0], record[2]))
group_by_emoji = temp_rdd.groupBy(lambda pair: pair[0])
operate_rdd = group_by_emoji.mapValues(list)
combined = operate_rdd.mapValues(combine_string)
result = combined.mapValues(count_words)
result.take(5)

[(':red_heart:',
  ['you', 'i', 'the', 'to', 'love', 'my', 'and', 'a', 'for', 'so']),
 (':person_shrugging:',
  ['femalesign', 'malesign', 'i', 'to', 'you', 'the', 'a', 'me', 'my', 'it']),
 (':female_sign:',
  ['i',
   'personshrugging',
   'personfacepalming',
   'to',
   'the',
   'you',
   'a',
   'my',
   'me',
   'and']),
 (':face_with_tears_of_joy:',
  ['i', 'the', 'to', 'a', 'this', 'you', 'my', 'and', 'is', 'me']),
 (':backhand_index_pointing_down:',
  ['the', 'to', 'this', 'is', 'a', 'for', 'you', 'and', 'of', 'on'])]

In [12]:
# Convert the (emoji, top_words) RDD into a DataFrame so it can be written out.
result_df = spark.createDataFrame(result)

In [13]:
# Append the result rows to the collection configured as the Mongo output URI.
(
    result_df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)