In [1]:
from collections import Counter

from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session. The MongoDB connector package is pulled in
# via `spark.jars.packages`; the input/output URIs name the collections used by
# the connector's default read and write paths.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.top10Word")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [3]:
# Load the source data through the MongoDB Spark connector (no explicit options,
# so the session-level configuration decides what is read).
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [4]:
# Keep only the two columns used below; their order (emoji first, sentence
# second) matters because later steps index rows positionally.
df = df.select(F.col("emoji"), F.col("sentence"))

### Preprocessing

In [5]:
def divide_emoji(x):
    """Split a space-separated emoji field into its individual tokens.

    Args:
        x: String of tokens separated by single spaces
           (e.g. ``":a:,6 :b:,13"``).

    Returns:
        List of the non-empty tokens, in their original order. An empty
        string yields an empty list.
    """
    # Leading, trailing, or doubled spaces (and the empty string) make
    # split(" ") emit "" entries; drop them so per-token parsing downstream
    # never receives an empty token.
    return [token for token in x.split(" ") if token]

def formed_(x):
    """Pair every element of ``x[0]`` with ``x[1]``.

    Args:
        x: Indexable whose first item is an iterable of emoji tokens and whose
           second item is the sentence they belong to.

    Returns:
        A list of two-element lists ``[emoji, sentence]``, one per emoji token.
    """
    return [[emoji, x[1]] for emoji in x[0]]

In [6]:
# Turn each row into a list, split the emoji field into tokens, then emit one
# [emoji_token, sentence] record per token.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
)
rdd.take(5)

[[':sparkling_heart:,6', 'One of the things Ive waited :sparkling_heart:'],
 [':face_with_tears_of_joy:,13',
  'This video must confuse the crap out of liberals Who do they defend :face_with_tears_of_joy:'],
 [':face_with_tears_of_joy:,5',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':folded_hands:,16',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':face_with_tears_of_joy:,18',
  'I only use dpns now for icord and thumbs or if I cant find a long enough circ :face_with_tears_of_joy: Too many WIPs around']]

In [7]:
def separate_emoji(x):
    """Return the part of ``x`` before its first comma.

    If ``x`` contains no comma, the whole string is returned.
    """
    head, _, _ = x.partition(',')
    return head

def separate_position(x):
    """Return the second comma-separated field of ``x`` as a string.

    Raises:
        IndexError: if ``x`` contains no comma.
    """
    # Only the first two fields matter, so stop splitting after the second comma.
    fields = x.split(',', 2)
    return fields[1]

In [8]:
# Expand each [token, sentence] record into (emoji, position, sentence).
# NOTE: this rebinds `rdd`, so the cell is not idempotent; re-running it without
# re-running the cell that first builds `rdd` would parse already-parsed records.
rdd = rdd.map(
    lambda record: (
        separate_emoji(record[0]),
        int(separate_position(record[0])),
        record[1],
    )
)

### For every emoji, find the top 10 words that appear most often in sentences using that emoji.

In [9]:
def count_words(sentence, num_top_word=10):
    """Return the most frequent words in ``sentence``, most frequent first.

    Each whitespace-separated token is reduced to its alphabetic characters
    and lower-cased; tokens with no alphabetic characters are ignored.

    Args:
        sentence: Text to analyse.
        num_top_word: Maximum number of words to return (default 10).

    Returns:
        A list of at most ``num_top_word`` words ordered by descending
        frequency. Words with equal counts keep first-seen order.
    """
    # split() with no argument breaks on any whitespace run, so words separated
    # by tabs or newlines are no longer fused once punctuation is stripped.
    cleaned = (
        ''.join(ch for ch in token if ch.isalpha()).lower()
        for token in sentence.split()
    )
    words_frequency = Counter(word for word in cleaned if word)
    return [word for word, _ in words_frequency.most_common(num_top_word)]

In [10]:
def combine_string(x):
    """Concatenate all sentences of a group with the group's emoji removed.

    Args:
        x: Iterable of ``(emoji, sentence)`` pairs.

    Returns:
        One string made of every fragment obtained by splitting each sentence
        on its emoji, each fragment followed by a single space. Returns ``""``
        for an empty iterable.

    Raises:
        ValueError: if an emoji is the empty string (``str.split`` rejects an
            empty separator).
    """
    fragments = []
    for item in x:
        # Splitting on the emoji removes every occurrence of it; a fragment can
        # never equal the separator, so no further filtering is needed.
        fragments.extend(item[1].split(item[0]))
    # Build the result in one pass instead of repeated string concatenation,
    # which is quadratic for groups with many sentences.
    return "".join(fragment + " " for fragment in fragments)

In [11]:
# Drop the position field, gather every (emoji, sentence) pair under its emoji,
# merge each group's sentences into one string, then rank the words in it.
temp_rdd = rdd.map(lambda record: (record[0], record[2]))
group_by_emoji = temp_rdd.groupBy(lambda pair: pair[0])
# mapValues leaves the emoji key untouched and transforms only the grouped value.
operate_rdd = group_by_emoji.mapValues(list)
combined = operate_rdd.mapValues(combine_string)
result = combined.mapValues(count_words)
result.take(5)

[(':sparkling_heart:',
  ['one',
   'of',
   'the',
   'things',
   'ive',
   'waited',
   'twohearts',
   'i',
   'want',
   'sum']),
 (':face_with_tears_of_joy:',
  ['i',
   'the',
   'and',
   'momma',
   'this',
   'start',
   'a',
   'now',
   'changyoon',
   'gunmin']),
 (':folded_hands:',
  ['stalin',
   'atrocities',
   'lollu',
   'sabha',
   'mix',
   'facewithtearsofjoy',
   'dedicated',
   'to',
   'all',
   'உப']),
 (':speaking_head:',
  ['queens',
   'if',
   'your',
   'king',
   'doesnt',
   'treat',
   'you',
   'like',
   'husband',
   'does']),
 (':heart_suit:',
  ['vip', 'pump', 'notifier', 'for', 'bittrex', 'private', 'and', 'dai'])]

In [12]:
# Convert the RDD of (emoji, top-words list) pairs into a DataFrame.
# NOTE(review): no schema is passed, so Spark infers the column names and types;
# presumably the columns come out as `_1` / `_2` — confirm that is what readers
# of the stored collection expect.
result_df = result.toDF()

In [13]:
# Persist the result through the MongoDB Spark connector.
# NOTE(review): the save mode is "append", so re-running this cell presumably
# adds the same rows again — confirm that duplicates are acceptable.
(
    result_df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)