In [1]:
from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session with the MongoDB connector on the
# classpath. The input/output URIs point at the local MongoDB instance and set
# the connector's default read and write targets.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.top10Word")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [3]:
# Load through the MongoDB connector; no URI is passed here, so the source is
# whatever the session's spark.mongodb.input.uri setting points at.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [4]:
# Keep only the two columns the rest of the notebook reads: the emoji field
# and the sentence text.
df = df.select("emoji", "sentence")

### Preprocessing

In [5]:
def divide_emoji(x):
    """Split a space-separated emoji field into its individual tokens.

    Args:
        x: String of tokens separated by single spaces, e.g.
            ":sparkling_heart:,6 :folded_hands:,16".

    Returns:
        List of the non-empty tokens in their original order. An empty or
        all-space input yields an empty list.
    """
    # str.split(" ") produces empty strings for leading, trailing, or doubled
    # spaces (and for an empty input). Drop them so later "emoji,position"
    # parsing never receives a token without a comma.
    return [token for token in x.split(" ") if token]

def formed_(x):
    """Pair every emoji token of a record with that record's sentence.

    Args:
        x: Indexable whose first element is an iterable of emoji tokens and
            whose second element is the sentence they belong to.

    Returns:
        List of two-element lists ``[emoji_token, sentence]``, one per token,
        in token order.
    """
    return [[token, x[1]] for token in x[0]]

In [6]:
# Explode each (emoji field, sentence) row into one [emoji_token, sentence]
# record per emoji token found in the row.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
)
rdd.take(5)

[[':sparkling_heart:,6', 'One of the things Ive waited :sparkling_heart:'],
 [':face_with_tears_of_joy:,13',
  'This video must confuse the crap out of liberals Who do they defend :face_with_tears_of_joy:'],
 [':face_with_tears_of_joy:,5',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':folded_hands:,16',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':face_with_tears_of_joy:,18',
  'I only use dpns now for icord and thumbs or if I cant find a long enough circ :face_with_tears_of_joy: Too many WIPs around']]

In [7]:
def separate_emoji(x):
    """Return the part of an ``"emoji,position"`` token before the first comma.

    Args:
        x: Token string such as ":sparkling_heart:,6".

    Returns:
        The text preceding the first comma, or the whole string when it
        contains no comma.
    """
    emoji, _, _ = x.partition(',')
    return emoji

def separate_position(x):
    """Return the second comma-separated field of an ``"emoji,position"`` token.

    Args:
        x: Token string such as ":sparkling_heart:,6".

    Returns:
        The text between the first and second comma (or to the end of the
        string when there is only one comma), as a string.

    Raises:
        IndexError: If ``x`` contains no comma.
    """
    return x.split(',')[1]

In [8]:
# Split each "emoji,position" token into its two parts, giving
# (emoji, integer position, sentence) records.
rdd = rdd.map(
    lambda pair: (
        separate_emoji(pair[0]),
        int(separate_position(pair[0])),
        pair[1],
    )
)

### For every emoji, find the top 10 words that appear most often when the emoji is used.

In [9]:
def count_words(sentence, num_top_word=10):
    """Return the most frequent words of a text, most frequent first.

    The text is split on single spaces. Within each piece every
    non-alphabetic character is removed and the remainder is lower-cased;
    pieces that end up empty are ignored.

    Args:
        sentence: Text to analyse.
        num_top_word: Maximum number of words to return. Defaults to 10;
            values below zero are treated as zero.

    Returns:
        List of at most ``num_top_word`` words ordered by descending count.
        Words with equal counts keep the order in which they first appeared.
    """
    words_frequency = {}
    for token in sentence.split(' '):
        word = ''.join(ch for ch in token if ch.isalpha()).lower()
        if word:
            words_frequency[word] = words_frequency.get(word, 0) + 1

    # sorted() is stable and dicts preserve insertion order, so ties are
    # resolved by first appearance in the text.
    ranked = sorted(words_frequency.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:max(num_top_word, 0)]]

In [10]:
def combine_string(x):
    """Concatenate sentences into one string with their own emoji cut out.

    Each item's sentence is split on that item's emoji, and every resulting
    piece is appended to the output followed by a single space.

    Args:
        x: Iterable of items where ``item[0]`` is the emoji string and
            ``item[1]`` is a sentence that may contain it.

    Returns:
        One string holding all pieces, each followed by a space; an empty
        string when ``x`` is empty.

    Raises:
        ValueError: If an item's emoji is the empty string (``str.split``
            rejects an empty separator).
    """
    pieces = []
    for item in x:
        # split() never returns the separator itself, so no extra filtering
        # of the pieces is needed.
        # NOTE(review): only the item's own emoji is removed; any other
        # emoji shortcodes in the sentence stay in the text.
        pieces.extend(item[1].split(item[0]))
    # Build the result with a single join instead of repeated concatenation,
    # which copies the growing string on every step.
    return "".join(piece + " " for piece in pieces)

In [11]:
# Drop the position field, leaving (emoji, sentence) pairs.
temp_rdd = rdd.map(lambda record: (record[0], record[2]))
# Gather all pairs that share an emoji under that emoji as the key.
group_by_emoji = temp_rdd.groupBy(lambda pair: pair[0])
# Materialise each group, merge its sentences into one text, then rank the
# words of that text. The key (the emoji) is carried through unchanged.
operate_rdd = group_by_emoji.mapValues(list)
combined = operate_rdd.mapValues(combine_string)
result = combined.mapValues(count_words)
result.take(5)

[(':sparkling_heart:',
  ['one',
   'of',
   'the',
   'things',
   'ive',
   'waited',
   'twohearts',
   'i',
   'want',
   'sum']),
 (':face_with_tears_of_joy:',
  ['i',
   'the',
   'and',
   'momma',
   'this',
   'start',
   'a',
   'now',
   'changyoon',
   'gunmin']),
 (':folded_hands:',
  ['stalin',
   'atrocities',
   'lollu',
   'sabha',
   'mix',
   'facewithtearsofjoy',
   'dedicated',
   'to',
   'all',
   'உப']),
 (':speaking_head:',
  ['queens',
   'if',
   'your',
   'king',
   'doesnt',
   'treat',
   'you',
   'like',
   'husband',
   'does']),
 (':heart_suit:',
  ['vip', 'pump', 'notifier', 'for', 'bittrex', 'private', 'and', 'dai'])]

In [12]:
# No schema is passed, so the columns get the default names _1 (the emoji)
# and _2 (its list of top words); the cells below rely on those names.
result_df = result.toDF()

In [13]:
#result_df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

In [14]:
# Preview the result table; show() prints a limited number of rows and
# truncates long cell values.
result_df.show()

+--------------------+--------------------+
|                  _1|                  _2|
+--------------------+--------------------+
|   :sparkling_heart:|[one, of, the, th...|
|:face_with_tears_...|[i, the, and, mom...|
|      :folded_hands:|[stalin, atrociti...|
|     :speaking_head:|[queens, if, your...|
|        :heart_suit:|[vip, pump, notif...|
|:backhand_index_p...|[answers, for, xi...|
|           :OK_hand:|[sex, with, the, ...|
|:person_facepalming:|[bro, i, never, s...|
|         :male_sign:|[i, u, bro, never...|
|:loudly_crying_face:|[i, bro, never, s...|
|       :right_arrow:|[my, to, click, s...|
|        :weary_face:|[i, wanna, the, s...|
|  :person_shrugging:|[i, where, need, ...|
|       :female_sign:|[i, my, amp, pers...|
|      :winking_face:|[in, comments, an...|
|       :crying_face:|[i, hope, that, c...|
|        :two_hearts:|[i, want, sum, sp...|
|          :sparkles:|[some, que, twohe...|
|      :flushed_face:|[bang, what, nick...|
|      :purple_heart:|[everyday,

In [22]:
# Convert the per-emoji word lists into a nested name/children hierarchy:
# a root node named 'flare', one child per emoji, and one leaf per word.
# Every leaf gets the same value of 1.
temp = result_df.collect()
result_children = [
    {
        'name': row._1,
        'children': [{'name': word, 'value': 1} for word in row._2],
    }
    for row in temp
]
result_dict = {'name': 'flare', 'children': result_children}

# Print each emoji followed by its list of leaf nodes.
for ele in result_dict['children']:
    print(ele['name'])
    print(ele['children'])

:sparkling_heart:
[{'name': 'one', 'value': 1}, {'name': 'of', 'value': 1}, {'name': 'the', 'value': 1}, {'name': 'things', 'value': 1}, {'name': 'ive', 'value': 1}, {'name': 'waited', 'value': 1}, {'name': 'twohearts', 'value': 1}, {'name': 'i', 'value': 1}, {'name': 'want', 'value': 1}, {'name': 'sum', 'value': 1}]
:face_with_tears_of_joy:
[{'name': 'i', 'value': 1}, {'name': 'the', 'value': 1}, {'name': 'and', 'value': 1}, {'name': 'momma', 'value': 1}, {'name': 'this', 'value': 1}, {'name': 'start', 'value': 1}, {'name': 'a', 'value': 1}, {'name': 'now', 'value': 1}, {'name': 'changyoon', 'value': 1}, {'name': 'gunmin', 'value': 1}]
:folded_hands:
[{'name': 'stalin', 'value': 1}, {'name': 'atrocities', 'value': 1}, {'name': 'lollu', 'value': 1}, {'name': 'sabha', 'value': 1}, {'name': 'mix', 'value': 1}, {'name': 'facewithtearsofjoy', 'value': 1}, {'name': 'dedicated', 'value': 1}, {'name': 'to', 'value': 1}, {'name': 'all', 'value': 1}, {'name': 'உப', 'value': 1}]
:speaking_head:


In [20]:
import json

# Load flare-2.json and print each top-level node with its children
# (presumably to compare its structure with result_dict -- confirm).
# The with-block closes the file handle as soon as parsing is done, and the
# explicit encoding keeps the read independent of the platform default.
with open('flare-2.json', encoding='utf-8') as f:
    data = json.load(f)

for ele in data['children']:
    print(ele['name'])
    print(ele['children'])

analytics
[{'name': 'cluster', 'children': [{'name': 'AgglomerativeCluster', 'value': 3938}, {'name': 'CommunityStructure', 'value': 3812}, {'name': 'HierarchicalCluster', 'value': 6714}, {'name': 'MergeEdge', 'value': 743}]}, {'name': 'graph', 'children': [{'name': 'BetweennessCentrality', 'value': 3534}, {'name': 'LinkDistance', 'value': 5731}, {'name': 'MaxFlowMinCut', 'value': 7840}, {'name': 'ShortestPaths', 'value': 5914}, {'name': 'SpanningTree', 'value': 3416}]}, {'name': 'optimization', 'children': [{'name': 'AspectRatioBanker', 'value': 7074}]}]
animate
[{'name': 'Easing', 'value': 17010}, {'name': 'FunctionSequence', 'value': 5842}, {'name': 'interpolate', 'children': [{'name': 'ArrayInterpolator', 'value': 1983}, {'name': 'ColorInterpolator', 'value': 2047}, {'name': 'DateInterpolator', 'value': 1375}, {'name': 'Interpolator', 'value': 8746}, {'name': 'MatrixInterpolator', 'value': 2202}, {'name': 'NumberInterpolator', 'value': 1382}, {'name': 'ObjectInterpolator', 'value':