In [1]:
from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session; the MongoDB URIs and the connector
# package are supplied as session configuration.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [3]:
# Load the source data through the MongoDB Spark connector
# (presumably the collection named by spark.mongodb.input.uri -- confirm).
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [4]:
# Keep only the two columns used by the rest of the notebook, then preview them.
df = df.select("emoji", "sentence")
df.show()

+--------------------+--------------------+
|               emoji|            sentence|
+--------------------+--------------------+
| :sparkling_heart:,6|One of the things...|
|:face_with_tears_...|This video must c...|
|:face_with_tears_...|Stalin Atrocities...|
|:face_with_tears_...|I only use dpns n...|
|:speaking_head:,1...|Queens if your ki...|
|:face_with_tears_...|Yous spent £ 75m ...|
|      :heart_suit:,5|VIP Pump Notifier...|
|:backhand_index_p...|answers for xiumi...|
|        :OK_hand:,10|Sex with the righ...|
|:person_facepalmi...|Bro I never saw t...|
|    :right_arrow:,14|Support my dream ...|
|:face_with_tears_...|Chance The Rapper...|
|     :right_arrow:,3|CLICK TO LISTEN :...|
|     :weary_face:,13|I wanna have mone...|
|:face_with_tears_...|lmao its a whole ...|
|:person_shrugging...|Its days where I ...|
|   :winking_face:,17|600 comments and ...|
|:person_shrugging...|Slightly regretti...|
|:crying_face:,11 ...|I hope that I can...|
|:face_with_tears_...|Why do Mar

In [5]:
# Number of rows in the selected DataFrame.
df.count()

1980000

## Preprocessing

In [6]:
def divide_emoji(x):
    """Split the raw emoji field into its individual tokens.

    Parameters
    ----------
    x : str
        Space-separated tokens, e.g. ``":crying_face:,11 :folded_hands:,16"``.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    # str.split(" ") yields empty strings for leading, trailing or doubled
    # spaces (and for an empty field). An empty token has no comma, so it
    # cannot be parsed as "emoji,position" later on; drop it here.
    return [token for token in x.split(" ") if token]

def formed_(x):
    """Pair every emoji token in ``x[0]`` with the sentence ``x[1]``.

    Returns a list with one ``[emoji_token, sentence]`` entry per token,
    in the order the tokens appear in ``x[0]``.
    """
    return [[token, x[1]] for token in x[0]]

In [7]:
# Explode every row into one [emoji_token, sentence] record per emoji token.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
)
rdd.take(5)

[[':sparkling_heart:,6', 'One of the things Ive waited :sparkling_heart:'],
 [':face_with_tears_of_joy:,13',
  'This video must confuse the crap out of liberals Who do they defend :face_with_tears_of_joy:'],
 [':face_with_tears_of_joy:,5',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':folded_hands:,16',
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'],
 [':face_with_tears_of_joy:,18',
  'I only use dpns now for icord and thumbs or if I cant find a long enough circ :face_with_tears_of_joy: Too many WIPs around']]

In [8]:
def separate_emoji(x):
    """Return the text before the first comma of an ``"emoji,position"`` token.

    If the token contains no comma, the whole token is returned.
    """
    name, _, _ = x.partition(',')
    return name

def separate_position(x):
    """Return the second comma-separated field of an ``"emoji,position"`` token.

    The value is returned as a string; callers convert it themselves.
    Raises IndexError when the token contains no comma.
    """
    return x.split(',')[1]

In [9]:
# Turn each [token, sentence] record into (emoji, position, sentence).
# Gotcha: this rebinds `rdd` from itself, so the cell is not idempotent --
# re-run the cell that builds `rdd` from `df` before running this one again.
rdd = rdd.map(
    lambda record: (
        separate_emoji(record[0]),
        int(separate_position(record[0])),
        record[1],
    )
)
rdd.take(5)

[(':sparkling_heart:', 6, 'One of the things Ive waited :sparkling_heart:'),
 (':face_with_tears_of_joy:',
  13,
  'This video must confuse the crap out of liberals Who do they defend :face_with_tears_of_joy:'),
 (':face_with_tears_of_joy:',
  5,
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'),
 (':folded_hands:',
  16,
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'),
 (':face_with_tears_of_joy:',
  18,
  'I only use dpns now for icord and thumbs or if I cant find a long enough circ :face_with_tears_of_joy: Too many WIPs around')]

## For every emoji, find the top 10 words that appear most often when the emoji is used.

In [54]:
def count_words(sentence, num_top_word=10):
    """Return the most frequent words of ``sentence``, most frequent first.

    Each space-separated token is reduced to its alphabetic characters and
    lower-cased; tokens with no alphabetic characters are ignored.

    Parameters
    ----------
    sentence : str
        Text to analyse.
    num_top_word : int, optional
        Maximum number of words to return (default 10). Values below zero
        are treated as zero.

    Returns
    -------
    list of str
        At most ``num_top_word`` words ordered by descending frequency.
        Words with equal frequency keep the order of first appearance.
    """
    words_frequency = {}
    for token in sentence.split(' '):
        word = ''.join(ch for ch in token if ch.isalpha()).lower()
        if not word:
            continue
        words_frequency[word] = words_frequency.get(word, 0) + 1

    # sorted() is stable, so ties stay in insertion (first-seen) order.
    ranked = sorted(words_frequency.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:max(num_top_word, 0)]]

In [11]:
# Drop the position field, then group the (emoji, sentence) records by emoji.
temp_rdd = rdd.map(lambda record: (record[0], record[2]))
group_by_emoji = temp_rdd.groupBy(lambda pair: pair[0])

In [12]:
# Materialise each group's iterable as a plain list of (emoji, sentence) pairs.
operate_rdd = group_by_emoji.map(
    lambda group: (group[0], list(group[1]))
)
operate_rdd.take(1)

[(':speaking_head:',
  [(':speaking_head:',
    'Queens if your king doesnt treat you like husband does on the red carpet :speaking_head: drop him :face_with_tears_of_joy: …'),
   (':speaking_head:',
    'listen if u not good then SAY THAT :speaking_head: gah damn got me waiting on nothing'),
   (':speaking_head:',
    'unpopular opinion chanel carried the cheetah girls and dorindas white ass was dead weight :speaking_head:'),
   (':speaking_head:',
    '“ once my heart turn against you its no love for you nomore “ :speaking_head:'),
   (':speaking_head:', 'Oh my fucking God :speaking_head:'),
   (':speaking_head:', 'So this happened gotta love NYC :speaking_head:'),
   (':speaking_head:',
    'Cardi B clocked all of yall that had something to say about her getting pregnant :speaking_head:'),
   (':speaking_head:',
    'one of yall lurking ass bitches go back and tell her that :speaking_head:'),
   (':speaking_head:',
    'QUOTE Francesco :speaking_head: I definitely could have won mor

In [15]:
def combine_string(x):
    """Concatenate the sentences of one emoji group with that emoji removed.

    Parameters
    ----------
    x : iterable
        Items where ``item[0]`` is the emoji string and ``item[1]`` is a
        sentence.

    Returns
    -------
    str
        Every fragment obtained by splitting each sentence on its emoji,
        each followed by a single space. Empty string for an empty group.
    """
    fragments = []
    for item in x:
        # A fragment produced by split() can never equal the separator,
        # so the fragments need no further filtering.
        fragments.extend(item[1].split(item[0]))
    # Join once instead of growing a string inside the loop, which copies
    # the accumulated text on every step for large groups.
    return "".join(fragment + " " for fragment in fragments)

In [16]:
# One combined text per emoji, then the most frequent words of that text.
combined = operate_rdd.map(lambda group: (group[0], combine_string(group[1])))
result = combined.map(lambda entry: (entry[0], count_words(entry[1])))

## Find which emoji is used most with words beginning with each letter A, B, C, ..., Z (alphabetical order)

In [48]:
def get_letter_stat(x):
    """Return the most common initial letter in ``x`` and its count.

    Each space-separated token is reduced to its alphabetic characters and
    lower-cased; tokens with no alphabetic characters are skipped.

    Returns a ``(letter, count)`` tuple. When several letters share the top
    count, the one encountered first wins. Raises IndexError when ``x``
    contains no alphabetic token.
    """
    initial_counts = {}
    for token in x.split(' '):
        letters_only = ''.join(filter(str.isalpha, token))
        if not letters_only:
            continue
        initial = letters_only.lower()[0]
        initial_counts[initial] = initial_counts.get(initial, 0) + 1
    ranking = sorted(initial_counts.items(), key=lambda entry: entry[1], reverse=True)
    return ranking[0]

In [49]:
# For every emoji: the (letter, count) of its most common word-initial letter.
top_letter_for_emoji = combined.map(
    lambda entry: (entry[0], get_letter_stat(entry[1]))
)

In [50]:
def get_the_top(x):
    """Return the first field of the entry with the highest score.

    Parameters
    ----------
    x : list
        Entries where ``entry[1]`` is the score to rank by and ``entry[0]``
        is the value to return.

    Returns
    -------
    The ``entry[0]`` of the highest-scoring entry. On ties the earliest
    entry in ``x`` wins.

    Raises
    ------
    IndexError
        If ``x`` is empty.
    """
    if not x:
        raise IndexError("get_the_top() needs at least one entry")
    # max() leaves the caller's list untouched (the previous in-place sort
    # reordered it) and returns the first maximal entry, which is the same
    # winner a stable descending sort would put first.
    return max(x, key=lambda entry: entry[1])[0]

In [51]:
# Re-key the records by letter: (letter, (emoji, count)).
key_is_letter = top_letter_for_emoji.map(
    lambda entry: (entry[1][0], (entry[0], entry[1][1]))
)
key_is_letter.take(3)

[('t', (':speaking_head:', 29721)),
 ('t', (':loudly_crying_face:', 215116)),
 ('t', (':crying_face:', 25138))]

In [53]:
# Collect the (letter, (emoji, count)) records of each letter into a list.
group_by_letter = (
    key_is_letter
    .groupBy(lambda keyed: keyed[0])
    .map(lambda group: (group[0], list(group[1])))
)
# Strip the repeated letter key from each record, then keep the emoji with
# the highest count for that letter.
result_1 = (
    group_by_letter
    .map(lambda group: (group[0], [row[1] for row in group[1]]))
    .map(lambda group: (group[0], get_the_top(group[1])))
)
result_1.take(3)

[('t', ':face_with_tears_of_joy:'),
 ('f', ':backhand_index_pointing_right:'),
 ('m', ':person_facepalming:')]

## Find the most frequent word–emoji pairs

In [42]:
def get_pair(x):
    """Return the ``(preceding_word, token_at_position)`` pair for one record.

    Parameters
    ----------
    x : sequence
        ``x[1]`` is the integer token position and ``x[2]`` the sentence.

    Returns
    -------
    tuple of (str, str)
        The token just before ``position``, reduced to its alphabetic
        characters and lower-cased, and the raw token at ``position``.
        The word is ``''`` when no token precedes the position.

    Raises
    ------
    IndexError
        If ``position`` is past the end of the space-split sentence.
    """
    pos = x[1]
    words = x[2].split(' ')
    token = words[pos]
    if pos > 0:
        word = ''.join(ch for ch in words[pos - 1] if ch.isalpha()).lower()
    else:
        # Without this guard, words[pos - 1] becomes words[-1] at position 0
        # and silently pairs the token with the LAST word of the sentence.
        word = ''
    return (word, token)

def pair_frequency(x):
    """Count the valid ``(word, emoji)`` pairs of one emoji group.

    ``x[0]`` is the group's emoji and ``x[1]`` an iterable of
    ``(word, token)`` pairs. A pair is counted only when its token equals
    the group's emoji and its word is not empty.

    Returns a list of ``(pair, count)`` tuples sorted by descending count;
    pairs with equal counts keep the order of first appearance.
    """
    emoji, candidates = x[0], x[1]
    counts = {}
    for candidate in candidates:
        if candidate[1] == emoji and candidate[0] != '':
            counts[candidate] = counts.get(candidate, 0) + 1
    return sorted(counts.items(), key=lambda entry: entry[1], reverse=True)

In [43]:
# Key each (word, token) pair by its emoji.
pairs = rdd.map(lambda record: (record[0], get_pair(record)))
# Gather the keyed pairs of each emoji into a list.
group_pairs = (
    pairs
    .groupBy(lambda keyed: keyed[0])
    .map(lambda group: (group[0], list(group[1])))
)
# Strip the repeated emoji key, then rank each emoji's pairs by frequency.
temp = group_pairs.map(lambda group: (group[0], [row[1] for row in group[1]]))
result_2 = temp.map(lambda group: (group[0], pair_frequency(group)))
result_2.take(1)

[(':speaking_head:',
  [(('you', ':speaking_head:'), 340),
   (('army', ':speaking_head:'), 337),
   (('me', ':speaking_head:'), 323),
   (('back', ':speaking_head:'), 309),
   (('up', ':speaking_head:'), 203),
   (('it', ':speaking_head:'), 194),
   (('life', ':speaking_head:'), 192),
   (('parents', ':speaking_head:'), 183),
   (('first', ':speaking_head:'), 145),
   (('that', ':speaking_head:'), 144),
   (('students', ':speaking_head:'), 144),
   (('pregnant', ':speaking_head:'), 136),
   (('line', ':speaking_head:'), 136),
   (('facts', ':speaking_head:'), 134),
   (('go', ':speaking_head:'), 133),
   (('on', ':speaking_head:'), 123),
   (('shit', ':speaking_head:'), 105),
   (('day', ':speaking_head:'), 102),
   (('armys', ':speaking_head:'), 100),
   (('to', ':speaking_head:'), 96),
   (('do', ':speaking_head:'), 89),
   (('nigga', ':speaking_head:'), 79),
   (('other', ':speaking_head:'), 78),
   (('destruction', ':speaking_head:'), 78),
   (('hurts', ':speaking_head:'), 76),
  