In [1]:
from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session. The MongoDB input and output URIs are
# set to the same address, and the Mongo Spark connector is requested through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

In [3]:
# Load the configured input source into a DataFrame through the Mongo connector.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [4]:
# Keep only the two columns used by the rest of the notebook, then preview them.
df = df.select(["emoji", "sentence"])
df.show()

+--------------------+--------------------+
|               emoji|            sentence|
+--------------------+--------------------+
|      :red_heart:,18|No object is so b...|
|:person_shrugging...|Cant expect diffe...|
|:face_with_tears_...|“ Lets go Marcus ...|
|:face_with_tears_...|Asahd really is a...|
|:face_with_tears_...|Yoongi Tweet Hell...|
|:backhand_index_p...|we cannot afford ...|
|:party_popper:,8 ...|ranks 6th in Janu...|
|:person_facepalmi...|Ok people are rea...|
|:smiling_face_wit...|Cant wait to meet...|
| :clapping_hands:,11|Congratulations M...|
|:face_with_tears_...|Met orlando brown...|
|      :weary_face:,4|Im goin to bed :w...|
|  :clapping_hands:,9|Will and Jada on ...|
|:person_shrugging...|EVERYBODY is preg...|
|       :male_sign:,8|I promise to fuck...|
|    :folded_hands:,3|God keep working ...|
|    :party_popper:,4|Happy Birthday to...|
|  :hundred_points:,8|Over 3M Dollars i...|
|:loudly_crying_fa...|I be considering ...|
|:rolling_on_the_f...|My teacher

In [None]:
# Number of rows in the selected DataFrame.
df.count()

## Preprocessing

In [10]:
def divide_emoji(x):
    """Split a space-separated emoji field into its individual entries.

    Parameters
    ----------
    x : str or None
        Raw field holding one or more ``"<emoji>,<position>"`` entries
        separated by single spaces.

    Returns
    -------
    list of str
        The non-empty entries in their original order. Blank entries
        (caused by repeated, leading or trailing spaces) are dropped, and an
        empty or missing field yields ``[]``, so downstream parsing never
        receives a token without a comma.
    """
    if not x:
        return []
    return [entry for entry in x.split(" ") if entry]

def formed_(x):
    """Pair every entry of ``x[0]`` with the sentence ``x[1]``.

    Returns a list containing one ``[entry, sentence]`` list per element of
    ``x[0]``, in the same order.
    """
    return [[entry, x[1]] for entry in x[0]]

In [11]:
# Row -> list, split the emoji field into its entries, then emit one
# [entry, sentence] record per entry.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
)
rdd.take(5)

[[':red_heart:,18',
  'No object is so beautiful that under certain conditions it will not look ugly Oscar Wilde ↺ RT :red_heart: …'],
 [':person_shrugging:,13',
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'],
 [':female_sign:,15',
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'],
 [':face_with_tears_of_joy:,14',
  '“ Lets go Marcus ” “ Shiiit where we goin Home ” Marcus Peters :face_with_tears_of_joy:'],
 [':face_with_tears_of_joy:,14',
  'Asahd really is a grown man in the body of a 1 year old :face_with_tears_of_joy:']]

In [12]:
def separate_emoji(x):
    """Return the part of ``x`` that precedes its first comma.

    If ``x`` contains no comma, the whole string is returned.
    """
    head, _, _ = x.partition(',')
    return head

def separate_position(x):
    """Return the second comma-separated field of ``x`` as a string.

    Raises ``IndexError`` when ``x`` contains no comma.
    """
    # Only the second field is needed, so stop splitting after two commas.
    return x.split(',', 2)[1]

In [13]:
# Turn each [entry, sentence] record into (emoji, position, sentence).
# NOTE: this rebinds `rdd`, so running the cell twice would try to parse
# already-parsed records.
rdd = rdd.map(
    lambda record: (
        separate_emoji(record[0]),
        int(separate_position(record[0])),
        record[1],
    )
)
rdd.take(5)

[(':red_heart:',
  18,
  'No object is so beautiful that under certain conditions it will not look ugly Oscar Wilde ↺ RT :red_heart: …'),
 (':person_shrugging:',
  13,
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'),
 (':female_sign:',
  15,
  'Cant expect different results doing the same thingdoing stuff different from now on :person_shrugging: 🏻 \u200d :female_sign: ️'),
 (':face_with_tears_of_joy:',
  14,
  '“ Lets go Marcus ” “ Shiiit where we goin Home ” Marcus Peters :face_with_tears_of_joy:'),
 (':face_with_tears_of_joy:',
  14,
  'Asahd really is a grown man in the body of a 1 year old :face_with_tears_of_joy:')]

## For every emoji, find the top 10 words that appear most often when the emoji is used.

In [14]:
def count_words(sentence, num_top_word=10):
    """Return the most frequent words of ``sentence``, most frequent first.

    The text is split on any run of whitespace (spaces, tabs, newlines), so
    words on different lines are never fused together. Each token is reduced
    to its alphabetic characters and lower-cased; tokens that end up empty
    are ignored.

    Parameters
    ----------
    sentence : str
        Text to analyse.
    num_top_word : int, optional
        Maximum number of words to return (default 10).

    Returns
    -------
    list of str
        At most ``num_top_word`` words ordered by decreasing frequency.
        Words with equal counts keep their order of first appearance.
    """
    words_frequency = {}
    for token in sentence.split():
        word = ''.join(ch for ch in token if ch.isalpha()).lower()
        if word:
            words_frequency[word] = words_frequency.get(word, 0) + 1
    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(words_frequency.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:max(0, num_top_word)]]

In [15]:
# Drop the position field, keeping (emoji, sentence) pairs.
temp_rdd = rdd.map(lambda record: (record[0], record[2]))
# Group the whole pairs by their emoji (first element).
group_by_emoji = temp_rdd.groupBy(lambda pair: pair[0])

In [16]:
# Materialise each group's iterable as a list: (emoji, [(emoji, sentence), ...]).
operate_rdd = group_by_emoji.mapValues(list)
# operate_rdd.take(1)

In [17]:
def combine_string(x):
    """Concatenate all sentences of a group with the group's emoji removed.

    Parameters
    ----------
    x : iterable
        Items whose element 0 is the emoji text and element 1 is a sentence.

    Returns
    -------
    str
        Every fragment obtained by splitting each sentence on its emoji,
        each fragment followed by one space, in input order.

    Notes
    -----
    Fragments are collected and joined once rather than appended with ``+``
    in a loop, which would copy the growing string on every step. A fragment
    produced by ``str.split(sep)`` can never equal ``sep``, so no check
    against the emoji is needed.
    """
    fragments = []
    for item in x:
        fragments.extend(item[1].split(item[0]))
    return "".join(fragment + " " for fragment in fragments)

In [None]:
# One long emoji-free text per emoji, then its most frequent words.
combined = operate_rdd.mapValues(combine_string)
result = combined.mapValues(count_words)
result.take(1)

## Find which emoji is used most with words beginning with each letter A, B, C, …, Z (in alphabetical order)

In [None]:
def get_letter_stat(x):
    """Find the most common initial letter among the words of ``x``.

    The text is split on any run of whitespace so that words separated by
    tabs or newlines are counted individually. Each token is reduced to its
    alphabetic characters; tokens left empty are skipped, and the first
    character of the lower-cased remainder is tallied.

    Parameters
    ----------
    x : str
        Text to analyse.

    Returns
    -------
    tuple
        ``(letter, count)`` for the most frequent initial letter. On a tie
        the letter that was seen first wins.

    Raises
    ------
    IndexError
        If ``x`` contains no alphabetic character.
    """
    letters = {}
    for token in x.split():
        word = ''.join(ch for ch in token if ch.isalpha())
        if not word:
            continue
        initial = word.lower()[0]
        letters[initial] = letters.get(initial, 0) + 1
    if not letters:
        raise IndexError("no alphabetic words in input")
    # max() returns the first maximal item, matching a stable descending sort.
    return max(letters.items(), key=lambda item: item[1])

In [None]:
# get_letter_stat raises when a text contains no alphabetic word, which would
# fail the whole job; drop such emojis before computing their letter stats.
top_letter_for_emoji = (
    combined
    .filter(lambda kv: any(ch.isalpha() for ch in kv[1]))
    .map(lambda kv: (kv[0], get_letter_stat(kv[1])))
)

In [None]:
def get_the_top(x):
    """Return the label of the entry with the highest score.

    Parameters
    ----------
    x : list
        Entries whose element 0 is a label and element 1 a comparable score.

    Returns
    -------
    object
        Element 0 of the highest-scoring entry; on a tie, the earliest one.

    Raises
    ------
    IndexError
        If ``x`` is empty.

    Notes
    -----
    The input list is left untouched: the maximum is found in one pass
    instead of sorting the caller's list in place.
    """
    if not x:
        raise IndexError("get_the_top() requires at least one entry")
    return max(x, key=lambda entry: entry[1])[0]

In [None]:
# Re-key (emoji, (letter, count)) records as (letter, (emoji, count)).
key_is_letter = top_letter_for_emoji.map(
    lambda kv: (kv[1][0], (kv[0], kv[1][1]))
)
key_is_letter.take(3)

In [None]:
# Group the (letter, (emoji, count)) records by letter and materialise each group.
group_by_letter = (
    key_is_letter
    .groupBy(lambda record: record[0])
    .mapValues(list)
)
# Strip the duplicated key from every grouped record, then keep the emoji
# with the highest count for each letter.
result_1 = (
    group_by_letter
    .mapValues(lambda rows: [row[1] for row in rows])
    .mapValues(get_the_top)
)
result_1.take(3)

## Find the most frequent word–emoji pairs

In [None]:
def get_pair(x):
    """Pair the token at a recorded position with the word just before it.

    Parameters
    ----------
    x : sequence
        Record whose element 1 is an integer token index and element 2 is
        the sentence; the index refers to ``sentence.split(' ')``.

    Returns
    -------
    tuple of (str, str)
        ``(preceding_word, token)``: the preceding token reduced to its
        alphabetic characters and lower-cased, and the token found at the
        index. The word is ``''`` when the index is 0, because nothing
        precedes it. ``('', '')`` is returned when the index lies outside
        the sentence.
    """
    pos = x[1]
    words = x[2].split(' ')
    # A stale or wrong index must not crash, nor wrap round via negative indexing.
    if not 0 <= pos < len(words):
        return ('', '')
    # At index 0, words[pos - 1] would silently pick the *last* token.
    preceding = words[pos - 1] if pos > 0 else ''
    word = ''.join(ch for ch in preceding if ch.isalpha()).lower()
    return (word, words[pos])

def pair_frequency(x):
    """Count how often each (word, emoji) pair occurs for one emoji.

    ``x[0]`` is the emoji and ``x[1]`` an iterable of ``(word, token)``
    pairs. Only pairs whose token equals the emoji and whose word is
    non-empty are counted.

    Returns a list of ``(pair, count)`` tuples sorted by decreasing count;
    pairs with equal counts keep their order of first appearance.
    """
    emoji, pairs = x[0], x[1]
    counts = {}
    for pair in pairs:
        if pair[1] == emoji and pair[0] != '':
            counts[pair] = counts.get(pair, 0) + 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

In [None]:
# (emoji, (preceding_word, token_at_position)) for every record.
pairs = rdd.map(lambda record: (record[0], get_pair(record)))
# Group the records by emoji and materialise each group as a list.
group_pairs = (
    pairs
    .groupBy(lambda record: record[0])
    .mapValues(list)
)
# Keep only the (word, token) pair of each grouped record.
temp = group_pairs.mapValues(lambda rows: [row[1] for row in rows])
# pair_frequency needs the whole (emoji, pairs) record, so use map here.
result_2 = temp.map(lambda kv: (kv[0], pair_frequency(kv)))
result_2.take(1)