In [1]:
from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the Spark session. The MongoDB input URI points at
# bigdata.raw, the output URI at bigdata.pairWise, and the connector is
# requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/bigdata.raw")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.pairWise")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1")
    .getOrCreate()
)

In [3]:
# Load a DataFrame through the MongoDB Spark connector data source.
# No URI is passed here; presumably the session's spark.mongodb.input.uri
# is used — confirm against the connector documentation.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [10]:
# Keep only the two columns used below: "emoji" and "sentence".
# NOTE: this rebinds `df`, so the full loaded frame is no longer reachable
# under that name after this cell runs.
df = df.select("emoji", "sentence")

In [11]:
def divide_emoji(x):
    """Split a space-separated emoji field into its individual entries.

    Args:
        x: String of entries separated by single spaces. Judging by how the
            result is consumed in this notebook, each entry looks like
            ``"<emoji>,<position>"`` — TODO confirm against the data source.

    Returns:
        list[str]: The non-empty entries in their original order. An empty
        string, or leading/trailing/repeated spaces, contribute no entries.
    """
    # str.split(" ") yields "" for empty input and for every extra space;
    # such tokens carry no data, so drop them here.
    return [entry for entry in x.split(" ") if entry]

def formed_(x):
    """Expand one ``(entries, payload)`` record into one row per entry.

    Args:
        x: Indexable whose ``x[0]`` is an iterable of entries and whose
            ``x[1]`` is the value to attach to each of them.

    Returns:
        list[list]: ``[entry, x[1]]`` for every entry in ``x[0]``, in order.
    """
    entries, payload = x[0], x[1]
    return [[entry, payload] for entry in entries]

def separate_emoji(x):
    """Return the part of ``x`` that precedes its first comma.

    If ``x`` contains no comma, the whole string is returned.
    """
    head, _, _ = x.partition(',')
    return head

def separate_position(x):
    """Return the second comma-separated field of ``x`` as a string.

    Raises:
        IndexError: If ``x`` contains no comma.
    """
    return x.split(',')[1]

In [12]:
# Turn each DataFrame row into (emoji, position, sentence) records:
#   1. rows become plain lists,
#   2. the first field is split into its individual entries,
#   3. each entry gets its own record alongside the second field,
#   4. each entry is split into its emoji part and its integer position.
rdd = (
    df.rdd
    .map(list)
    .map(lambda row: (divide_emoji(row[0]), row[1]))
    .flatMap(formed_)
    .map(lambda rec: (separate_emoji(rec[0]), int(separate_position(rec[0])), rec[1]))
)
rdd.take(5)

[(':sparkling_heart:', 6, 'One of the things Ive waited :sparkling_heart:'),
 (':face_with_tears_of_joy:',
  13,
  'This video must confuse the crap out of liberals Who do they defend :face_with_tears_of_joy:'),
 (':face_with_tears_of_joy:',
  5,
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'),
 (':folded_hands:',
  16,
  'Stalin Atrocities Lollu Sabha Mix :face_with_tears_of_joy: Dedicated to all உப ி ஸ ் in the social media :folded_hands: 🏼'),
 (':face_with_tears_of_joy:',
  18,
  'I only use dpns now for icord and thumbs or if I cant find a long enough circ :face_with_tears_of_joy: Too many WIPs around')]

In [13]:
def get_pair(x):
    """Pair the word just before a position with the token at that position.

    Args:
        x: Indexable record whose ``x[1]`` is an integer token index and
            whose ``x[2]`` is the sentence text. ``x[0]`` is not read.

    Returns:
        tuple[str, str]: ``(word, token)``. ``token`` is the space-separated
        token at index ``x[1]``. ``word`` is the preceding token reduced to
        its alphabetic characters and lower-cased; it is ``''`` when there is
        no preceding token. ``('', '')`` is returned when the index does not
        fall inside the sentence.
    """
    pos = x[1]
    words = x[2].split(' ')
    if not 0 <= pos < len(words):
        # A position outside the sentence cannot be paired; return an empty
        # pair rather than raising (or wrapping around via a negative index).
        return ('', '')
    if pos == 0:
        # Nothing precedes the first token. Without this guard words[pos - 1]
        # would be words[-1], i.e. the *last* token of the sentence.
        return ('', words[0])
    word = ''.join(ch for ch in words[pos - 1] if ch.isalpha()).lower()
    return (word, words[pos])

def pair_frequency(x):
    """Count the usable pairs recorded for one emoji, most frequent first.

    Args:
        x: Indexable whose ``x[0]`` is the emoji and whose ``x[1]`` is an
            iterable of hashable two-item pairs.

    Returns:
        list: ``(pair, count)`` tuples sorted by descending count; pairs with
        equal counts keep first-seen order. A pair is counted only when its
        second item equals the emoji and its first item is not ``''``.
    """
    target, candidates = x[0], x[1]
    counts = {}
    for pair in candidates:
        if pair[1] == target and pair[0] != '':
            counts[pair] = counts.get(pair, 0) + 1
    # sorted() is stable, including with reverse=True, so ties stay in
    # insertion order.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

In [23]:
# (emoji, (preceding_word, token_at_position)) for every record.
pairs = rdd.map(lambda rec: (rec[0], get_pair(rec)))

# Collect all records that share an emoji and materialise each group as a list.
group_pairs = pairs.groupBy(lambda kv: kv[0]).map(lambda grp: (grp[0], list(grp[1])))

# Strip the repeated emoji key from the grouped rows, keeping only the pairs.
temp = group_pairs.map(lambda grp: (grp[0], [pair for _, pair in grp[1]]))

# Per-emoji pair counts; drop emojis with no usable pair and put the emojis
# with the most distinct pairs first.
result_2 = (
    temp
    .map(lambda grp: (grp[0], pair_frequency(grp)))
    .filter(lambda grp: len(grp[1]) != 0)
    .sortBy(lambda grp: len(grp[1]), ascending=False)
)
result_2.take(2)

[(':face_with_tears_of_joy:',
  [(('defend', ':face_with_tears_of_joy:'), 1),
   (('mix', ':face_with_tears_of_joy:'), 1),
   (('circ', ':face_with_tears_of_joy:'), 1),
   (('forward', ':face_with_tears_of_joy:'), 1),
   (('like', ':face_with_tears_of_joy:'), 1),
   (('show', ':face_with_tears_of_joy:'), 1),
   (('time', ':face_with_tears_of_joy:'), 1),
   (('son', ':face_with_tears_of_joy:'), 1),
   (('places', ':face_with_tears_of_joy:'), 1),
   (('momma', ':face_with_tears_of_joy:'), 1),
   (('loop', ':face_with_tears_of_joy:'), 1),
   (('laugh', ':face_with_tears_of_joy:'), 1),
   (('min', ':face_with_tears_of_joy:'), 1),
   (('through', ':face_with_tears_of_joy:'), 1),
   (('etc', ':face_with_tears_of_joy:'), 1)]),
 (':weary_face:',
  [(('things', ':weary_face:'), 1),
   (('is', ':weary_face:'), 1),
   (('like', ':weary_face:'), 1),
   (('piece', ':weary_face:'), 1)])]

In [24]:
# Convert the per-emoji result RDD into a DataFrame. No schema or column names
# are passed, so naming and typing are left to Spark's inference.
result_df = result_2.toDF()

In [25]:
# Persist the result through the MongoDB Spark connector. No URI is given
# here; presumably the session's spark.mongodb.output.uri is used — confirm.
# NOTE: the write mode is "append", so running this cell again writes the
# same rows again.
(
    result_df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)