# RDD Social Media
### Muhammad Naufal Satriandana 13520068

## Initial Setup

In [11]:
import datetime
import time

url = "hdfs://127.0.0.1:9000/socmed_input/"
out_url = "hdfs://127.0.0.1:9000/socmed_output/"

## Instagram

In [12]:
ig = spark.read.json(f"{url}instagram*.json").rdd
# map the lines to a key-value pair
def map(line):
    date = line['created_time']
    # format date
    date = datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d')
    return (("instagram", date), 1)

ig = ig.map(map)

ig = ig.reduceByKey(lambda x, y: x + y)

ig = ig.map(lambda x: (x[0][0], x[0][1], x[1]))

ig.take(5)

[('instagram', '2022-01-22', 8),
 ('instagram', '2022-01-19', 20),
 ('instagram', '2022-01-08', 8),
 ('instagram', '2021-12-31', 16),
 ('instagram', '2021-12-29', 1)]

## Youtube

In [13]:
yt = spark.read.json(f"{url}youtube*.json").rdd
# map the lines to a key-value pair
def map(line):
    snippet = line['snippet']
    # check if comment is a reply
    if snippet['topLevelComment'] is not None:
        snippet = snippet['topLevelComment']['snippet']
    # get date from snippet.publishedAt
    date = snippet['publishedAt'][0:10]
    return (("youtube", date), 1)

yt = yt.map(map)

yt = yt.reduceByKey(lambda x, y: x + y)

yt = yt.map(lambda x: (x[0][0], x[0][1], x[1]))

yt.take(5)

[('youtube', '2021-05-21', 2),
 ('youtube', '2021-05-18', 2),
 ('youtube', '2021-06-01', 2),
 ('youtube', '2021-05-14', 2),
 ('youtube', '2021-05-12', 1)]

## Twitter

In [4]:
twt = spark.read.json(f"{url}twitter*.json").rdd
# map the lines to a key-value pair
def map(line):
    date = line['created_at']
    # format date
    date = datetime.datetime.strptime(date, '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
    return (("twitter", date), 1)

twt = twt.map(map)

twt = twt.reduceByKey(lambda x, y: x + y)

twt = twt.map(lambda x: (x[0][0], x[0][1], x[1]))

twt.take(5)

[('twitter', '2021-12-12', 5),
 ('twitter', '2021-03-31', 1),
 ('twitter', '2021-03-23', 2),
 ('twitter', '2021-03-30', 1),
 ('twitter', '2021-03-24', 1)]

## Facebook

In [5]:
fb = spark.read.json(f"{url}facebook*.json").rdd

def map(line):
    result = []
    postDate = line["created_time"][0:10]
    comments = line["comments"]["data"]
    result.append((("facebook", postDate), 1))
    for comment in comments:
        date = comment["created_time"][0:10]
        result.append((("facebook", date), 1))
    return result

fb = fb.flatMap(map)

fb = fb.reduceByKey(lambda x, y: x + y)

fb = fb.map(lambda x: (x[0][0], x[0][1], x[1]))
fb.take(5)

[('facebook', '2021-05-03', 2),
 ('facebook', '2021-05-04', 3),
 ('facebook', '2021-05-06', 1),
 ('facebook', '2021-10-22', 1),
 ('facebook', '2021-04-27', 34)]

## Others

In [6]:
others = spark.read.option("multiline","true").json(f"{url}*.json.json").rdd
# map the lines to a key-value pair
def map(line):
    graphImages = line['GraphImages']
    result = []
    if(graphImages is not None):
        for image in graphImages:
            if image['taken_at_timestamp'] is not None:
                date = datetime.datetime.fromtimestamp(image['taken_at_timestamp']).strftime('%Y-%m-%d')
                result.append((("instagram", date), 1))
            if image['comments'] is not None:
                comments = image['comments']['data']
                for comment in comments:
                    date = datetime.datetime.fromtimestamp(comment['created_at']).strftime('%Y-%m-%d')
                    result.append((("instagram", date), 1))
    return result

others = others.flatMap(map)

others = others.reduceByKey(lambda x, y: x + y)

others.map(lambda x: (x[0][0], x[0][1], x[1])).collect()

others.take(5)

[(('instagram', '2022-02-09'), 55),
 (('instagram', '2022-02-11'), 268),
 (('instagram', '2022-02-12'), 199),
 (('instagram', '2022-02-13'), 48),
 (('instagram', '2022-02-08'), 53)]

## Combined

In [7]:
RDD = ig.union(yt.union(twt.union(fb)))
RDD.collect()

[('instagram', '2022-01-22', 8),
 ('instagram', '2022-01-19', 20),
 ('instagram', '2022-01-08', 8),
 ('instagram', '2021-12-31', 16),
 ('instagram', '2021-12-29', 1),
 ('instagram', '2021-01-17', 12),
 ('instagram', '2022-01-26', 70),
 ('instagram', '2022-01-25', 23),
 ('instagram', '2021-03-14', 2),
 ('instagram', '2022-01-23', 9),
 ('instagram', '2022-01-16', 8),
 ('instagram', '2022-01-14', 17),
 ('instagram', '2022-01-13', 11),
 ('instagram', '2022-01-10', 12),
 ('instagram', '2022-01-09', 9),
 ('instagram', '2022-01-05', 18),
 ('instagram', '2022-01-03', 11),
 ('instagram', '2022-01-01', 8),
 ('instagram', '2021-03-22', 2),
 ('instagram', '2021-03-29', 1),
 ('instagram', '2021-01-16', 15),
 ('instagram', '2022-01-21', 18),
 ('instagram', '2022-01-20', 19),
 ('instagram', '2022-01-18', 18),
 ('instagram', '2022-01-12', 10),
 ('instagram', '2022-01-11', 12),
 ('instagram', '2022-01-04', 19),
 ('instagram', '2021-03-20', 38),
 ('instagram', '2021-03-21', 14),
 ('instagram', '2021-01-

## Export

In [8]:
df = RDD.toDF(['social_media', 'date', 'count'])
df.show()

+------------+----------+-----+
|social_media|      date|count|
+------------+----------+-----+
|   instagram|2022-01-22|    8|
|   instagram|2022-01-19|   20|
|   instagram|2022-01-08|    8|
|   instagram|2021-12-31|   16|
|   instagram|2021-12-29|    1|
|   instagram|2021-01-17|   12|
|   instagram|2022-01-26|   70|
|   instagram|2022-01-25|   23|
|   instagram|2021-03-14|    2|
|   instagram|2022-01-23|    9|
|   instagram|2022-01-16|    8|
|   instagram|2022-01-14|   17|
|   instagram|2022-01-13|   11|
|   instagram|2022-01-10|   12|
|   instagram|2022-01-09|    9|
|   instagram|2022-01-05|   18|
|   instagram|2022-01-03|   11|
|   instagram|2022-01-01|    8|
|   instagram|2021-03-22|    2|
|   instagram|2021-03-29|    1|
+------------+----------+-----+
only showing top 20 rows



In [9]:
df.write.csv(out_url)

AnalysisException: 'path hdfs://127.0.0.1:9000/sample_output already exists.;'