# Notebook Tugas 2
Penyusun: M. Abdi Haryadi. H (13519156)

Berkas yang diabaikan:
- Pengguna Instagram (format: `*.json.json`). Alasan: JSON-nya rusak.

In [1]:
import pyspark.sql.functions as psf
import time

In [2]:
def hadoop_to_absolute_path(hadoop_path):
    """Turn a path inside HDFS into a full ``hdfs://`` URI.

    The fixed prefix ``hdfs://localhost:9000`` is put in front of
    ``hadoop_path`` without any separator, so the argument is expected to
    start with ``/`` (as every call in this notebook does).

    Parameters
    ----------
    hadoop_path
        Path (or glob pattern) inside HDFS, e.g. ``/social_media/raw_json/x.json``.

    Returns
    -------
    str
        The prefixed URI.
    """
    hdfs_prefix = "hdfs://localhost:9000"
    return "{}{}".format(hdfs_prefix, hadoop_path)

In [3]:
def read_json_from_hadoop_path(hadoop_path: str):
    """Read the JSON file(s) at an HDFS path into a Spark DataFrame.

    ``hadoop_path`` is expanded to a full URI with ``hadoop_to_absolute_path``
    and passed to ``spark.read.json``. The callers in this notebook pass glob
    patterns (``..._*.json``) to load many files at once.

    NOTE: ``spark`` is not created anywhere in this notebook; it must already
    exist as a global in the kernel (presumably provided by the PySpark
    kernel/shell -- confirm for your environment).
    """
    return spark.read.json(hadoop_to_absolute_path(hadoop_path))

## Instagram Dataframe

In [4]:
# Instagram: number of distinct content IDs per calendar date.
print("Reading Instagram contents ...")
df = read_json_from_hadoop_path("/social_media/raw_json/instagram_*.json")

print("Merging as one dataframe ...")
# Collapse to a single partition before the transformations below.
coalesce_df = df.coalesce(1)

print("Transforming to ID-date format ...")
# "created_time" is interpreted by from_unixtime() as seconds since the Unix
# epoch and rendered as a "yyyy-MM-dd" *string*. Wrapping it in to_date() makes
# "date" a real DateType column, matching the schema produced by the Facebook,
# YouTube and Twitter cells so the later unions combine like-typed columns.
id_date_df = coalesce_df.select(
    psf.col("id"),
    psf.to_date(
        psf.from_unixtime(timestamp="created_time", format="yyyy-MM-dd")
    ).alias("date")
)

print("Removing duplicates ...")
# The same (id, date) pair can appear more than once across the input files;
# keep each pair only once so it is counted once.
distinct_df = id_date_df.distinct()

print("Grouping by date column ...")
date_group = distinct_df.groupBy("date")

print("Counting each group ...")
# count("id") counts the non-null IDs in each date group.
count_df = date_group.agg(psf.count("id").alias("count"))

print("Done!")
instagram_df = count_df

Reading Instagram contents ...
Merging as one dataframe ...
Transforming to ID-date format ...
Removing duplicates ...
Grouping by date column ...
Counting each group ...
Done!


## Facebook Dataframe

In [5]:
# Facebook: number of distinct post IDs per calendar date.
# Pipeline: read -> single partition -> (id, date) -> de-duplicate -> count per date.
print("Reading Facebook posts ...")
facebook_raw = read_json_from_hadoop_path("/social_media/raw_json/facebook_post_*.json")

print("Merging as one dataframe ...")
# Collapse to a single partition before the transformations below.
facebook_single = facebook_raw.coalesce(1)

print("Transforming to ID-date format ...")
# to_date() without a format parses "created_time" with Spark's default rules.
facebook_id_date = facebook_single.select(
    "id",
    psf.to_date("created_time").alias("date"),
)

print("Removing duplicates ...")
facebook_unique = facebook_id_date.distinct()

print("Grouping by date column ...")
facebook_by_date = facebook_unique.groupBy("date")

print("Counting each group ...")
# count("id") counts the non-null IDs in each date group.
facebook_counts = facebook_by_date.agg(psf.count("id").alias("count"))

print("Done!")
facebook_df = facebook_counts

Reading Facebook posts ...
Merging as one dataframe ...
Transforming to ID-date format ...
Removing duplicates ...
Grouping by date column ...
Counting each group ...
Done!


## YouTube Dataframe

### YouTube Comment Dataframe

In [6]:
# YouTube comments: number of distinct top-level comment IDs per calendar date.
print("Reading Youtube comments ...")
yt_comment_raw = read_json_from_hadoop_path("/social_media/raw_json/youtube_comment_*.json")

print("Merging as one dataframe ...")
# Collapse to a single partition before the transformations below.
yt_comment_single = yt_comment_raw.coalesce(1)

print("Transforming to ID-date format ...")
# Both fields are nested under snippet.topLevelComment; selecting the nested
# "id" field yields a column that is simply named "id".
yt_comment_id_date = yt_comment_single.select(
    psf.col("snippet.topLevelComment.id"),
    psf.to_date("snippet.topLevelComment.snippet.publishedAt").alias("date"),
)

print("Removing duplicates ...")
yt_comment_unique = yt_comment_id_date.distinct()

print("Grouping by date column ...")
yt_comment_by_date = yt_comment_unique.groupBy("date")

print("Counting each group ...")
# count("id") counts the non-null IDs in each date group.
yt_comment_counts = yt_comment_by_date.agg(psf.count("id").alias("count"))

print("Done!")
youtube_comment_df = yt_comment_counts

Reading Youtube comments ...
Merging as one dataframe ...
Transforming to ID-date format ...
Removing duplicates ...
Grouping by date column ...
Counting each group ...
Done!


### YouTube Video Dataframe

In [7]:
# YouTube videos: number of distinct video IDs per calendar date.
print("Reading YouTube videos ...")
yt_video_raw = read_json_from_hadoop_path("/social_media/raw_json/youtube_video_*.json")

print("Merging as one dataframe ...")
# Collapse to a single partition before the transformations below.
yt_video_single = yt_video_raw.coalesce(1)

print("Transforming to ID-date format ...")
# The publication timestamp lives in the nested field snippet.publishedAt.
yt_video_id_date = yt_video_single.select(
    psf.col("id"),
    psf.to_date("snippet.publishedAt").alias("date"),
)

print("Removing duplicates ...")
yt_video_unique = yt_video_id_date.distinct()

print("Grouping by date column ...")
yt_video_by_date = yt_video_unique.groupBy("date")

print("Counting each group ...")
# count("id") counts the non-null IDs in each date group.
yt_video_counts = yt_video_by_date.agg(psf.count("id").alias("count"))

print("Done!")
youtube_video_df = yt_video_counts

Reading YouTube videos ...
Merging as one dataframe ...
Transforming to ID-date format ...
Removing duplicates ...
Grouping by date column ...
Counting each group ...
Done!


### Merging

In [8]:
# YouTube total per date = comment count + video count.
print("Combining comment dataframe and video dataframe ...")
# Both inputs are already aggregated to one row per date. A bare union would
# therefore leave TWO rows for any date that has both comments and videos,
# which later shows up as duplicate (date, "youtube") rows in the output.
# Re-aggregate after the union so each date appears once with the summed count.
# Column order stays (date, count), as in the two inputs.
df = (
    youtube_comment_df
    .union(youtube_video_df)
    .groupBy("date")
    .agg(psf.sum("count").alias("count"))
)

print("Merging as one dataframe ...")
# Collapse the combined result to a single partition.
coalesce_df = df.coalesce(1)

print("Done!")
youtube_df = coalesce_df

Combining comment dataframe and video dataframe ...
Merging as one dataframe ...
Done!


## Twitter Dataframe

In [9]:
# Twitter: number of distinct status IDs per calendar date.
print("Reading Twitter status ...")
twitter_raw = read_json_from_hadoop_path("/social_media/raw_json/twitter_status_*.json")

print("Merging as one dataframe ...")
# Collapse to a single partition before the transformations below.
twitter_single = twitter_raw.coalesce(1)

print("Transforming to ID-date format ...")
# Only the 26 characters starting at position 5 of "created_at" are parsed,
# i.e. the first four characters are skipped (presumably a day-of-week prefix
# such as "Wed " -- confirm against the raw data).
created_at_tail = psf.substring("created_at", 5, 26)
twitter_id_date = twitter_single.select(
    "id",
    psf.to_date(created_at_tail, "MMM dd HH:mm:ss Z yyyy").alias("date"),
)

print("Removing duplicates ...")
twitter_unique = twitter_id_date.distinct()

print("Grouping by date column ...")
twitter_by_date = twitter_unique.groupBy("date")

print("Counting each group ...")
# count("id") counts the non-null IDs in each date group.
twitter_counts = twitter_by_date.agg(psf.count("id").alias("count"))

print("Done!")
twitter_df = twitter_counts

Reading Twitter status ...
Merging as one dataframe ...
Transforming to ID-date format ...
Removing duplicates ...
Grouping by date column ...
Counting each group ...
Done!


## Merging

In [10]:
# Start the combined result with the Instagram counts, tagged with their source.
# Output columns, in order: date, social_media, count.
print("Labeling Instagram ...")
instagram_labeled = instagram_df.select(
    "date",
    psf.lit("instagram").alias("social_media"),
    "count",
)

print("Done!")
merged_df = instagram_labeled

Labeling Instagram ...
Done!


In [11]:
# Tag the Facebook counts and put them in front of the rows merged so far.
# NOTE: this cell reads and then rebinds `merged_df`, so running it a second
# time without re-running the preceding merge cell adds the Facebook rows again.
print("Labeling Facebook ...")
facebook_labeled = facebook_df.select(
    "date",
    psf.lit("facebook").alias("social_media"),
    "count",
)

print("Unioning with previous result ...")
# union() matches columns by position; both sides are (date, social_media, count).
facebook_plus_previous = facebook_labeled.union(merged_df)

print("Merging as one dataframe ...")
# Collapse the combined result to a single partition.
facebook_plus_previous_single = facebook_plus_previous.coalesce(1)

print("Done!")
merged_df = facebook_plus_previous_single

Labeling Facebook ...
Unioning with previous result ...
Merging as one dataframe ...
Done!


In [12]:
# Tag the YouTube counts and put them in front of the rows merged so far.
# NOTE: this cell reads and then rebinds `merged_df`, so running it a second
# time without re-running the preceding merge cells adds the YouTube rows again.
print("Labeling YouTube ...")
youtube_labeled = youtube_df.select(
    "date",
    psf.lit("youtube").alias("social_media"),
    "count",
)

print("Unioning with previous result ...")
# union() matches columns by position; both sides are (date, social_media, count).
youtube_plus_previous = youtube_labeled.union(merged_df)

print("Merging as one dataframe ...")
# Collapse the combined result to a single partition.
youtube_plus_previous_single = youtube_plus_previous.coalesce(1)

print("Done!")
merged_df = youtube_plus_previous_single

Labeling YouTube ...
Unioning with previous result ...
Merging as one dataframe ...
Done!


In [13]:
# Tag the Twitter counts and put them in front of the rows merged so far.
# NOTE: this cell reads and then rebinds `merged_df`, so running it a second
# time without re-running the preceding merge cells adds the Twitter rows again.
print("Labeling Twitter ...")
twitter_labeled = twitter_df.select(
    "date",
    psf.lit("twitter").alias("social_media"),
    "count",
)

print("Unioning with previous result ...")
# union() matches columns by position; both sides are (date, social_media, count).
twitter_plus_previous = twitter_labeled.union(merged_df)

print("Merging as one dataframe ...")
# Collapse to a single partition; this is the frame that gets written out.
twitter_plus_previous_single = twitter_plus_previous.coalesce(1)

print("Done!")
merged_df = twitter_plus_previous_single

Labeling Twitter ...
Unioning with previous result ...
Merging as one dataframe ...
Done!


In [15]:
# Write the combined per-date counts to HDFS as CSV and time the write.
# All transformations above are lazy, so this write is what actually executes
# the whole pipeline; the measured duration therefore covers the full job.
print("Writing ...")
start_time = time.time()
# mode="overwrite": with the default save mode the write raises as soon as the
# output directory already exists, which made the notebook fail on every run
# after the first. Overwriting keeps "Restart Kernel -> Run All" repeatable.
merged_df.write.csv(
    hadoop_to_absolute_path("/social_media/task_2_output"),
    mode="overwrite",
)
end_time = time.time()
print("Done!")

Writing ...
Done!


In [16]:
# Wall-clock seconds spent in the CSV write cell (end_time - start_time).
print(f"Duration: {end_time - start_time} s")

Duration: 30.029176950454712 s
