In [None]:
!pip install pyspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, floor
import time



In [None]:
# Create (or reuse) the Spark session, running locally via master "local[*]"
spark = (
    SparkSession.builder
    .appName("TeknikPenyimpanan")
    .master("local[*]")
    .getOrCreate()
)

print("Spark siap digunakan!")

Spark siap digunakan!


In [None]:
# Generate 2 million rows of dummy data.
# Columns: id, kategori (0-4), jumlah (0-99), harga (0-99999)
# floor(rand() * k) yields integers in [0, k-1], so kategori is 0..4, not 1..5.
# Explicit seeds make the random columns repeatable between runs (for the same
# partitioning of the range); each column gets its own seed so the three
# columns are not identical sequences.
SEED = 42
print(" mengenerate data...")
df = (
    spark.range(0, 2000000)
    .withColumn("kategori", floor(rand(seed=SEED) * 5))
    .withColumn("jumlah", floor(rand(seed=SEED + 1) * 100))
    .withColumn("harga", floor(rand(seed=SEED + 2) * 100000))
)

# Cache so the generation step is not repeated for every write below;
# count() is the action that actually materializes the cache.
df.cache()
print(f"Jumlah baris data: {df.count()}")
df.show(5)

 mengenerate data...
Jumlah baris data: 2000000
+---+--------+------+-----+
| id|kategori|jumlah|harga|
+---+--------+------+-----+
|  0|       1|    36|72679|
|  1|       1|    44|46874|
|  2|       0|    39|32905|
|  3|       4|    15|50235|
|  4|       2|    52|50818|
+---+--------+------+-----+
only showing top 5 rows



In [None]:
# Simpan sebagai CSV (Row-based, uncompressed text)
start_time = time.time()
df.write.mode("overwrite").csv("data_csv")
print(f"Waktu tulis CSV: {time.time() - start_time:.2f} detik")

# Simpan sebagai Parquet (Columnar, compressed)
start_time = time.time()
df.write.mode("overwrite").parquet("data_parquet")
print(f"Waktu tulis Parquet: {time.time() - start_time:.2f} detik")

# Membandingkan ukuran file
print("\n--- PERBANDINGAN UKURAN STORAGE ---")
!du -sh data_csv
!du -sh data_parquet

Waktu tulis CSV: 4.41 detik
Waktu tulis Parquet: 3.94 detik

--- PERBANDINGAN UKURAN STORAGE ---
36M	data_csv
19M	data_parquet


In [None]:
# Teknik: Partitioning by 'kategori'
# Data akan disimpan dalam folder terpisah: kategori=0, kategori=1, dst.
start_time = time.time()
df.write.mode("overwrite").partitionBy("kategori").parquet("data_partitioned")
print(f"Waktu tulis Partitioned Parquet: {time.time() - start_time:.2f} detik")

# Lihat struktur folder yang terbentuk
print("\n--- STRUKTUR FOLDER PARTISI ---")
!ls -R data_partitioned | head -n 15

Waktu tulis Partitioned Parquet: 3.59 detik

--- STRUKTUR FOLDER PARTISI ---
data_partitioned:
kategori=0
kategori=1
kategori=2
kategori=3
kategori=4
_SUCCESS

data_partitioned/kategori=0:
part-00000-7a8ea741-45cb-4e39-8db9-1d313d094a53.c000.snappy.parquet
part-00001-7a8ea741-45cb-4e39-8db9-1d313d094a53.c000.snappy.parquet

data_partitioned/kategori=1:
part-00000-7a8ea741-45cb-4e39-8db9-1d313d094a53.c000.snappy.parquet
part-00001-7a8ea741-45cb-4e39-8db9-1d313d094a53.c000.snappy.parquet


In [None]:
# Query 1: read the PLAIN Parquet output (no partitioning)
# With no partition folders, Spark cannot skip files by directory when
# looking for kategori = 3.
flat_t0 = time.time()
spark.read.parquet("data_parquet").where(col("kategori") == 3).count()
flat_elapsed = time.time() - flat_t0
print(f"Waktu Query Non-Partitioned: {flat_elapsed:.4f} detik")

# Query 2: read the PARTITIONED Parquet output
# Spark can go straight to the kategori=3 folder and ignore folders 0, 1, 2, 4.
pruned_t0 = time.time()
spark.read.parquet("data_partitioned").where(col("kategori") == 3).count()
pruned_elapsed = time.time() - pruned_t0
print(f"Waktu Query Partitioned: {pruned_elapsed:.4f} detik")

Waktu Query Non-Partitioned: 1.7237 detik
Waktu Query Partitioned: 0.5699 detik


In [None]:
import os
# Membuat file ZIP untuk data_csv
print("Mengompresi data_csv...")
!tar -czvf data_csv.tar.gz data_csv

# Membuat file ZIP untuk data_parquet
print("Mengompresi data_parquet...")
!tar -czvf data_parquet.tar.gz data_parquet

print("\nFile tar.gz siap diunduh.")
# Memastikan file terkompresi ada
!ls *.tar.gz

Mengompresi data_csv...
data_csv/
data_csv/part-00001-5f0e8fbb-8856-4146-8d94-e59a441bb54a-c000.csv
data_csv/.part-00001-5f0e8fbb-8856-4146-8d94-e59a441bb54a-c000.csv.crc
data_csv/.part-00000-5f0e8fbb-8856-4146-8d94-e59a441bb54a-c000.csv.crc
data_csv/._SUCCESS.crc
data_csv/_SUCCESS
data_csv/part-00000-5f0e8fbb-8856-4146-8d94-e59a441bb54a-c000.csv
Mengompresi data_parquet...
data_parquet/
data_parquet/._SUCCESS.crc
data_parquet/.part-00000-c0c929a0-51ba-45c4-aa9f-98e03a30f98a-c000.snappy.parquet.crc
data_parquet/part-00000-c0c929a0-51ba-45c4-aa9f-98e03a30f98a-c000.snappy.parquet
data_parquet/.part-00001-c0c929a0-51ba-45c4-aa9f-98e03a30f98a-c000.snappy.parquet.crc
data_parquet/_SUCCESS
data_parquet/part-00001-c0c929a0-51ba-45c4-aa9f-98e03a30f98a-c000.snappy.parquet

File tar.gz siap diunduh.
data_csv.tar.gz  data_parquet.tar.gz


In [None]:
from google.colab import files

# Start a browser download for each archive: the CSV one first, then the Parquet one
for archive_name in ('data_csv.tar.gz', 'data_parquet.tar.gz'):
    files.download(archive_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>