Potential Interview Questions
-----------------------------
1. What is partitioning in Spark?
2. What is bucketing in Spark?
3. Why do we need partitioning and bucketing?
4. When should we use partitioning?
5. When should we use bucketing?

In [0]:
# Load the employee CSV from DBFS into the DataFrame `df` used by every cell below.
df = (
    spark.read.format("csv")
    .option("header", "true")        # take column names from the first row
    .option("inferSchema", "true")   # let Spark derive the column types from the data
    .option("mode", "PERMISSIVE")    # reader parse mode: keep malformed rows instead of failing
    .load('/FileStore/tables/emp_write.csv')
)

In [0]:
# Print the rows without truncating long cell values (n=20 is the default row limit).
df.show(n=20, truncate=False)

+---+--------+---+------+-------+------+
|id |name    |age|salary|address|gender|
+---+--------+---+------+-------+------+
|1  |Manish  |26 |75000 |INDIA  |m     |
|2  |Nikita  |23 |100000|USA    |f     |
|3  |Pritam  |22 |150000|INDIA  |m     |
|4  |Prantosh|17 |200000|JAPAN  |m     |
|5  |Vikash  |31 |300000|USA    |m     |
|6  |Rahul   |55 |300000|INDIA  |m     |
|7  |Raju    |67 |540000|USA    |m     |
|8  |Praveen |28 |70000 |JAPAN  |m     |
|9  |Dev     |32 |150000|JAPAN  |m     |
|10 |Sherin  |16 |25000 |RUSSIA |f     |
|11 |Ragu    |12 |35000 |INDIA  |f     |
|12 |Sweta   |43 |200000|INDIA  |f     |
|13 |Raushan |48 |650000|USA    |m     |
|14 |Mukesh  |36 |95000 |RUSSIA |m     |
|15 |Prakash |52 |750000|INDIA  |m     |
+---+--------+---+------+-------+------+



In [0]:
# Write `df` as CSV partitioned by `address`: Spark creates one
# `address=<value>/` sub-directory per distinct address under the target path.
#
# The save mode has to be set with .mode(). Writing .option("mode", "overwrite")
# only passes a data-source option and leaves the writer at its default
# ("errorifexists"), so re-running the cell fails once the path exists.
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .partitionBy("address")
    .save('/FileStore/tables/partition_by_address')
)

In [0]:
# How to delete a single file in DBFS.
# NOTE(review): 'file_path' looks like a placeholder, not a real location —
# replace it with the DBFS path of the file to remove before running this cell.
dbutils.fs.rm('file_path')

In [0]:
# How to delete a folder in DBFS (-r removes the directory recursively).
# NOTE(review): no cell visible in this notebook writes to
# '/FileStore/tables/partition_by_csv' — confirm this is the intended target.
%fs rm -r '/FileStore/tables/partition_by_csv'

In [0]:
# Inspect the output directory: expect a `_SUCCESS` marker plus one
# `address=<value>/` folder per distinct address.
address_partition_entries = dbutils.fs.ls('/FileStore/tables/partition_by_address')
display(address_partition_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_address/_SUCCESS,_SUCCESS,0,1703243842000
dbfs:/FileStore/tables/partition_by_address/address=INDIA/,address=INDIA/,0,0
dbfs:/FileStore/tables/partition_by_address/address=JAPAN/,address=JAPAN/,0,0
dbfs:/FileStore/tables/partition_by_address/address=RUSSIA/,address=RUSSIA/,0,0
dbfs:/FileStore/tables/partition_by_address/address=USA/,address=USA/,0,0


In [0]:
# Write `df` as CSV partitioned by `id`. Every row has a distinct id, so this
# produces one `id=<value>/` directory per row — a demonstration of why a
# high-cardinality column is a poor partition key (many tiny files).
#
# The save mode has to be set with .mode(). Writing .option("mode", "overwrite")
# only passes a data-source option and leaves the writer at its default
# ("errorifexists"), so re-running the cell fails once the path exists.
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .partitionBy("id")
    .save('/FileStore/tables/partition_by_id')
)

In [0]:
# Inspect the output directory: expect a `_SUCCESS` marker plus `id=<value>/` folders.
id_partition_entries = dbutils.fs.ls('/FileStore/tables/partition_by_id')
display(id_partition_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_id/_SUCCESS,_SUCCESS,0,1703243914000
dbfs:/FileStore/tables/partition_by_id/id=1/,id=1/,0,0
dbfs:/FileStore/tables/partition_by_id/id=10/,id=10/,0,0
dbfs:/FileStore/tables/partition_by_id/id=11/,id=11/,0,0
dbfs:/FileStore/tables/partition_by_id/id=12/,id=12/,0,0
dbfs:/FileStore/tables/partition_by_id/id=13/,id=13/,0,0
dbfs:/FileStore/tables/partition_by_id/id=14/,id=14/,0,0
dbfs:/FileStore/tables/partition_by_id/id=15/,id=15/,0,0
dbfs:/FileStore/tables/partition_by_id/id=2/,id=2/,0,0
dbfs:/FileStore/tables/partition_by_id/id=3/,id=3/,0,0


In [0]:
# Write `df` as CSV partitioned by two columns. The order of the columns sets
# the directory nesting: `address=<value>/` at the top level, with
# `gender=<value>/` folders inside each address folder.
#
# The save mode has to be set with .mode(). Writing .option("mode", "overwrite")
# only passes a data-source option and leaves the writer at its default
# ("errorifexists"), so re-running the cell fails once the path exists.
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .partitionBy("address", "gender")
    .save('/FileStore/tables/partition_by_address_gender')
)

In [0]:
# Top level of the two-column partition layout: only the first partition
# column (`address=<value>/`) appears here.
address_gender_entries = dbutils.fs.ls('/FileStore/tables/partition_by_address_gender')
display(address_gender_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_address_gender/_SUCCESS,_SUCCESS,0,1703243975000
dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/,address=INDIA/,0,0
dbfs:/FileStore/tables/partition_by_address_gender/address=JAPAN/,address=JAPAN/,0,0
dbfs:/FileStore/tables/partition_by_address_gender/address=RUSSIA/,address=RUSSIA/,0,0
dbfs:/FileStore/tables/partition_by_address_gender/address=USA/,address=USA/,0,0


In [0]:
# One level down: inside a single address folder the second partition
# column shows up as `gender=<value>/` sub-folders.
india_gender_entries = dbutils.fs.ls('/FileStore/tables/partition_by_address_gender/address=INDIA/')
display(india_gender_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/gender=f/,gender=f/,0,0
dbfs:/FileStore/tables/partition_by_address_gender/address=INDIA/gender=m/,gender=m/,0,0


In [0]:
# Same two partition columns as the address/gender write, but in reverse order:
# `gender=<value>/` is now the top level, with `address=<value>/` folders nested
# inside each gender folder.
#
# The save mode has to be set with .mode(). Writing .option("mode", "overwrite")
# only passes a data-source option and leaves the writer at its default
# ("errorifexists"), so re-running the cell fails once the path exists.
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .partitionBy("gender", "address")
    .save('/FileStore/tables/partition_by_gender_address')
)

In [0]:
# Top level of the gender-first layout: expect `gender=<value>/` folders.
gender_address_entries = dbutils.fs.ls('/FileStore/tables/partition_by_gender_address')
display(gender_address_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_gender_address/_SUCCESS,_SUCCESS,0,1703244149000
dbfs:/FileStore/tables/partition_by_gender_address/gender=f/,gender=f/,0,0
dbfs:/FileStore/tables/partition_by_gender_address/gender=m/,gender=m/,0,0


In [0]:
# One level down: inside `gender=m/` the second partition column appears
# as `address=<value>/` sub-folders.
male_address_entries = dbutils.fs.ls('/FileStore/tables/partition_by_gender_address/gender=m/')
display(male_address_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_by_gender_address/gender=m/address=INDIA/,address=INDIA/,0,0
dbfs:/FileStore/tables/partition_by_gender_address/gender=m/address=JAPAN/,address=JAPAN/,0,0
dbfs:/FileStore/tables/partition_by_gender_address/gender=m/address=RUSSIA/,address=RUSSIA/,0,0
dbfs:/FileStore/tables/partition_by_gender_address/gender=m/address=USA/,address=USA/,0,0


In [0]:
# Bucket `df` into 3 buckets on `id` and register the result as the table
# `emp_bucket_by_id`. Bucketing is only supported through saveAsTable(), not
# save(); the "path" option stores the table's files at the given DBFS location.
#
# The save mode has to be set with .mode(). Writing .option("mode", "overwrite")
# only passes a data-source option and leaves the writer at its default
# ("errorifexists"), so re-running the cell fails once the table exists.
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .bucketBy(3, "id")
    .option("path", '/FileStore/tables/bucket_by_id')
    .saveAsTable("emp_bucket_by_id")
)

In [0]:
# Inspect the bucketed table's directory: the bucket files sit directly in
# the folder (suffixes `_00000`, `_00001`, `_00002`) — there are no
# per-value sub-directories as there are with partitionBy.
bucket_file_entries = dbutils.fs.ls('/FileStore/tables/bucket_by_id')
display(bucket_file_entries)

path,name,size,modificationTime
dbfs:/FileStore/tables/bucket_by_id/_SUCCESS,_SUCCESS,0,1703244515000
dbfs:/FileStore/tables/bucket_by_id/_committed_1437846646421472739,_committed_1437846646421472739,309,1703244515000
dbfs:/FileStore/tables/bucket_by_id/_started_1437846646421472739,_started_1437846646421472739,0,1703244514000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-1_00000.c000.csv,part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-1_00000.c000.csv,270,1703244514000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-2_00001.c000.csv,part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-2_00001.c000.csv,113,1703244514000
dbfs:/FileStore/tables/bucket_by_id/part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-3_00002.c000.csv,part-00000-tid-1437846646421472739-c6d941bb-e88d-4cfa-a83f-9a3ed7394c6d-193-3_00002.c000.csv,115,1703244515000
