## 1. Create SparkSession Object with app specific configuration

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession. \
            builder. \
            appName("Exam-Preparation"). \
            master("yarn"). \
            config("spark.ui.port", "0"). \
            enableHiveSupport(). \
            getOrCreate()

## 2. Place data in Hadoop

In [4]:
%%sh

git clone https://github.com/Akashpatel579/CCA175Exam.git

Cloning into 'CCA175Exam'...


In [5]:
%%sh

hdfs dfs -put CCA175Exam /public/practica

In [8]:
%%sh

hdfs dfs -ls /public/practica/json

Found 1 items
-rw-r--r--   1 evivancovid supergroup     104703 2022-05-18 20:00 /public/practica/json/solar_hot_water_heater_data.json


## 3. Read JSON format data to Spark DataFrame

In [9]:
%%sh

hdfs dfs -tail /public/practica/json/solar_hot_water_heater_data.json

N_DATE": "2016-07-15", "REVISION_NUM": 0}, {"STATUS": "Closed", "POSTAL": "M1R", "DESCRIPTION": "Proposal to install roof-top solar panels on existing single family detached dwelling.", "PERMIT_NUM": "16 197904", "STREET_NAME": "PRINCEWAY", "WORK": "Solar Domestic Hot Water (Res)", "COMPLETED_DATE": "2018-09-11", "ISSUED_DATE": "2016-08-18", "PERMIT_TYPE": "Small Residential Projects", "STREET_TYPE": "DR", "STRUCTURE_TYPE": "SFD - Detached", "STREET_DIRECTION": " ", "STREET_NUM": "75", "_id": 6723, "APPLICATION_DATE": "2016-07-27", "REVISION_NUM": 0}, {"STATUS": "Permit Issued", "POSTAL": "M6H", "DESCRIPTION": "ROOFTOP PV SOLAR PANEL INSTALLATION", "PERMIT_NUM": "17 242944", "STREET_NAME": "PAULINE", "WORK": "Solar Domestic Hot Water (Res)", "COMPLETED_DATE": null, "ISSUED_DATE": "2017-10-26", "PERMIT_TYPE": "Small Residential Projects", "STREET_TYPE": "AVE", "STRUCTURE_TYPE": "SFD - Semi-Detached", "STREET_DIRECTION": " ", "STREET_NUM": "44", "_id": 6724, "APPLICATION_DATE": "2017-10-

In [2]:
df = spark.read.json("/public/practica/json/solar_hot_water_heater_data.json")
df2 = df

In [15]:
type(df)

pyspark.sql.dataframe.DataFrame

## 4. Print Schema of the DataFrame

In [16]:
df.printSchema()

root
 |-- APPLICATION_DATE: string (nullable = true)
 |-- COMPLETED_DATE: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- ISSUED_DATE: string (nullable = true)
 |-- PERMIT_NUM: string (nullable = true)
 |-- PERMIT_TYPE: string (nullable = true)
 |-- POSTAL: string (nullable = true)
 |-- REVISION_NUM: long (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- STREET_DIRECTION: string (nullable = true)
 |-- STREET_NAME: string (nullable = true)
 |-- STREET_NUM: string (nullable = true)
 |-- STREET_TYPE: string (nullable = true)
 |-- STRUCTURE_TYPE: string (nullable = true)
 |-- WORK: string (nullable = true)
 |-- _id: long (nullable = true)



## 5. Print first 5 records, understand and use the truncate parameter

In [18]:
df.show(5, truncate = True)

+----------------+--------------+--------------------+-----------+----------+--------------------+------+------------+-------------+----------------+-----------+----------+-----------+-------------------+--------------------+----+
|APPLICATION_DATE|COMPLETED_DATE|         DESCRIPTION|ISSUED_DATE|PERMIT_NUM|         PERMIT_TYPE|POSTAL|REVISION_NUM|       STATUS|STREET_DIRECTION|STREET_NAME|STREET_NUM|STREET_TYPE|     STRUCTURE_TYPE|                WORK| _id|
+----------------+--------------+--------------------+-----------+----------+--------------------+------+------------+-------------+----------------+-----------+----------+-----------+-------------------+--------------------+----+
|      2009-05-07|    2010-10-05|installation of s...| 2009-05-11| 09 135007|Small Residential...|   M4K|           0|       Closed|                |  GRANDVIEW|         4|        AVE|SFD - Semi-Detached|Solar Domestic Ho...|6561|
|      2009-05-07|          null|Installation of a...| 2009-05-11| 09 135023

## 6. Print column names

In [20]:
df.columns

['APPLICATION_DATE',
 'COMPLETED_DATE',
 'DESCRIPTION',
 'ISSUED_DATE',
 'PERMIT_NUM',
 'PERMIT_TYPE',
 'POSTAL',
 'REVISION_NUM',
 'STATUS',
 'STREET_DIRECTION',
 'STREET_NAME',
 'STREET_NUM',
 'STREET_TYPE',
 'STRUCTURE_TYPE',
 'WORK',
 '_id']

## 7. Drop a column from the DataFrame

In [21]:
df.drop("DESCRIPTION")

APPLICATION_DATE,COMPLETED_DATE,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id
2009-05-07,2010-10-05,2009-05-11,09 135007,Small Residential...,M4K,0,Closed,,GRANDVIEW,4,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6561
2009-05-07,,2009-05-11,09 135023,Small Residential...,M4M,0,Inspection,,JONES,81,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6562
2009-05-11,2011-01-14,2009-05-20,09 136204,Small Residential...,M4J,0,Closed,,BOULTBEE,114,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6563
2009-06-11,2010-03-03,2009-08-17,09 148320,Small Residential...,M4M,0,Closed,E,GERRARD,899,ST,Converted House,Solar Domestic Ho...,6564
2009-06-11,,2009-08-17,09 148341,Small Residential...,M4M,0,Permit Issued,,LOGAN,201,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6565
2009-06-11,2011-02-15,2009-08-17,09 148351,Small Residential...,M4K,0,Cancelled,,BROWNING,5,AVE,SFD - Detached,Solar Domestic Ho...,6566
2009-06-19,2010-07-14,2010-02-03,09 152120,Small Residential...,M4M,0,Closed,,LESLIE,271,ST,SFD - Detached,Solar Domestic Ho...,6567
2009-08-04,,2009-11-18,09 153159,Small Residential...,M4M,0,Inspection,,BRIGHTON,11,AVE,SFD - Detached,Solar Domestic Ho...,6568
2009-08-04,,2009-08-17,09 153172,Small Residential...,M9B,0,Permit Issued,,GREENFIELD,69,DR,SFD - Detached,Solar Domestic Ho...,6569
2009-08-04,2017-11-17,2009-08-17,09 153185,Small Residential...,M4L,0,Cancelled,,SANDFORD,11,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6570


## 8. Print number of records

In [26]:
df.count()

164

## 9. Print number of unique value per columns

In [6]:
from pyspark.sql.functions import col, countDistinct

df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns))

APPLICATION_DATE,COMPLETED_DATE,DESCRIPTION,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id
72,85,71,69,164,1,43,1,5,4,123,127,8,4,1,164


## 10. How many types of STATUS values are there?

In [4]:
df.select(col("STATUS")).distinct().count()

5

In [3]:
from pyspark.sql.functions import lit, count

df.groupBy("STATUS").agg(count(lit(1)).alias("NUMBER OF APPLICATIONS"))

STATUS,NUMBER OF APPLICATIONS
Permit Issued,21
Inspection,18
Cancelled,12
Work Not Started,2
Closed,111


## 11. Filter the DataFrame based on the values of a single column

In [6]:
df.filter(col("STATUS") == "Cancelled")

APPLICATION_DATE,COMPLETED_DATE,DESCRIPTION,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id
2009-06-11,2011-02-15,Permit for instal...,2009-08-17,09 148351,Small Residential...,M4K,0,Cancelled,,BROWNING,5,AVE,SFD - Detached,Solar Domestic Ho...,6566
2009-08-04,2017-11-17,Permit for instal...,2009-08-17,09 153185,Small Residential...,M4L,0,Cancelled,,SANDFORD,11,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6570
2010-01-04,2018-05-25,Permit for instal...,2010-01-07,10 100408,Small Residential...,M4M,0,Cancelled,,LOGAN,237,AVE,SFD - Townhouse,Solar Domestic Ho...,6646
2010-02-25,2016-12-01,install hot water...,,10 123466,Small Residential...,M1V,0,Cancelled,,SANWOOD,5,BLVD,SFD - Detached,Solar Domestic Ho...,6649
2010-07-30,2015-07-23,Permit for instal...,2010-07-30,10 231290,Small Residential...,M4K,0,Cancelled,,CAMBRIDGE,157,AVE,SFD - Detached,Solar Domestic Ho...,6657
2010-09-29,2019-03-29,rooftop solar panel,2015-03-19,10 268200,Small Residential...,M1S,0,Cancelled,,BLUEBERRY,12,DR,SFD - Detached,Solar Domestic Ho...,6659
2010-11-04,2013-03-21,Permit for instal...,2010-11-05,10 291362,Small Residential...,M4E,0,Cancelled,,WILLOW,261,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6670
2010-11-17,2013-03-21,Permit for instal...,2010-11-18,10 298393,Small Residential...,M4E,0,Cancelled,E,GERRARD,2001,ST,SFD - Semi-Detached,Solar Domestic Ho...,6684
2011-12-15,2015-11-17,Roof top solar pa...,2012-04-03,11 326620,Small Residential...,M1C,0,Cancelled,,ROUGE HIGHLANDS,371,DR,SFD - Detached,Solar Domestic Ho...,6712
2012-06-19,2019-10-01,Install one solar...,2012-09-26,12 196161,Small Residential...,M4N,0,Cancelled,,BOWOOD,63,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6714


## 12. Calculate Application processing time for each application

In [6]:
from pyspark.sql.functions import date_format, datediff,col

In [7]:
df.select("APPLICATION_DATE", "ISSUED_DATE"). \
withColumn("APP_PRCS_TIME", datediff(col("ISSUED_DATE"), col("APPLICATION_DATE"))). \
orderBy(col("APP_PRCS_TIME").desc())

APPLICATION_DATE,ISSUED_DATE,APP_PRCS_TIME
2010-09-29,2015-03-19,1632
2009-12-01,2014-04-07,1588
2010-02-18,2013-07-31,1259
2009-11-26,2010-07-30,246
2009-06-19,2010-02-03,229
2009-12-01,2010-04-26,146
2009-12-01,2010-04-26,146
2009-12-01,2010-04-26,146
2011-12-15,2012-04-03,110
2009-08-04,2009-11-18,106


## 13. How many water heaters were issued in year 2015?

In [3]:
from pyspark.sql.functions import * 

df.select("ISSUED_DATE").filter("year(ISSUED_DATE) == 2015").count()
#df.select("ISSUED_DATE").where("year(ISSUED_DATE) = 2015").count()

3

## 14. How many water heaters were issued by year?

In [46]:
df.withColumn("Issued Date", year("ISSUED_DATE")). \
groupBy("Issued Date").agg(count("ISSUED_DATE").alias("Number of Issued Applications")).orderBy(desc("Issued Date")).filter(col("Issued Date").isNotNull())

Issued Date,Number of Issued Applications
2017,1
2016,2
2015,3
2014,3
2013,2
2012,4
2011,10
2010,59
2009,78


## 15. Joined Operations: 
1. Create clone dataframe of data
2. Filter the applications
3. Create column "application_processing_days" by APPLICATION_DATE, ISSUED_DATE and store it in df_procession_days dataframe
4. JOIN df_procession_days dataframe - Use inner join. And make sure the processing_days columns is available with the original dataframe

In [5]:
df_issued = df.filter(col("ISSUED_DATE").isNotNull())

In [6]:
df_procession_days = df_issued.withColumn("processing_days", datediff(col("ISSUED_DATE"), col("APPLICATION_DATE")))

In [7]:
df_joined = df.join(df_procession_days, df._id == df_procession_days._id, how = "inner"). \
    select(df["*"], df_procession_days["processing_days"])

In [8]:
df_joined

APPLICATION_DATE,COMPLETED_DATE,DESCRIPTION,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id,processing_days
2009-05-07,2010-10-05,installation of s...,2009-05-11,09 135007,Small Residential...,M4K,0,Closed,,GRANDVIEW,4,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6561,4
2009-05-07,,Installation of a...,2009-05-11,09 135023,Small Residential...,M4M,0,Inspection,,JONES,81,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6562,4
2009-05-11,2011-01-14,Proposal to insta...,2009-05-20,09 136204,Small Residential...,M4J,0,Closed,,BOULTBEE,114,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6563,9
2009-06-11,2010-03-03,Permit for instal...,2009-08-17,09 148320,Small Residential...,M4M,0,Closed,E,GERRARD,899,ST,Converted House,Solar Domestic Ho...,6564,67
2009-06-11,,Permit for instal...,2009-08-17,09 148341,Small Residential...,M4M,0,Permit Issued,,LOGAN,201,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6565,67
2009-06-11,2011-02-15,Permit for instal...,2009-08-17,09 148351,Small Residential...,M4K,0,Cancelled,,BROWNING,5,AVE,SFD - Detached,Solar Domestic Ho...,6566,67
2009-06-19,2010-07-14,Permit for instal...,2010-02-03,09 152120,Small Residential...,M4M,0,Closed,,LESLIE,271,ST,SFD - Detached,Solar Domestic Ho...,6567,229
2009-08-04,,Installation of 2...,2009-11-18,09 153159,Small Residential...,M4M,0,Inspection,,BRIGHTON,11,AVE,SFD - Detached,Solar Domestic Ho...,6568,106
2009-08-04,,Permit for instal...,2009-08-17,09 153172,Small Residential...,M9B,0,Permit Issued,,GREENFIELD,69,DR,SFD - Detached,Solar Domestic Ho...,6569,13
2009-08-04,2017-11-17,Permit for instal...,2009-08-17,09 153185,Small Residential...,M4L,0,Cancelled,,SANDFORD,11,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6570,13


## 16. Find out the First 3 Applicants of the Solar Water Heater plant for each year

In [35]:
from pyspark.sql.window import Window

spec = Window. \
    partitionBy(col("APPLICATION_YEAR")). \
    orderBy(desc("APPLICATION_YEAR"))

In [33]:
type(spec)

pyspark.sql.window.WindowSpec

In [25]:
df.withColumn("APPLICATION_YEAR", year("APPLICATION_DATE")). \
    withColumn("NUMBER", row_number().over(spec)). \
    filter("NUMBER <= 3"). \
    select(col("_id"),col("APPLICATION_YEAR"), col("NUMBER")). \
    orderBy(col("APPLICATION_YEAR").desc(), col("NUMBER")).show(30)

+----+----------------+------+
| _id|APPLICATION_YEAR|NUMBER|
+----+----------------+------+
|6724|            2017|     1|
|6721|            2016|     1|
|6722|            2016|     2|
|6723|            2016|     3|
|6719|            2015|     1|
|6720|            2015|     2|
|6717|            2014|     1|
|6718|            2014|     2|
|6716|            2013|     1|
|6713|            2012|     1|
|6714|            2012|     2|
|6715|            2012|     3|
|6702|            2011|     1|
|6703|            2011|     2|
|6704|            2011|     3|
|6645|            2010|     1|
|6646|            2010|     2|
|6647|            2010|     3|
|6561|            2009|     1|
|6562|            2009|     2|
|6563|            2009|     3|
+----+----------------+------+



## 17. Find Unique Application's Dates and view in ascending order

In [5]:
df.select("APPLICATION_DATE").distinct().orderBy("APPLICATION_DATE")

APPLICATION_DATE
2009-05-07
2009-05-11
2009-06-11
2009-06-19
2009-08-04
2009-08-08
2009-08-20
2009-08-31
2009-09-09
2009-10-02


## 18. Save data partitioned by year, with gzip compression and parquet format

In [31]:
df.withColumn("YEAR", year("APPLICATION_DATE")). \
    write.parquet("/public/mockExam", partitionBy = "YEAR", compression = "gzip")

In [32]:
%%sh

hdfs dfs -ls /public/mockExam

Found 10 items
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2009
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2010
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2011
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2012
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2013
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2014
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2015
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2016
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/YEAR=2017
-rw-r--r--   1 evivancovid supergroup          0 2022-05-26 18:23 /public/mockExam/_SUCCESS


## 19. Read the above partitioned data, which is in parquet format, using spark API's

In [36]:
parquet_df = spark.read.parquet("/public/mockExam/*")

In [39]:
parquet_df.printSchema()

root
 |-- APPLICATION_DATE: string (nullable = true)
 |-- COMPLETED_DATE: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- ISSUED_DATE: string (nullable = true)
 |-- PERMIT_NUM: string (nullable = true)
 |-- PERMIT_TYPE: string (nullable = true)
 |-- POSTAL: string (nullable = true)
 |-- REVISION_NUM: long (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- STREET_DIRECTION: string (nullable = true)
 |-- STREET_NAME: string (nullable = true)
 |-- STREET_NUM: string (nullable = true)
 |-- STREET_TYPE: string (nullable = true)
 |-- STRUCTURE_TYPE: string (nullable = true)
 |-- WORK: string (nullable = true)
 |-- _id: long (nullable = true)



## 20. Read the data related to 2009 from the partitioned data

In [40]:
parquet_df_2009 = spark.read.parquet("/public/mockExam/YEAR=2009")

In [41]:
parquet_df_2009

APPLICATION_DATE,COMPLETED_DATE,DESCRIPTION,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id
2009-05-07,2010-10-05,installation of s...,2009-05-11,09 135007,Small Residential...,M4K,0,Closed,,GRANDVIEW,4,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6561
2009-05-07,,Installation of a...,2009-05-11,09 135023,Small Residential...,M4M,0,Inspection,,JONES,81,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6562
2009-05-11,2011-01-14,Proposal to insta...,2009-05-20,09 136204,Small Residential...,M4J,0,Closed,,BOULTBEE,114,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6563
2009-06-11,2010-03-03,Permit for instal...,2009-08-17,09 148320,Small Residential...,M4M,0,Closed,E,GERRARD,899,ST,Converted House,Solar Domestic Ho...,6564
2009-06-11,,Permit for instal...,2009-08-17,09 148341,Small Residential...,M4M,0,Permit Issued,,LOGAN,201,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6565
2009-06-11,2011-02-15,Permit for instal...,2009-08-17,09 148351,Small Residential...,M4K,0,Cancelled,,BROWNING,5,AVE,SFD - Detached,Solar Domestic Ho...,6566
2009-06-19,2010-07-14,Permit for instal...,2010-02-03,09 152120,Small Residential...,M4M,0,Closed,,LESLIE,271,ST,SFD - Detached,Solar Domestic Ho...,6567
2009-08-04,,Installation of 2...,2009-11-18,09 153159,Small Residential...,M4M,0,Inspection,,BRIGHTON,11,AVE,SFD - Detached,Solar Domestic Ho...,6568
2009-08-04,,Permit for instal...,2009-08-17,09 153172,Small Residential...,M9B,0,Permit Issued,,GREENFIELD,69,DR,SFD - Detached,Solar Domestic Ho...,6569
2009-08-04,2017-11-17,Permit for instal...,2009-08-17,09 153185,Small Residential...,M4L,0,Cancelled,,SANDFORD,11,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6570


## 21. Save data partitioned by year, with lzo compression and orc format.

In [43]:
df.withColumn("YEAR", year("APPLICATION_DATE")). \
    write.orc("/public/mockExam", mode = "overwrite", partitionBy = "YEAR", compression = "lzo")

In [44]:
%%sh

hdfs dfs -ls /public/mockExam

Found 10 items
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2009
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2010
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2011
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2012
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2013
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2014
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2015
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2016
drwxr-xr-x   - evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/YEAR=2017
-rw-r--r--   1 evivancovid supergroup          0 2022-05-26 19:07 /public/mockExam/_SUCCESS


## 22. Save data partitioned by year as a Spark Metastore table. Also create bucket by 5 using STATUS column

In [13]:
df.withColumn("year", year("APPLICATION_DATE")). \
    write.bucketBy(5, "STATUS"). \
    saveAsTable("bucketed_table")

In [14]:
df_bucket = spark.read.table("bucketed_table")

In [18]:
df_bucket

APPLICATION_DATE,COMPLETED_DATE,DESCRIPTION,ISSUED_DATE,PERMIT_NUM,PERMIT_TYPE,POSTAL,REVISION_NUM,STATUS,STREET_DIRECTION,STREET_NAME,STREET_NUM,STREET_TYPE,STRUCTURE_TYPE,WORK,_id,year
2009-05-07,2010-10-05,installation of s...,2009-05-11,09 135007,Small Residential...,M4K,0,Closed,,GRANDVIEW,4,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6561,2009
2009-05-11,2011-01-14,Proposal to insta...,2009-05-20,09 136204,Small Residential...,M4J,0,Closed,,BOULTBEE,114,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6563,2009
2009-06-11,2010-03-03,Permit for instal...,2009-08-17,09 148320,Small Residential...,M4M,0,Closed,E,GERRARD,899,ST,Converted House,Solar Domestic Ho...,6564,2009
2009-06-19,2010-07-14,Permit for instal...,2010-02-03,09 152120,Small Residential...,M4M,0,Closed,,LESLIE,271,ST,SFD - Detached,Solar Domestic Ho...,6567,2009
2009-08-04,2010-10-05,Permit for instal...,2009-08-17,09 153191,Small Residential...,M4K,0,Closed,,BROADVIEW,511,AVE,SFD - Detached,Solar Domestic Ho...,6571,2009
2009-08-20,2010-05-21,Permit for instal...,2009-09-21,09 160244,Small Residential...,M4E,0,Closed,,KIMBERLEY,24,AVE,SFD - Detached,Solar Domestic Ho...,6573,2009
2009-08-31,2010-08-16,Permit for instal...,2009-12-03,09 163896,Small Residential...,M4L,0,Closed,,HASTINGS,15,AVE,SFD - Semi-Detached,Solar Domestic Ho...,6575,2009
2009-09-09,2012-07-23,Permit for instal...,2009-09-22,09 166795,Small Residential...,M4L,0,Closed,,GLENMORE,132,RD,SFD - Detached,Solar Domestic Ho...,6578,2009
2009-09-09,2014-10-28,Permit for instal...,2009-09-22,09 166933,Small Residential...,M2M,0,Closed,,GOULDING,143,AVE,SFD - Detached,Solar Domestic Ho...,6580,2009
2009-10-02,2010-01-29,Permit for instal...,2009-10-27,09 175330,Small Residential...,M4L,0,Closed,,HIAWATHA,94,RD,SFD - Detached,Solar Domestic Ho...,6582,2009
