In [0]:
insurance_df = spark.read.format("csv") \
                         .option("inferSchema", "True") \
                         .option("header", "True") \
                         .option("sep", ",") \
                         .load("dbfs:/FileStore/PLSampleDataSets/insurance.csv")

In [0]:
# Expand Spark Jobs
# Expand the individual jobs and show that in each case we have just one stage (because the file size is under 128MB)

In [0]:
insurance_df.groupBy('children')\
            .agg({'charges': 'avg', 'children': 'count'})\
            .withColumnRenamed('avg(charges)', 'average_charges')\
            .orderBy('average_charges')\
            .display()

children,average_charges,count(children)
5,8786.035247222222,18
0,12365.975601635882,574
1,12731.171831635793,324
4,13850.6563112,25
2,15073.563733958328,240
3,15355.31836681528,157


In [0]:
# The family with 5 children seems to have unusually low insurance cost. 
# Let's check why is it the case by zooming in to the data of only families with no. of children = 5

# Seems like most insured people with children = 5 don't smoke

# Please scroll and show that the smoking column is mostly all "no"

In [0]:
children_5 = insurance_df.filter(insurance_df.children == 5)

children_5.display()

age,sex,bmi,children,smoker,region,charges,insuranceclaim
19,female,28.6,5,no,southwest,4687.797,No
31,male,28.5,5,no,northeast,6799.458,No
20,female,37.0,5,no,southwest,4830.63,No
25,male,23.9,5,no,southwest,5080.096,No
45,male,24.31,5,no,southeast,9788.8659,No
52,female,46.75,5,no,southeast,12592.5345,Yes
49,female,31.9,5,no,southwest,11552.904,No
33,male,42.4,5,no,southwest,6666.243,No
33,male,33.44,5,no,southeast,6653.7886,No
46,male,25.8,5,no,southwest,10096.97,Yes


In [0]:
# Let's write out the result to a csv file. go to data> create table> dbfs>filestore>output and you can see that the csv file has been successfully created

In [0]:
dbutils.fs.rm("dbfs:/FileStore/output/children5.csv", True)

children_5.write.csv("dbfs:/FileStore/output/children5.csv")

In [0]:
# Let's read the data and verify the written data.
chidren_5_subset_df = spark.read.format('csv')\
                                .option("inferSchema", 'true') \
                                .option("header", 'true')\
                                .load("dbfs:/FileStore/output/children5.csv")

chidren_5_subset_df.display()

19,female,28.6,5,no4,southwest,4687.797,No7
31,male,28.5,5,no,northeast,6799.458,No
20,female,37.0,5,no,southwest,4830.63,No
25,male,23.9,5,no,southwest,5080.096,No
45,male,24.31,5,no,southeast,9788.8659,No
52,female,46.75,5,no,southeast,12592.5345,Yes
49,female,31.9,5,no,southwest,11552.904,No
33,male,42.4,5,no,southwest,6666.243,No
33,male,33.44,5,no,southeast,6653.7886,No
46,male,25.8,5,no,southwest,10096.97,Yes
39,female,24.225,5,no,northwest,8965.79575,No


In [0]:
# Let's write out the result to a json file. go to data> create table> dbfs>filestore>output and you can see that the json file has been successfully created

In [0]:
dbutils.fs.rm("dbfs:/FileStore/output/children5.json", True)

children_5.write.json("dbfs:/FileStore/output/children5.json")