# Import requirements and load the dataset

In [3]:
from pyspark.sql import SparkSession
# Create the SparkSession used by every cell below, or reuse the one that
# is already running in this kernel.
spark = (
    SparkSession.builder
    .appName("sampling with DF API")
    .getOrCreate()
)

In [5]:
# Location of the CSV file with the temperature readings.
file_path = "/content/location_temp.csv"

# Read the CSV, taking column names from the first line. Without
# inferSchema every column is loaded as a string, so aggregations on
# temp_celcius would rely on implicit string-to-number casts; inferring
# the schema gives numeric columns a numeric type up front (at the cost
# of one extra pass over the file).
df = (
    spark.read.format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
    .load(file_path)
)

In [6]:
# Preview the first 10 rows to check the column names and value formats.
df.show(n=10)


+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
+-------------------+-----------+------------+
only showing top 10 rows



In [7]:
# Number of rows in the full dataset; used as the reference size when
# judging how large a sample turned out to be.
df.count()

500000

# Working with samples



In [8]:
# Draw an approximately 10% sample of the rows, without replacement.
# A fixed seed keeps the sample -- and every aggregate computed from it --
# the same across re-runs, provided the input data and its partitioning
# are unchanged.
df1_sample = df.sample(withReplacement=False, fraction=0.1, seed=42)

In [11]:
# sample() is not guaranteed to return exactly the requested fraction of
# rows, so this count is only close to 0.1 * df.count(), not equal to it.
df1_sample.count()

50301

In [12]:
# agg() accepts a dict that maps a column name to the name of the
# aggregate function to apply: .agg({"columnName": "aggregation"}).
# Here: mean temperature per location, computed on the sample.
sample_mean_by_location = df1_sample.groupBy("location_id").agg(
    {"temp_celcius": "mean"}
)
sample_mean_by_location.show()

+-----------+------------------+
|location_id| avg(temp_celcius)|
+-----------+------------------+
|     loc196|28.954545454545453|
|     loc226|25.198019801980198|
|     loc150| 32.03703703703704|
|     loc292|29.059405940594058|
|     loc311|24.673684210526314|
|      loc22|27.872549019607842|
|      loc31| 24.98936170212766|
|     loc305|27.354545454545455|
|      loc82|27.083333333333332|
|      loc90|23.194174757281555|
|     loc118|             24.25|
|     loc195|27.387096774193548|
|     loc208|              26.1|
|      loc39| 25.19327731092437|
|      loc75| 23.25609756097561|
|     loc228|27.227272727272727|
|     loc203| 26.15740740740741|
|     loc193|30.275510204081634|
|     loc122| 32.25961538461539|
|     loc145| 32.30578512396694|
+-----------+------------------+
only showing top 20 rows



In [13]:

# Same per-location mean on the sample, sorted by location_id so the rows
# line up with the full-dataset result. location_id is a string, so the
# order is lexicographic (loc0, loc1, loc10, loc100, ...).
sample_mean_sorted = (
    df1_sample
    .groupBy("location_id")
    .agg({"temp_celcius": "mean"})
    .orderBy("location_id")
)
sample_mean_sorted.show(10)

+-----------+------------------+
|location_id| avg(temp_celcius)|
+-----------+------------------+
|       loc0|29.065934065934066|
|       loc1|27.715686274509803|
|      loc10|25.389473684210525|
|     loc100|27.269565217391303|
|     loc101|25.698924731182796|
|     loc102|30.121495327102803|
|     loc103|24.962962962962962|
|     loc104|26.123809523809523|
|     loc105|          26.09375|
|     loc106|27.189655172413794|
+-----------+------------------+
only showing top 10 rows



In [16]:
# Baseline: the same per-location mean computed on the full dataset.
# Aggregates computed on a sample scatter around these values:
#   smaller samples -> more variance in the aggregate
#   bigger samples  -> less variance in the aggregate
full_mean_by_location = df.groupBy("location_id").agg({"temp_celcius": "mean"})
full_mean_by_location.orderBy("location_id").show(10)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
|     loc100|           27.297|
|     loc101|           25.317|
|     loc102|           30.327|
|     loc103|           25.341|
|     loc104|           26.204|
|     loc105|           26.217|
|     loc106|           27.201|
+-----------+-----------------+
only showing top 10 rows

