In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StringType,StructField,StructType,IntegerType,FloatType

In [3]:
# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [4]:
# The CSV file has no header row, so the column names and types are declared
# explicitly here and handed to the reader. Every column is nullable.
schema = StructType(
    [
        StructField("stationID", StringType(), True),
        StructField("date", IntegerType(), True),
        StructField("measure_Type", StringType(), True),
        StructField("temperature", FloatType(), True),
    ]
)

In [5]:
# Load the header-less CSV, applying the hand-written schema instead of
# letting Spark name and type the columns itself.
df = spark.read.csv("1800.csv", schema=schema)

In [6]:
# Sanity check: print the column names/types to confirm the explicit schema
# was applied to the loaded DataFrame.
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_Type: string (nullable = true)
 |-- temperature: float (nullable = true)



In [7]:
# Keep only the rows whose measurement type is "TMIN".
minTemps = df.where(df["measure_Type"] == "TMIN")

In [8]:
# Narrow to the two columns the aggregation needs, then take the lowest
# reading per station. Spark names the aggregate column "min(temperature)".
stationTemp = minTemps.select(["stationID", "temperature"])
minTempByStation = stationTemp.groupBy("stationID").min("temperature")

In [9]:
# Display the per-station minimum as stored in the file (before the
# Fahrenheit conversion done in a later cell).
minTempByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [10]:
# Convert each station's minimum reading to Fahrenheit, rounded to 2 decimals,
# and sort ascending by the converted value.
# The raw value is scaled by 0.1 before the Celsius -> Fahrenheit formula
# (presumably the file stores tenths of a degree Celsius -- confirm against
# the data source).
# Column names are spelled exactly as they exist ("min(temperature)" from the
# aggregation, "Temperature" for the result) so this cell does not depend on
# Spark's default case-insensitive column resolution.
# NOTE: this rebinds minTempByStation and drops "min(temperature)", so
# re-running this cell on its own fails; re-run the aggregation cell first.
minTempByStation = (
    minTempByStation
    .withColumn(
        "Temperature",
        func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2),
    )
    .select("stationID", "Temperature")
    .sort("Temperature")
)
                                               
                                               


In [11]:
# Materialize the converted result on the driver as a list of Row objects.
minTempByStation.collect()

[Row(stationID='ITE00100554', Temperature=5.36),
 Row(stationID='EZE00100082', Temperature=7.7)]

In [4]:
# Shut down the Spark session; cells that use `spark` will not work after
# this until the session is created again.
spark.stop()