In [0]:

from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
schema = T.StructType([
    T.StructField("AT", T.DoubleType(), True),          
    T.StructField("V", T.DoubleType(), True),           
    T.StructField("AP", T.DoubleType(), True),          
    T.StructField("RH", T.DoubleType(), True),           
    T.StructField("PE", T.DoubleType(), True),        
    
])


data = spark.read.option("header", True).schema(schema).csv("/databricks-datasets/power-plant/*", sep="\t")

data = data.withColumn("RH", F.when(F.col("RH").between(70.0,73.0), "Marchewa").otherwise(F.col("RH")))

data.write.csv("dbfs:/tmp/power-plant.csv", sep="\t", mode="overwrite")

display(data.limit(20))
  


AT,V,AP,RH,PE
14.96,41.76,1024.07,73.17,463.26
25.18,62.96,1020.04,59.08,444.37
5.11,39.4,1012.16,92.14,488.56
20.86,57.32,1010.24,76.64,446.48
10.82,37.5,1009.23,96.62,473.9
26.27,59.44,1012.23,58.77,443.67
15.89,43.96,1014.02,75.24,467.35
9.48,44.71,1019.12,66.43,478.42
14.64,45.0,1021.78,41.25,475.98
11.74,43.56,1015.14,Marchewa,477.5


In [0]:
data_permissive_mode = spark.read.option("header", True).option("mode", "permissive").schema(schema).csv("/tmp/power-plant.csv", sep="\t")

display(data_permissive_mode.limit(20))

AT,V,AP,RH,PE
25.18,62.96,1020.04,59.08,444.37
5.11,39.4,1012.16,92.14,488.56
20.86,57.32,1010.24,76.64,446.48
10.82,37.5,1009.23,96.62,473.9
26.27,59.44,1012.23,58.77,443.67
15.89,43.96,1014.02,75.24,467.35
9.48,44.71,1019.12,66.43,478.42
14.64,45.0,1021.78,41.25,475.98
11.74,43.56,1015.14,,477.5
17.99,43.72,1008.64,75.04,453.02


In [0]:
data_dropmalformed_mode = spark.read.option("header", True).option("mode", "dropMalformed").schema(schema).csv("/tmp/power-plant.csv", sep="\t")

display(data_dropmalformed_mode.limit(20))

AT,V,AP,RH,PE
25.18,62.96,1020.04,59.08,444.37
5.11,39.4,1012.16,92.14,488.56
20.86,57.32,1010.24,76.64,446.48
10.82,37.5,1009.23,96.62,473.9
26.27,59.44,1012.23,58.77,443.67
15.89,43.96,1014.02,75.24,467.35
9.48,44.71,1019.12,66.43,478.42
14.64,45.0,1021.78,41.25,475.98
17.99,43.72,1008.64,75.04,453.02
20.14,46.93,1014.66,64.22,453.99


In [0]:
data_failfast_mode = spark.read.option("header", True).option("mode", "failFast").schema(schema).csv("/tmp/power-plant.csv", sep="\t")

display(data_failfast_mode.limit(20))

In [0]:
data.write.parquet("dbfs:/tmp/power-plant-broken.parquet", mode="overwrite")
data.write.parquet("dbfs:/tmp/power-plant-broken.json", mode="overwrite")

In [0]:
display(spark.read.parquet("/tmp/power-plant-broken.parquet").limit(25))

AT,V,AP,RH,PE
8.34,40.77,1010.84,90.01,480.48
23.64,58.49,1011.4,74.2,445.75
29.74,56.9,1007.15,41.91,438.76
19.07,49.69,1007.22,76.79,453.09
11.8,40.66,1017.13,97.2,464.43
13.97,39.16,1016.05,84.6,470.96
22.1,71.29,1008.2,75.38,442.35
14.47,41.76,1021.98,78.41,464.0
31.25,69.51,1010.25,36.83,428.77
6.77,38.18,1017.8,81.13,484.31


In [0]:
display(spark.read.parquet("/tmp/power-plant-broken.json").limit(40))

AT,V,AP,RH,PE
8.34,40.77,1010.84,90.01,480.48
23.64,58.49,1011.4,74.2,445.75
29.74,56.9,1007.15,41.91,438.76
19.07,49.69,1007.22,76.79,453.09
11.8,40.66,1017.13,97.2,464.43
13.97,39.16,1016.05,84.6,470.96
22.1,71.29,1008.2,75.38,442.35
14.47,41.76,1021.98,78.41,464.0
31.25,69.51,1010.25,36.83,428.77
6.77,38.18,1017.8,81.13,484.31


In [0]:
%fs ls dbfs:/databricks-datasets/


path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0
