In [128]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[2]').getOrCreate()

In [129]:
# Create a DataFrame from csv data (automatically infer schema and data types)
# There are other file formats you can read from (e.g., csv, orc, parquet)
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources

stationsInfoFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/stationsInfo.csv"

# Read stations info data
stationsInfo = spark.read.load(stationsInfoFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")


In [130]:
stationsInfo.printSchema()
stationsInfo.show()
stationsInfo.toPandas()

root
 |-- Nº da Estação: integer (nullable = true)
 |-- Estações: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Lon: double (nullable = true)

+-------------+--------------------+-----------+-----------+
|Nº da Estação|            Estações|        Lat|        Lon|
+-------------+--------------------+-----------+-----------+
|      1200545|Porto - Pedras Ru...|41.23350278|-8.68133333|
|      1200548| Coimbra (Aeródromo)|    40.1576|-8.46851667|
|      1200551|Viana do Castelo ...|  41.648875|-8.80460556|
|      1200554|    Faro / Aeroporto|37.01657778|-7.97195278|
|      1200558|   Évora / Aeródromo|38.53654167|-7.88795833|
|      1200560|   Viseu / Aeródromo|40.71492778|-7.89591667|
|      1200562|                Beja|38.02572778|-7.86731944|
|      1200567|Vila Real / Aeród...|41.27420833|-7.71711389|
|      1200570|      Castelo Branco|    39.8395|-7.47866944|
|      1200571|          Portalegre|39.29418333|-7.42131667|
|      1200575|            Bragança|41.803883

Unnamed: 0,Nº da Estação,Estações,Lat,Lon
0,1200545,Porto - Pedras Rubras,41.233503,-8.681333
1,1200548,Coimbra (Aeródromo),40.1576,-8.468517
2,1200551,Viana do Castelo - Chafé,41.648875,-8.804606
3,1200554,Faro / Aeroporto,37.016578,-7.971953
4,1200558,Évora / Aeródromo,38.536542,-7.887958
5,1200560,Viseu / Aeródromo,40.714928,-7.895917
6,1200562,Beja,38.025728,-7.867319
7,1200567,Vila Real / Aeródromo,41.274208,-7.717114
8,1200570,Castelo Branco,39.8395,-7.478669
9,1200571,Portalegre,39.294183,-7.421317


In [131]:
# Show statistics
stationsInfo_stats = stationsInfo.summary()
stationsInfo_stats.toPandas()

Unnamed: 0,summary,Nº da Estação,Estações,Lat,Lon
0,count,18.0,18,18.0,18.0
1,mean,1203942.7222222222,,39.92121496888889,-8.161268518333335
2,stddev,4920.241825782978,,1.3824123596259537,0.6691612055364312
3,min,1200545.0,Aveiro / Universidade,37.01657778,-9.12749444
4,25%,1200558.0,,38.76620278,-8.73666111
5,50%,1200570.0,,39.8395,-8.45110833
6,75%,1210683.0,,41.23350278,-7.71711389
7,max,1210770.0,Évora / Aeródromo,41.80388333,-6.74283056


In [132]:
# Create a DataFrame from csv data (automatically infer schema and data types)
avgTemperatureFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/avgTemperature.csv"

# Read average temperature data
avgTemperature = spark.read.load(avgTemperatureFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")

In [133]:
avgTemperature.printSchema()
avgTemperature.show()
avgTemperature.toPandas()

root
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- 1200545: double (nullable = true)
 |-- 1200548: double (nullable = true)
 |-- 1200551: double (nullable = true)
 |-- 1200554: double (nullable = true)
 |-- 1200558: double (nullable = true)
 |-- 1200560: double (nullable = true)
 |-- 1200562: double (nullable = true)
 |-- 1200567: double (nullable = true)
 |-- 1200570: double (nullable = true)
 |-- 1200571: double (nullable = true)
 |-- 1200575: double (nullable = true)
 |-- 1200579: double (nullable = true)
 |-- 1210622: double (nullable = true)
 |-- 1210683: double (nullable = true)
 |-- 1210702: double (nullable = true)
 |-- 1210718: double (nullable = true)
 |-- 1210734: double (nullable = true)
 |-- 1210770: double (nullable = true)

+----+---+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| ANO|MÊS|DIA|1200545|120

Unnamed: 0,ANO,MÊS,DIA,1200545,1200548,1200551,1200554,1200558,1200560,1200562,...,1200570,1200571,1200575,1200579,1210622,1210683,1210702,1210718,1210734,1210770
0,2000,1,1,5.1,6.3,-990.0,10.8,7.0,6.9,-990.0,...,5.7,10.2,1.2,-990.0,-990.0,6.1,-990.0,-990.0,6.7,7.3
1,2000,1,2,4.8,5.2,-990.0,10.5,7.1,6.0,-990.0,...,6.0,8.8,0.3,8.2,-990.0,5.2,-990.0,-990.0,6.3,7.9
2,2000,1,3,5.8,5.6,-990.0,10.4,6.2,5.6,-990.0,...,5.1,8.4,-1.7,7.3,-990.0,4.0,-990.0,-990.0,-990.0,5.5
3,2000,1,4,6.3,6.3,-990.0,11.6,6.1,6.1,7.9,...,4.7,7.8,-1.8,7.8,-990.0,3.0,-990.0,-990.0,7.0,5.0
4,2000,1,5,7.7,7.1,-990.0,11.4,4.9,4.3,7.8,...,2.1,6.8,0.0,7.7,-990.0,0.8,8.3,-990.0,6.2,6.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7666,2020,12,27,-990.0,7.3,7.1,9.6,6.1,4.1,7.1,...,5.9,6.0,1.7,9.0,4.4,4.1,8.0,6.4,6.9,6.9
7667,2020,12,28,-990.0,9.1,10.5,13.2,9.0,5.0,9.8,...,8.2,6.3,3.9,11.5,7.7,2.6,11.4,11.0,10.5,11.6
7668,2020,12,29,-990.0,7.1,7.7,9.8,6.3,3.4,7.4,...,6.3,4.8,3.0,9.8,5.9,1.3,9.6,8.5,8.4,9.6
7669,2020,12,30,-990.0,6.8,5.8,9.7,6.3,3.3,6.2,...,5.3,5.0,3.5,9.1,4.5,0.5,6.9,6.1,7.8,6.4


In [134]:
# Show statistics
avgTemperature_stats = avgTemperature.summary()
avgTemperature_stats.toPandas()

Unnamed: 0,summary,ANO,MÊS,DIA,1200545,1200548,1200551,1200554,1200558,1200560,...,1200570,1200571,1200575,1200579,1210622,1210683,1210702,1210718,1210734,1210770
0,count,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,...,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0
1,mean,2010.0,6.522487289792726,15.730934689088777,-64.6218224481816,-1.4519228262286552,-326.8201798983199,7.583183418068077,8.967996349889198,5.658988397862114,...,7.714809020988155,-25.740698735497254,1.0019032720636167,3.17156824403597,-103.06729239994772,-89.30032590275107,-20.506752704992863,-419.7172467735642,-12.165063225133594,-65.00703949941325
2,stddev,6.056341295094361,3.449049354929467,8.801213685996998,271.31182151970336,128.36090831503176,475.73967444437113,103.17602583194734,85.12974844027828,87.12351622402113,...,91.05259512627944,199.997798512608,108.20796683997813,116.54238552166912,323.4201649933665,301.81911611736945,186.99807805321012,498.067695820931,166.92893956804554,274.83437780877546
3,min,2000.0,1.0,1.0,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0,...,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0,-990.0
4,25%,2005.0,4.0,8.0,10.8,11.2,-990.0,14.1,10.9,8.2,...,10.2,9.5,7.0,12.8,8.9,4.9,12.1,-990.0,11.8,11.4
5,50%,2010.0,7.0,16.0,14.2,15.0,11.1,17.6,15.3,12.4,...,14.8,14.1,11.8,16.4,13.7,9.9,15.5,9.3,15.8,15.7
6,75%,2015.0,10.0,23.0,17.8,18.9,16.2,22.2,21.3,17.6,...,21.5,20.6,18.5,20.5,18.5,16.4,18.7,16.5,20.4,20.6
7,max,2020.0,12.0,31.0,30.6,31.6,28.0,35.8,36.3,31.9,...,34.8,35.3,29.8,35.1,30.5,31.1,29.2,28.7,33.8,34.7


In [135]:
# import dependencies
from pyspark.sql.functions import lit
from pyspark.sql import functions as sf
from pyspark.sql.functions import col
import pandas as pd
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [155]:
#transformar schema do csv original average temperature. 
data = stationsInfo.select(col("Nº da Estação"))
pandasDF = data.toPandas()
names = avgTemperature.schema.names
df_array = []
for index, row in pandasDF.iterrows():
    for name in names:
        if str(row["Nº da Estação"]) == str(name):
            df_array.append(avgTemperature.select(col("ANO"), col("MÊS"), col("DIA"), col(name).alias("avgTemperature")) \
            .withColumn("Nº da Estação", lit(name)) \
            .withColumn("Date", sf.concat(sf.format_string("%04d",col("ANO")), sf.lit("-"), sf.format_string("%02d",col("MÊS")), sf.lit("-"), sf.format_string("%02d",col("DIA")))) \
            .toPandas())
            

                                   
results = pd.concat(df_array, ignore_index=True)
print(results)

#Create PySpark DataFrame from Pandas
avgTemperature_sparkDF=spark.createDataFrame(results) 
avgTemperature_sparkDF.printSchema()


avgTemperatureSilver = avgTemperature_sparkDF.join(stationsInfo, "Nº da Estação")
avgTemperatureSilver.printSchema()


         ANO  MÊS  DIA  avgTemperature Nº da Estação        Date
0       2000    1    1             5.1       1200545  2000-01-01
1       2000    1    2             4.8       1200545  2000-01-02
2       2000    1    3             5.8       1200545  2000-01-03
3       2000    1    4             6.3       1200545  2000-01-04
4       2000    1    5             7.7       1200545  2000-01-05
...      ...  ...  ...             ...           ...         ...
138073  2020   12   27             6.9       1210770  2020-12-27
138074  2020   12   28            11.6       1210770  2020-12-28
138075  2020   12   29             9.6       1210770  2020-12-29
138076  2020   12   30             6.4       1210770  2020-12-30
138077  2020   12   31             7.7       1210770  2020-12-31

[138078 rows x 6 columns]
root
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- avgTemperature: double (nullable = true)
 |-- Nº da Estação: string (nullab

In [156]:
# Create a DataFrame from csv data (automatically infer schema and data types)
maxTemperatureFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/maxTemperature.csv"

# Read max temperatures data
maxTemperature = spark.read.load(maxTemperatureFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")

In [157]:
maxTemperature.printSchema()
maxTemperature.show()

root
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- 1200545: double (nullable = true)
 |-- 1200548: double (nullable = true)
 |-- 1200551: double (nullable = true)
 |-- 1200554: double (nullable = true)
 |-- 1200558: double (nullable = true)
 |-- 1200560: double (nullable = true)
 |-- 1200562: double (nullable = true)
 |-- 1200567: double (nullable = true)
 |-- 1200570: double (nullable = true)
 |-- 1200571: double (nullable = true)
 |-- 1200575: double (nullable = true)
 |-- 1200579: double (nullable = true)
 |-- 1210622: double (nullable = true)
 |-- 1210683: double (nullable = true)
 |-- 1210702: double (nullable = true)
 |-- 1210718: double (nullable = true)
 |-- 1210734: double (nullable = true)
 |-- 1210770: double (nullable = true)

+----+---+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| ANO|MÊS|DIA|1200545|120

In [158]:
#transformar schema do csv original max temperature. 
data = stationsInfo.select(col("Nº da Estação"))

pandasDF = data.toPandas()
names = maxTemperature.schema.names
df_array = []
for index, row in pandasDF.iterrows():
    for name in names:
        if str(row["Nº da Estação"]) == str(name):
            df_array.append(maxTemperature.select(col("ANO"), col("MÊS"), col("DIA"), col(name).alias("maxTemperature")) \
            .withColumn("Nº da Estação", lit(name)) \
            .withColumn("Date", sf.concat(sf.format_string("%04d",col("ANO")), sf.lit("-"), sf.format_string("%02d",col("MÊS")), sf.lit("-"), sf.format_string("%02d",col("DIA")))) \
            .toPandas())
            
            
results = pd.concat(df_array, ignore_index=True)
print(results)

#Create PySpark DataFrame from Pandas
maxTemperature_sparkDF=spark.createDataFrame(results) 

maxTemperatureSilver = maxTemperature_sparkDF.join(stationsInfo, "Nº da Estação")
maxTemperatureSilver.printSchema()

         ANO  MÊS  DIA  maxTemperature Nº da Estação        Date
0       2000    1    1            10.8       1200545  2000-01-01
1       2000    1    2            11.5       1200545  2000-01-02
2       2000    1    3            11.9       1200545  2000-01-03
3       2000    1    4            12.1       1200545  2000-01-04
4       2000    1    5            13.4       1200545  2000-01-05
...      ...  ...  ...             ...           ...         ...
138073  2020   12   27            14.1       1210770  2020-12-27
138074  2020   12   28            14.4       1210770  2020-12-28
138075  2020   12   29            13.7       1210770  2020-12-29
138076  2020   12   30            13.4       1210770  2020-12-30
138077  2020   12   31            15.6       1210770  2020-12-31

[138078 rows x 6 columns]
root
 |-- Nº da Estação: string (nullable = true)
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- maxTemperature: double (nullab

In [159]:
# Create a DataFrame from csv data (automatically infer schema and data types)
minTemperatureFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/minTemperature.csv"

# Read stations info data
minTemperature = spark.read.load(minTemperatureFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")

In [160]:
#transformar schema do csv original minimum temperature. 
data = stationsInfo.select(col("Nº da Estação"))

pandasDF = data.toPandas()
names = minTemperature.schema.names
df_array = []
for index, row in pandasDF.iterrows():
    for name in names:
        if str(row["Nº da Estação"]) == str(name):
            df_array.append(minTemperature.select(col("ANO"), col("MÊS"), col("DIA"), col(name).alias("minTemperature")) \
            .withColumn("Nº da Estação", lit(name)) \
            .withColumn("Date", sf.concat(sf.format_string("%04d",col("ANO")), sf.lit("-"), sf.format_string("%02d",col("MÊS")), sf.lit("-"), sf.format_string("%02d",col("DIA")))) \
            .toPandas())
            
            
results = pd.concat(df_array, ignore_index=True)
print(results)

#Create PySpark DataFrame from Pandas
minTemperature_sparkDF=spark.createDataFrame(results) 


minTemperatureSilver = minTemperature_sparkDF.join(stationsInfo, "Nº da Estação")
minTemperatureSilver.printSchema()

         ANO  MÊS  DIA  minTemperature Nº da Estação        Date
0       2000    1    1             1.2       1200545  2000-01-01
1       2000    1    2             1.0       1200545  2000-01-02
2       2000    1    3             2.5       1200545  2000-01-03
3       2000    1    4             2.5       1200545  2000-01-04
4       2000    1    5             3.3       1200545  2000-01-05
...      ...  ...  ...             ...           ...         ...
138073  2020   12   27            -1.8       1210770  2020-12-27
138074  2020   12   28             5.3       1210770  2020-12-28
138075  2020   12   29             3.3       1210770  2020-12-29
138076  2020   12   30             1.1       1210770  2020-12-30
138077  2020   12   31             0.6       1210770  2020-12-31

[138078 rows x 6 columns]
root
 |-- Nº da Estação: string (nullable = true)
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- minTemperature: double (nullab

In [161]:
# Create a DataFrame from csv data (automatically infer schema and data types)
radiationTotalFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/radiationTotal.csv"

# Read global radiation data
radiationTotal = spark.read.load(radiationTotalFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")

In [162]:
radiationTotal.printSchema()

root
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- 1200545: double (nullable = true)
 |-- 1200548: double (nullable = true)
 |-- 1200551: double (nullable = true)
 |-- 1200554: double (nullable = true)
 |-- 1200558: double (nullable = true)
 |-- 1200560: double (nullable = true)
 |-- 1200562: double (nullable = true)
 |-- 1200567: double (nullable = true)
 |-- 1200570: double (nullable = true)
 |-- 1200571: double (nullable = true)
 |-- 1200575: double (nullable = true)
 |-- 1200579: double (nullable = true)
 |-- 1210622: double (nullable = true)
 |-- 1210683: double (nullable = true)
 |-- 1210702: double (nullable = true)
 |-- 1210718: double (nullable = true)
 |-- 1210734: double (nullable = true)
 |-- 1210770: double (nullable = true)



In [163]:
#transformar schema do csv original average temperature. 
data = stationsInfo.select(col("Nº da Estação"))

pandasDF = data.toPandas()
names = radiationTotal.schema.names
df_array = []
for index, row in pandasDF.iterrows():
    for name in names:
        if str(row["Nº da Estação"]) == str(name):
            df_array.append(radiationTotal.select(col("ANO"), col("MÊS"), col("DIA"), col(name).alias("radiationTotal")) \
            .withColumn("Nº da Estação", lit(name)) \
            .withColumn("Date", sf.concat(sf.format_string("%04d",col("ANO")), sf.lit("-"), sf.format_string("%02d",col("MÊS")), sf.lit("-"), sf.format_string("%02d",col("DIA")))) \
            .toPandas())
            
results = pd.concat(df_array, ignore_index=True)
print(results)

#Create PySpark DataFrame from Pandas
radiationTotal_sparkDF=spark.createDataFrame(results) 

radiationTotalSilver = radiationTotal_sparkDF.join(stationsInfo, "Nº da Estação")
radiationTotalSilver.printSchema()

         ANO  MÊS  DIA  radiationTotal Nº da Estação        Date
0       2000    1    1          -990.0       1200545  2000-01-01
1       2000    1    2          -990.0       1200545  2000-01-02
2       2000    1    3          -990.0       1200545  2000-01-03
3       2000    1    4          -990.0       1200545  2000-01-04
4       2000    1    5          -990.0       1200545  2000-01-05
...      ...  ...  ...             ...           ...         ...
138073  2020   12   27         10080.4       1210770  2020-12-27
138074  2020   12   28          7576.7       1210770  2020-12-28
138075  2020   12   29          8961.5       1210770  2020-12-29
138076  2020   12   30          8947.5       1210770  2020-12-30
138077  2020   12   31          7381.8       1210770  2020-12-31

[138078 rows x 6 columns]
root
 |-- Nº da Estação: string (nullable = true)
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- radiationTotal: double (nullab

In [164]:
# Create a DataFrame from csv data (automatically infer schema and data types)
rainQtyFilePath = "hdfs://hdfs-nn:9000/project_tabd/bronze/csvFiles/weather/rainQty.csv"

# Read rain quantity data
rainQty = spark.read.load(rainQtyFilePath, 
                         format="csv", sep=",", inferSchema="true", header="true")

In [165]:
#transformar schema do csv original rain quantity. 
data = stationsInfo.select(col("Nº da Estação"))

pandasDF = data.toPandas()
names = rainQty.schema.names
df_array = []
for index, row in pandasDF.iterrows():
    for name in names:
        if str(row["Nº da Estação"]) == str(name):
            df_array.append(rainQty.select(col("ANO"), col("MÊS"), col("DIA"), col(name).alias("rainQty")) \
            .withColumn("Nº da Estação", lit(name)) \
            .withColumn("Date", sf.concat(sf.format_string("%04d",col("ANO")), sf.lit("-"), sf.format_string("%02d",col("MÊS")), sf.lit("-"), sf.format_string("%02d",col("DIA")))) \
            .toPandas())
            
            
results = pd.concat(df_array, ignore_index=True)
print(results)

#Create PySpark DataFrame from Pandas
rainQty_sparkDF=spark.createDataFrame(results) 
rainQty_sparkDF.printSchema()
rainQty_sparkDF.toPandas()

rainQtySilver = rainQty_sparkDF.join(stationsInfo, "Nº da Estação")
rainQtySilver.printSchema()
rainQtySilver.toPandas()

         ANO  MÊS  DIA  rainQty Nº da Estação        Date
0       2000    1    1      0.0       1200545  2000-01-01
1       2000    1    2      0.1       1200545  2000-01-02
2       2000    1    3      0.0       1200545  2000-01-03
3       2000    1    4      0.0       1200545  2000-01-04
4       2000    1    5      0.0       1200545  2000-01-05
...      ...  ...  ...      ...           ...         ...
138073  2020   12   27      0.0       1210770  2020-12-27
138074  2020   12   28      0.7       1210770  2020-12-28
138075  2020   12   29      0.9       1210770  2020-12-29
138076  2020   12   30      0.1       1210770  2020-12-30
138077  2020   12   31      0.8       1210770  2020-12-31

[138078 rows x 6 columns]
root
 |-- ANO: integer (nullable = true)
 |-- MÊS: integer (nullable = true)
 |-- DIA: integer (nullable = true)
 |-- rainQty: double (nullable = true)
 |-- Nº da Estação: string (nullable = true)
 |-- Date: string (nullable = true)

root
 |-- Nº da Estação: string (nullable =

Unnamed: 0,Nº da Estação,ANO,MÊS,DIA,rainQty,Date,Estações,Lat,Lon
0,1200545,2000,1,1,0.0,2000-01-01,Porto - Pedras Rubras,41.233503,-8.681333
1,1200545,2000,1,2,0.1,2000-01-02,Porto - Pedras Rubras,41.233503,-8.681333
2,1200545,2000,1,3,0.0,2000-01-03,Porto - Pedras Rubras,41.233503,-8.681333
3,1200545,2000,1,4,0.0,2000-01-04,Porto - Pedras Rubras,41.233503,-8.681333
4,1200545,2000,1,5,0.0,2000-01-05,Porto - Pedras Rubras,41.233503,-8.681333
...,...,...,...,...,...,...,...,...,...
138073,1210770,2020,12,27,0.0,2020-12-27,Setúbal,38.548497,-8.890783
138074,1210770,2020,12,28,0.7,2020-12-28,Setúbal,38.548497,-8.890783
138075,1210770,2020,12,29,0.9,2020-12-29,Setúbal,38.548497,-8.890783
138076,1210770,2020,12,30,0.1,2020-12-30,Setúbal,38.548497,-8.890783


In [166]:
#create keys to join all the dataframes in one table
DF1 = avgTemperatureSilver.withColumn("key", sf.concat(sf.col("Nº da Estação"), sf.lit("_"), sf.col("Date")))

DF2 = maxTemperatureSilver.withColumn("key", sf.concat(sf.col("Nº da Estação"), sf.lit("_"), sf.col("Date")))

DF2 = DF2.select(col("key"), col("maxTemperature"))

DF3 = minTemperatureSilver.withColumn("key", sf.concat(sf.col("Nº da Estação"), sf.lit("_"), sf.col("Date")))

DF3 = DF3.select(col("key"), col("minTemperature"))

sparkDF1 = DF1.join(DF2, "key")
sparkDF2 = sparkDF1.join(DF3, "key")


DF4 = radiationTotalSilver.withColumn("key", sf.concat(sf.col("Nº da Estação"), sf.lit("_"), sf.col("Date")))

DF4 = DF4.select(col("key"), col("radiationTotal"))

sparkDF3 = sparkDF2.join(DF4, "key")


DF5 = rainQtySilver.withColumn("key", sf.concat(sf.col("Nº da Estação"), sf.lit("_"), sf.col("date")))                  
                                        
DF5 = DF5.select(col("key"), col("rainQty"))

weatherDFSilver = sparkDF3.join(DF5, "key")

weatherDFSilver = weatherDFSilver.select(to_date(col("date"), "yyyy-MM-dd").alias("date"), 
                                         col("ANO").alias("year"),
                                         col("MÊS").alias("month"),
                                         col("DIA").alias("day"),
                                         col("Nº da Estação").alias("station_id"),
                                         col("Estações").alias("station_name"),
                                         col("Lat").alias("latitude"),
                                         col("Lon").alias("longitude"),
                                         col("avgTemperature").alias("air_temperature_avg"), 
                                         col("minTemperature").alias("air_temperature_min"), 
                                         col("maxTemperature").alias("air_temperature_max"), 
                                         col("radiationTotal").alias("global_radiation_total"),
                                         col("rainQty").alias("rain_precipitation_qty"))

#sort dataframe by date and station_id
weatherDFSilver = weatherDFSilver.sort(col("date"), col("station_id"))

# df.select(col("input"),to_date(col("input"),"MM-dd-yyyy").alias("date")) \
#   .show()



In [167]:
weatherDFSilver.printSchema()
weatherDFSilver.toPandas()

root
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- station_id: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- air_temperature_avg: double (nullable = true)
 |-- air_temperature_min: double (nullable = true)
 |-- air_temperature_max: double (nullable = true)
 |-- global_radiation_total: double (nullable = true)
 |-- rain_precipitation_qty: double (nullable = true)



Unnamed: 0,date,year,month,day,station_id,station_name,latitude,longitude,air_temperature_avg,air_temperature_min,air_temperature_max,global_radiation_total,rain_precipitation_qty
0,2000-01-01,2000,1,1,1200545,Porto - Pedras Rubras,41.233503,-8.681333,5.1,1.2,10.8,-990.0,0.0
1,2000-01-01,2000,1,1,1200548,Coimbra (Aeródromo),40.157600,-8.468517,6.3,1.7,12.3,-990.0,0.0
2,2000-01-01,2000,1,1,1200551,Viana do Castelo - Chafé,41.648875,-8.804606,-990.0,-990.0,-990.0,-990.0,-990.0
3,2000-01-01,2000,1,1,1200554,Faro / Aeroporto,37.016578,-7.971953,10.8,5.7,14.8,-990.0,0.0
4,2000-01-01,2000,1,1,1200558,Évora / Aeródromo,38.536542,-7.887958,7.0,1.1,14.6,10526.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
138073,2020-12-31,2020,12,31,1210683,Guarda,40.528558,-7.278675,2.9,0.4,5.5,3451.9,3.3
138074,2020-12-31,2020,12,31,1210702,Aveiro / Universidade,40.635400,-8.659611,9.4,6.5,14.1,4223.4,15.4
138075,2020-12-31,2020,12,31,1210718,Leiria / Aeródromo,39.780553,-8.820967,7.8,2.0,13.6,4998.1,6.1
138076,2020-12-31,2020,12,31,1210734,Santarém - Fonte Boa / Est. Zootécnica,39.201261,-8.736661,7.7,1.8,15.2,6469.2,2.4


As you can see, you now have a Spark DataFrame created from csv file in HDFS :)

You can now use the Spark DataFrame functions to interact with DataFrames: Spark Dataframe operations

A complete list of DataFrame operations with examples can be found here. There you can find how to apply filter and group functions, joins, iterate through the dataframe and apply scala functions, drop dupicates, among many other operations at the DataFrame level.

A complete list of Spark SQL functions with examples to apply to the columns of DataFrames can be found here. There you can find how to check for nulls, do string replaces, among many others.

#What about SQL queries?

Did you realize what you just did? You performed a filtering and group by operation over potentially hundreds or thousands of machines working at your disposal :D

Welcome to Apache Spark, where you write code whose computation is automatically distributed for you!!!

I'm wondering if we can write SQL on top of a JSON file. Let's see…

In [168]:
# First you need to create a temporary view
weatherDFSilver.createOrReplaceTempView("weather")

# Then you can write all the SQL you want, as you're json is seen as a regular table now.
sqlized_df = spark.sql(
    """
    SELECT * 
    FROM weather
    WHERE year = 2020
    """
)

sqlized_df.toPandas()

Unnamed: 0,date,year,month,day,station_id,station_name,latitude,longitude,air_temperature_avg,air_temperature_min,air_temperature_max,global_radiation_total,rain_precipitation_qty
0,2020-01-01,2020,1,1,1200545,Porto - Pedras Rubras,41.233503,-8.681333,-990.0,-990.0,-990.0,-990.0,-990.0
1,2020-01-01,2020,1,1,1200548,Coimbra (Aeródromo),40.157600,-8.468517,8.4,3.5,14.3,-990.0,0.1
2,2020-01-01,2020,1,1,1200551,Viana do Castelo - Chafé,41.648875,-8.804606,5.1,0.2,11.5,-990.0,0.2
3,2020-01-01,2020,1,1,1200554,Faro / Aeroporto,37.016578,-7.971953,12.4,6.8,16.4,11423.2,0.0
4,2020-01-01,2020,1,1,1200558,Évora / Aeródromo,38.536542,-7.887958,8.0,1.9,16.2,-990.0,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6583,2020-12-31,2020,12,31,1210683,Guarda,40.528558,-7.278675,2.9,0.4,5.5,3451.9,3.3
6584,2020-12-31,2020,12,31,1210702,Aveiro / Universidade,40.635400,-8.659611,9.4,6.5,14.1,4223.4,15.4
6585,2020-12-31,2020,12,31,1210718,Leiria / Aeródromo,39.780553,-8.820967,7.8,2.0,13.6,4998.1,6.1
6586,2020-12-31,2020,12,31,1210734,Santarém - Fonte Boa / Est. Zootécnica,39.201261,-8.736661,7.7,1.8,15.2,6469.2,2.4


Say what?
No way, SQL over JSON, TXT or other files?? :O

That's cool and everything, but what about writing sql and visualize the results? There's no chance that's possible, pff

In [170]:
# with partitions
weatherDFSilver \
    .repartition(1) \
    .write \
    .partitionBy("year") \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/parquet_table_weather/")