In [0]:
# Load the `bronze_global_temperatures` table; it is the input for the
# global silver transform in the following cells.
bronze_global_df = spark.read.table("bronze_global_temperatures")


In [0]:
# Print column names, types and nullability of the bronze input so the
# transform below can be checked against the actual schema.
bronze_global_df.printSchema()


root
 |-- dt: date (nullable = true)
 |-- LandAverageTemperature: double (nullable = true)
 |-- LandAverageTemperatureUncertainty: double (nullable = true)
 |-- LandMaxTemperature: double (nullable = true)
 |-- LandMaxTemperatureUncertainty: double (nullable = true)
 |-- LandMinTemperature: double (nullable = true)
 |-- LandMinTemperatureUncertainty: double (nullable = true)
 |-- LandAndOceanAverageTemperature: double (nullable = true)
 |-- LandAndOceanAverageTemperatureUncertainty: double (nullable = true)



In [0]:
from pyspark.sql.functions import to_date, col

# Add a `date` column converted from `dt`, then drop `dt` so the frame carries
# a single date column (appended as the last column).
# NOTE(review): the bronze schema printed in this notebook already reports
# `dt` as a date, so the conversion is presumably a guard against a
# string-typed `dt` — confirm before simplifying.
silver_global_df = bronze_global_df.withColumn("date", to_date("dt")).drop("dt")


In [0]:
# Keep only rows that have a date. Measurement columns are not filtered, so
# rows with NULL temperatures are retained.
silver_global_df = silver_global_df.where(col("date").isNotNull())


In [0]:
# Inspect the transformed frame before persisting it: schema first, then the
# first five rows with full (untruncated) cell values.
silver_global_df.printSchema()
silver_global_df.show(5, truncate=False)


root
 |-- LandAverageTemperature: double (nullable = true)
 |-- LandAverageTemperatureUncertainty: double (nullable = true)
 |-- LandMaxTemperature: double (nullable = true)
 |-- LandMaxTemperatureUncertainty: double (nullable = true)
 |-- LandMinTemperature: double (nullable = true)
 |-- LandMinTemperatureUncertainty: double (nullable = true)
 |-- LandAndOceanAverageTemperature: double (nullable = true)
 |-- LandAndOceanAverageTemperatureUncertainty: double (nullable = true)
 |-- date: date (nullable = true)

+----------------------+---------------------------------+------------------+-----------------------------+------------------+-----------------------------+------------------------------+-----------------------------------------+----------+
|LandAverageTemperature|LandAverageTemperatureUncertainty|LandMaxTemperature|LandMaxTemperatureUncertainty|LandMinTemperature|LandMinTemperatureUncertainty|LandAndOceanAverageTemperature|LandAndOceanAverageTemperatureUncertainty|date      |
+-

In [0]:
# Persist the cleaned global frame as the Delta table
# `silver_global_temperatures`. Overwrite mode replaces the table's existing
# contents on every run.
silver_global_df.write.saveAsTable(
    "silver_global_temperatures",
    format="delta",
    mode="overwrite",
)


In [0]:
# Read the saved table back as a smoke test. There is no ORDER BY, so which
# five rows come back is not guaranteed to be stable between runs.
spark.sql(
    "SELECT * "
    "FROM silver_global_temperatures "
    "LIMIT 5"
).show(truncate=False)


+----------------------+---------------------------------+------------------+-----------------------------+------------------+-----------------------------+------------------------------+-----------------------------------------+----------+
|LandAverageTemperature|LandAverageTemperatureUncertainty|LandMaxTemperature|LandMaxTemperatureUncertainty|LandMinTemperature|LandMinTemperatureUncertainty|LandAndOceanAverageTemperature|LandAndOceanAverageTemperatureUncertainty|date      |
+----------------------+---------------------------------+------------------+-----------------------------+------------------+-----------------------------+------------------------------+-----------------------------------------+----------+
|3.0340000000000003    |3.574                            |NULL              |NULL                         |NULL              |NULL                         |NULL                          |NULL                                     |1750-01-01|
|3.083                 |3.702       

In [0]:
# Load the `bronze_land_temperatures_country` table; it is the input for the
# country silver transform in the following cells.
bronze_country_df = spark.read.table("bronze_land_temperatures_country")


In [0]:
# Inspect the bronze input: schema first, then the first five rows with full
# (untruncated) cell values.
bronze_country_df.printSchema()
bronze_country_df.show(5, truncate=False)


root
 |-- dt: date (nullable = true)
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- Country: string (nullable = true)

+----------+------------------+-----------------------------+-------+
|dt        |AverageTemperature|AverageTemperatureUncertainty|Country|
+----------+------------------+-----------------------------+-------+
|1743-11-01|4.3839999999999995|2.294                        |Åland  |
|1743-12-01|NULL              |NULL                         |Åland  |
|1744-01-01|NULL              |NULL                         |Åland  |
|1744-02-01|NULL              |NULL                         |Åland  |
|1744-03-01|NULL              |NULL                         |Åland  |
+----------+------------------+-----------------------------+-------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import to_date, col

# Add a `date` column converted from `dt`, then drop `dt` so the frame carries
# a single date column (appended as the last column).
# NOTE(review): the bronze schema printed in this notebook already reports
# `dt` as a date, so the conversion is presumably a guard against a
# string-typed `dt` — confirm before simplifying.
silver_country_df = bronze_country_df.withColumn("date", to_date("dt")).drop("dt")


In [0]:
# Keep only rows where both key columns (date, Country) are present.
# Temperature columns are not filtered, so NULL measurements are retained.
silver_country_df = silver_country_df.where(
    col("date").isNotNull() & col("Country").isNotNull()
)


In [0]:
# Lower-case the key column name. The measurement columns keep their original
# PascalCase names.
silver_country_df = silver_country_df.withColumnRenamed("Country", "country")


In [0]:
# Inspect the transformed frame before persisting it: schema first, then the
# first five rows with full (untruncated) cell values.
silver_country_df.printSchema()
silver_country_df.show(5, truncate=False)


root
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- country: string (nullable = true)
 |-- date: date (nullable = true)

+------------------+-----------------------------+-------+----------+
|AverageTemperature|AverageTemperatureUncertainty|country|date      |
+------------------+-----------------------------+-------+----------+
|4.3839999999999995|2.294                        |Åland  |1743-11-01|
|NULL              |NULL                         |Åland  |1743-12-01|
|NULL              |NULL                         |Åland  |1744-01-01|
|NULL              |NULL                         |Åland  |1744-02-01|
|NULL              |NULL                         |Åland  |1744-03-01|
+------------------+-----------------------------+-------+----------+
only showing top 5 rows


In [0]:
# Persist the cleaned country frame as the Delta table
# `silver_land_temperatures_country`. Overwrite mode replaces the table's
# existing contents on every run.
silver_country_df.write.saveAsTable(
    "silver_land_temperatures_country",
    format="delta",
    mode="overwrite",
)


In [0]:
# Read the saved table back as a smoke test. There is no ORDER BY, so which
# five rows come back is not guaranteed to be stable between runs.
spark.sql(
    "SELECT country, date, AverageTemperature "
    "FROM silver_land_temperatures_country "
    "LIMIT 5"
).show(truncate=False)


+-------+----------+------------------+
|country|date      |AverageTemperature|
+-------+----------+------------------+
|Åland  |1743-11-01|4.3839999999999995|
|Åland  |1743-12-01|NULL              |
|Åland  |1744-01-01|NULL              |
|Åland  |1744-02-01|NULL              |
|Åland  |1744-03-01|NULL              |
+-------+----------+------------------+



In [0]:
# Load the `bronze_land_temperatures_state` table; it is the input for the
# state silver transform in the following cells.
bronze_state_df = spark.read.table("bronze_land_temperatures_state")


In [0]:
# Inspect the bronze input: schema first, then the first five rows with full
# (untruncated) cell values.
bronze_state_df.printSchema()
bronze_state_df.show(5, truncate=False)


root
 |-- dt: date (nullable = true)
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)

+----------+------------------+-----------------------------+-----+-------+
|dt        |AverageTemperature|AverageTemperatureUncertainty|State|Country|
+----------+------------------+-----------------------------+-----+-------+
|1855-05-01|25.544            |1.171                        |Acre |Brazil |
|1855-06-01|24.228            |1.103                        |Acre |Brazil |
|1855-07-01|24.371            |1.044                        |Acre |Brazil |
|1855-08-01|25.427            |1.073                        |Acre |Brazil |
|1855-09-01|25.675            |1.014                        |Acre |Brazil |
+----------+------------------+-----------------------------+-----+-------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import to_date, col

# Add a `date` column converted from `dt`, then drop `dt` so the frame carries
# a single date column (appended as the last column).
# NOTE(review): the bronze schema printed in this notebook already reports
# `dt` as a date, so the conversion is presumably a guard against a
# string-typed `dt` — confirm before simplifying.
silver_state_df = bronze_state_df.withColumn("date", to_date("dt")).drop("dt")


In [0]:
# Keep only rows where all three key columns (date, State, Country) are
# present. Temperature columns are not filtered.
silver_state_df = silver_state_df.where(
    col("date").isNotNull()
    & col("State").isNotNull()
    & col("Country").isNotNull()
)


In [0]:
# Lower-case the key column names in one call. The measurement columns keep
# their original PascalCase names.
silver_state_df = silver_state_df.withColumnsRenamed(
    {"State": "state", "Country": "country"}
)


In [0]:
# Inspect the transformed frame before persisting it: schema first, then the
# first five rows with full (untruncated) cell values.
silver_state_df.printSchema()
silver_state_df.show(5, truncate=False)


root
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date: date (nullable = true)

+------------------+-----------------------------+-----+-------+----------+
|AverageTemperature|AverageTemperatureUncertainty|state|country|date      |
+------------------+-----------------------------+-----+-------+----------+
|25.544            |1.171                        |Acre |Brazil |1855-05-01|
|24.228            |1.103                        |Acre |Brazil |1855-06-01|
|24.371            |1.044                        |Acre |Brazil |1855-07-01|
|25.427            |1.073                        |Acre |Brazil |1855-08-01|
|25.675            |1.014                        |Acre |Brazil |1855-09-01|
+------------------+-----------------------------+-----+-------+----------+
only showing top 5 rows


In [0]:
# Persist the cleaned state frame as the Delta table
# `silver_land_temperatures_state`. Overwrite mode replaces the table's
# existing contents on every run.
silver_state_df.write.saveAsTable(
    "silver_land_temperatures_state",
    format="delta",
    mode="overwrite",
)


In [0]:
# Read the saved table back as a smoke test. There is no ORDER BY, so which
# five rows come back is not guaranteed to be stable between runs.
spark.sql(
    "SELECT country, state, date, AverageTemperature "
    "FROM silver_land_temperatures_state "
    "LIMIT 5"
).show(truncate=False)


+-------+-----+----------+------------------+
|country|state|date      |AverageTemperature|
+-------+-----+----------+------------------+
|Brazil |Acre |1855-05-01|25.544            |
|Brazil |Acre |1855-06-01|24.228            |
|Brazil |Acre |1855-07-01|24.371            |
|Brazil |Acre |1855-08-01|25.427            |
|Brazil |Acre |1855-09-01|25.675            |
+-------+-----+----------+------------------+



In [0]:
# Load the `bronze_land_temperatures_city` table; it is the input for the
# city silver transform in the following cells.
bronze_city_df = spark.read.table("bronze_land_temperatures_city")


In [0]:
# Inspect the bronze input: schema first, then the first five rows with full
# (untruncated) cell values.
bronze_city_df.printSchema()
bronze_city_df.show(5, truncate=False)


root
 |-- dt: date (nullable = true)
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)

+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|dt        |AverageTemperature|AverageTemperatureUncertainty|City   |Country      |Latitude|Longitude|
+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|1849-01-01|26.704            |1.435                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |
|1849-02-01|27.434            |1.362                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |
|1849-03-01|28.101            |1.612                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |
|1849-04-01|26.14             |1.3869999999999998           |Abidjan|Côte D'I

In [0]:
from pyspark.sql.functions import to_date, col

# Add a `date` column converted from `dt`, then drop `dt` so the frame carries
# a single date column (appended as the last column).
# NOTE(review): the bronze schema printed in this notebook already reports
# `dt` as a date, so the conversion is presumably a guard against a
# string-typed `dt` — confirm before simplifying.
silver_city_df = bronze_city_df.withColumn("date", to_date("dt")).drop("dt")


In [0]:
# Keep only rows where all three key columns (date, City, Country) are
# present. Temperature and coordinate columns are not filtered.
silver_city_df = silver_city_df.where(
    col("date").isNotNull()
    & col("City").isNotNull()
    & col("Country").isNotNull()
)


In [0]:
# Lower-case the descriptive column names in one call. The measurement columns
# keep their original PascalCase names.
# NOTE(review): latitude/longitude are only renamed here; the schema printed
# in this notebook shows them as strings (e.g. "5.63N"), not numeric degrees.
silver_city_df = silver_city_df.withColumnsRenamed(
    {
        "City": "city",
        "Country": "country",
        "Latitude": "latitude",
        "Longitude": "longitude",
    }
)


In [0]:
# Inspect the transformed frame before persisting it: schema first, then the
# first five rows with full (untruncated) cell values.
silver_city_df.printSchema()
silver_city_df.show(5, truncate=False)


root
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- date: date (nullable = true)

+------------------+-----------------------------+-------+-------------+--------+---------+----------+
|AverageTemperature|AverageTemperatureUncertainty|city   |country      |latitude|longitude|date      |
+------------------+-----------------------------+-------+-------------+--------+---------+----------+
|26.704            |1.435                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |1849-01-01|
|27.434            |1.362                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |1849-02-01|
|28.101            |1.612                        |Abidjan|Côte D'Ivoire|5.63N   |3.23W    |1849-03-01|
|26.14             |1.3869999999999998           |Abidjan|Côte D'Ivoire|5.6

In [0]:
# Persist the cleaned city frame as the Delta table
# `silver_land_temperatures_city`. Overwrite mode replaces the table's
# existing contents on every run.
silver_city_df.write.saveAsTable(
    "silver_land_temperatures_city",
    format="delta",
    mode="overwrite",
)


In [0]:
# Read the saved table back as a smoke test. There is no ORDER BY, so which
# five rows come back is not guaranteed to be stable between runs.
spark.sql(
    "SELECT city, country, date, AverageTemperature "
    "FROM silver_land_temperatures_city "
    "LIMIT 5"
).show(truncate=False)


+-------+-------------+----------+------------------+
|city   |country      |date      |AverageTemperature|
+-------+-------------+----------+------------------+
|Abidjan|Côte D'Ivoire|1849-01-01|26.704            |
|Abidjan|Côte D'Ivoire|1849-02-01|27.434            |
|Abidjan|Côte D'Ivoire|1849-03-01|28.101            |
|Abidjan|Côte D'Ivoire|1849-04-01|26.14             |
|Abidjan|Côte D'Ivoire|1849-05-01|25.427            |
+-------+-------------+----------+------------------+

