# US Cities Temperature
## 1. ETL Process

In [1]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType as R, StructField as Fld, StringType as Str, DateType as Date, DecimalType as Dec
from pyspark.sql.functions import count, when, col

In [2]:
# Create (or reuse) the Spark session for this notebook. The hadoop-aws
# package is requested up front; the optional s3a:// output further down uses it.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0')
    .appName('us_cities_temperature')
    .getOrCreate()
)

Enter AWS credentials below and uncomment if data are to be stored in S3:

In [3]:
# Optional S3 setup. Every statement in this cell is commented out, so running
# the cell as-is changes nothing. To write the output to S3, uncomment the
# calls below and replace the <enter ...> placeholders with string values.
# NOTE(review): do not commit real keys with the notebook; reading them from
# the environment (e.g. os.environ -- `os` is already imported) is safer.

# Access key for the s3a filesystem.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.access.key', <enter access key>)

# Secret key for the s3a filesystem.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.secret.key', <enter secret key>)

# S3 endpoint the s3a filesystem connects to.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.endpoint', 's3.amazonaws.com')

# Selects version 2 of the file output committer algorithm.
# NOTE(review): presumably chosen to speed up committing output files --
# confirm this is wanted for the Hadoop version in use.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('mapreduce.fileoutputcommitter.algorithm.version', '2')

First, a custom schema is defined so that `dt` is read as a date, the average temperature and its uncertainty are read as decimals rounded to 2 decimal places, and the remaining columns are read as strings.

In [4]:
# Explicit schema for the temperature CSV, built from (column name, Spark type)
# pairs in the order the columns are declared. Dec(4,2) holds at most 4 digits,
# 2 of them after the decimal point.
temperatureSchema = R([
    Fld(column_name, column_type)
    for column_name, column_type in [
        ('dt', Date()),
        ('AverageTemperature', Dec(4,2)),
        ('AverageTemperatureUncertainty', Dec(4,2)),
        ('City', Str()),
        ('Country', Str()),
        ('Latitude', Str()),
        ('Longitude', Str()),
    ]
])

The data are read from the .csv file into a Spark DataFrame.

In [5]:
# Load the raw temperature CSV with the explicit schema; the first line of the
# file is treated as a header row.
path = 'datasets/GlobalLandTemperaturesByCity.csv'
df = (
    spark.read
    .schema(temperatureSchema)
    .option('header', True)
    .csv(path)
)

The average temperature and temperature uncertainty are calculated only for cities in the U.S., using measurements from 2000 through 2012. Measurements from 2013 are ignored because October, November and December are missing for that year, which would make its average temperature misleadingly high.

In [6]:
# Register the raw DataFrame as the temporary view 'temperatures' so that the
# SQL query in the next cell can refer to it by name.
df.createOrReplaceTempView('temperatures')

In [7]:
# Average temperature and temperature uncertainty per U.S. city for the years
# 2000 through 2012.
# - City names are upper-cased and 'SAINT' is abbreviated to 'ST'. The query
#   groups by this normalized name (not the raw column), so the result has
#   exactly one row per `city` value even if several raw spellings normalize
#   to the same name.
# - The year filter is applied directly on `dt`; no intermediate subquery is
#   needed because no other derived column is used.
# - String literals use single quotes (standard SQL); double quotes can be
#   parsed as identifiers depending on the session's SQL settings.
# - Averages are cast back to DECIMAL(4,2), i.e. rounded to 2 decimal places.
df = spark.sql("""
    SELECT REPLACE(UPPER(City), 'SAINT', 'ST') AS city,
           CAST(AVG(AverageTemperature) AS DECIMAL(4,2)) AS average_temperature,
           CAST(AVG(AverageTemperatureUncertainty) AS DECIMAL(4,2)) AS average_temperature_uncertainty
      FROM temperatures
     WHERE Country = 'United States'
       AND YEAR(dt) BETWEEN 2000 AND 2012
     GROUP BY REPLACE(UPPER(City), 'SAINT', 'ST')
     ORDER BY city
""")

Save DataFrame as .parquet in s3a://us-immigration-project or locally.

In [8]:
# Persisting the result is optional: both writes below are commented out, so
# this cell does nothing as-is. Uncomment one of them to save the aggregated
# DataFrame as parquet; the 'overwrite' mode replaces existing output at the
# target path.

# write to S3 bucket
# NOTE(review): presumably requires the S3 credentials cell near the top of
# the notebook to be filled in and run first -- confirm.
# output_data = 's3a://us-immigration-project/'
# df.write.parquet(os.path.join(output_data, 'us_cities_temperature'), 'overwrite')

# for local write
# df.write.parquet('./output/us_cities_temperature', 'overwrite')

## 2. Dataset Info

DataFrame Schema:

In [9]:
# Print the column names, types and nullability of the aggregated DataFrame.
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- average_temperature: decimal(4,2) (nullable = true)
 |-- average_temperature_uncertainty: decimal(4,2) (nullable = true)



Sample DataFrame records:

In [10]:
# Preview the first 2 rows, one field per line (vertical layout) and without
# truncating long values.
df.show(n=2, truncate=False, vertical=True)

-RECORD 0----------------------------------
 city                            | ABILENE 
 average_temperature             | 17.76   
 average_temperature_uncertainty | 0.27    
-RECORD 1----------------------------------
 city                            | AKRON   
 average_temperature             | 10.75   
 average_temperature_uncertainty | 0.25    
only showing top 2 rows



DataFrame rows and columns:

In [11]:
# Report the DataFrame's dimensions: number of rows and number of columns.
row_total = df.count()
column_total = len(df.columns)
print('rows: {}'.format(row_total), 'columns: {}'.format(column_total))

rows: 248 columns: 3


## 3. Data Quality Check

Check for nulls in each column:

In [12]:
# For each column, count the rows whose value is null: `when` yields a non-null
# marker only for null cells, and `count` counts those non-null markers.
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
df.select(null_counts).show(truncate=False, vertical=True)

-RECORD 0------------------------------
 city                            | 0   
 average_temperature             | 0   
 average_temperature_uncertainty | 0   



## 4. Primary Key & Columns Related to Other Datasets

Total Cities in DataFrame:

In [13]:
# Count the rows of the `city` column. Ordering has no effect on a count, so
# no sort is applied before counting.
df.select('city').count()

248

Display all cities:

In [14]:
# List every city in alphabetical order. The row limit is taken from the
# DataFrame itself instead of being hard-coded, so the listing stays complete
# if the number of cities changes.
df.select('city').orderBy('city').show(df.count(), truncate=False)

+-----------------+
|city             |
+-----------------+
|ABILENE          |
|AKRON            |
|ALBUQUERQUE      |
|ALEXANDRIA       |
|ALLENTOWN        |
|AMARILLO         |
|ANAHEIM          |
|ANCHORAGE        |
|ANN ARBOR        |
|ANTIOCH          |
|ARLINGTON        |
|ARVADA           |
|ATLANTA          |
|AURORA           |
|AUSTIN           |
|BAKERSFIELD      |
|BALTIMORE        |
|BATON ROUGE      |
|BEAUMONT         |
|BELLEVUE         |
|BERKELEY         |
|BIRMINGHAM       |
|BOSTON           |
|BRIDGEPORT       |
|BROWNSVILLE      |
|BUFFALO          |
|BURBANK          |
|CAMBRIDGE        |
|CAPE CORAL       |
|CARROLLTON       |
|CARY             |
|CEDAR RAPIDS     |
|CHANDLER         |
|CHARLESTON       |
|CHARLOTTE        |
|CHATTANOOGA      |
|CHESAPEAKE       |
|CHICAGO          |
|CHULA VISTA      |
|CINCINNATI       |
|CLARKSVILLE      |
|CLEARWATER       |
|CLEVELAND        |
|COLORADO SPRINGS |
|COLUMBIA         |
|COLUMBUS         |
|CONCORD          |
