# Weather Data Analytics
This notebook performs some basic weather data analytics using the PySpark RDD interface.

## Helper Methods
First we need some helper methods for converting the raw data into something that we can work with. We use Python dictionaries instead of classes, since custom classes cannot be used within Zeppelin due to serialization issues.

In [1]:
def _get_float(str):
    """
    Helper method for converting a string to a float. If this is not possible, None will be returned instead
    """
    if len(str) == 0:
        return None
    try:
        return float(str)
    except ValueError:
        return None


def extract_station(line):
    """
    Extract weather station data from a raw CSV line.

    The line is parsed with the csv module, so a comma inside a quoted field
    (for example in a station name) stays part of that field instead of
    shifting all following columns.

    :param line: One CSV record with at least 11 comma separated columns in the order
        usaf, wban, name, country, state, icao, latitude, longitude, elevation,
        begin date, end date
    :returns: A Python dictionary with one entry per column. 'latitude', 'longitude' and
        'elevation' are floats or None, all other values are strings
    :raises IndexError: If the line contains fewer than 11 columns
    """
    # Function-scope import, so this cell does not depend on imports made elsewhere
    import csv

    # csv.reader honours the quoting; the default [] covers a reader that yields no row
    row = next(csv.reader([line]), [])
    # Remove quote characters that survived parsing, as the plain split based version did
    columns = [c.replace('"', '') for c in row]

    return {
        'usaf': columns[0],
        'wban': columns[1],
        'name': columns[2],
        'country': columns[3],
        'state': columns[4],
        'icao': columns[5],
        # Numeric columns may be empty, in which case _get_float returns None
        'latitude': _get_float(columns[6]),
        'longitude': _get_float(columns[7]),
        'elevation': _get_float(columns[8]),
        'date_begin': columns[9],
        'date_end': columns[10],
    }


def extract_weather(line):
    """
    Parse one fixed-width weather record into a Python dictionary.

    All fields are read from fixed character positions of the line.

    :param line: A raw data line that is at least 93 characters long
    :returns: A dictionary with the string fields 'date', 'time', 'usaf' and 'wban', the
        float fields 'airTemperature' and 'windSpeed' and the boolean quality flags
        'airTemperatureQuality' and 'windSpeedQuality'
    :raises IndexError: If the line is too short to contain a quality character
    :raises ValueError: If a numeric field cannot be parsed as a float
    """
    return {
        'date': line[15:23],
        'time': line[23:27],
        'usaf': line[4:10],
        'wban': line[10:15],
        # A measurement only counts as valid if its quality character is '1'
        'airTemperatureQuality': line[92] == '1',
        # The raw number is divided by 10
        # NOTE(review): presumably the file stores tenths of a unit - confirm against the format spec
        'airTemperature': float(line[87:92]) / 10,
        'windSpeedQuality': line[69] == '1',
        'windSpeed': float(line[65:69]) / 10,
    }

## Test extraction methods

In [2]:
# Load stations from 's3://dimajix-training/data/weather/isd-history' as an RDD of text lines
# and transform every line into a Python dictionary using extract_station.
# `sc` is not defined in this notebook; presumably it is the SparkContext provided by the
# notebook environment.
# NOTE(review): the header line of the file is not filtered out, so the first record holds
# the column titles ('USAF', 'WBAN', ...) instead of station data, as the printed output shows.
stations = sc.textFile('s3://dimajix-training/data/weather/isd-history').map(
    extract_station
)

# Print the first five elements of the transformed RDD as a quick sanity check
for s in stations.take(5):
    print(s)

{'latitude': None, 'state': 'STATE', 'wban': 'WBAN', 'name': 'STATION NAME', 'usaf': 'USAF', 'date_end': 'END', 'date_begin': 'BEGIN', 'longitude': None, 'country': 'CTRY', 'icao': 'ICAO', 'elevation': None}
{'latitude': None, 'state': '', 'wban': '99999', 'name': 'CWOS 07005', 'usaf': '007005', 'date_end': '20120127', 'date_begin': '20120127', 'longitude': None, 'country': '', 'icao': '', 'elevation': None}
{'latitude': None, 'state': '', 'wban': '99999', 'name': 'CWOS 07011', 'usaf': '007011', 'date_end': '20121129', 'date_begin': '20111025', 'longitude': None, 'country': '', 'icao': '', 'elevation': None}
{'latitude': 0.0, 'state': '', 'wban': '99999', 'name': 'WXPOD 7018', 'usaf': '007018', 'date_end': '20130730', 'date_begin': '20110309', 'longitude': 0.0, 'country': '', 'icao': '', 'elevation': 7018.0}
{'latitude': None, 'state': '', 'wban': '99999', 'name': 'CWOS 07025', 'usaf': '007025', 'date_end': '20120127', 'date_begin': '20120127', 'longitude': None, 'country': '', 'icao':

In [3]:
# Load weather from 's3://dimajix-training/data/weather/2014' as an RDD of text lines
# and transform every line into a Python dictionary using extract_weather.
# `sc` is not defined in this notebook; presumably it is the SparkContext provided by the
# notebook environment.
weather = sc.textFile('s3://dimajix-training/data/weather/2014').map(extract_weather)

# Print the first five elements of the transformed RDD as a quick sanity check
for w in weather.take(5):
    print(w)

{'wban': '99999', 'windSpeedQuality': True, 'usaf': '010060', 'airTemperatureQuality': True, 'date': '20140101', 'windSpeed': 3.0, 'airTemperature': -13.6, 'time': '0100'}
{'wban': '99999', 'windSpeedQuality': True, 'usaf': '010060', 'airTemperatureQuality': True, 'date': '20140101', 'windSpeed': 2.0, 'airTemperature': -14.2, 'time': '0200'}
{'wban': '99999', 'windSpeedQuality': True, 'usaf': '010060', 'airTemperatureQuality': True, 'date': '20140101', 'windSpeed': 4.0, 'airTemperature': -10.7, 'time': '0400'}
{'wban': '99999', 'windSpeedQuality': True, 'usaf': '010060', 'airTemperatureQuality': True, 'date': '20140101', 'windSpeed': 3.0, 'airTemperature': -11.2, 'time': '0500'}
{'wban': '99999', 'windSpeedQuality': True, 'usaf': '010060', 'airTemperatureQuality': True, 'date': '20140101', 'windSpeed': 5.0, 'airTemperature': -10.0, 'time': '0600'}


# Join Data Sets

In order to analyse the data, we need to join the weather data with the station data, so we can get more detailed information about where the weather was actually recorded.

In [4]:
# Create a key for every weather station using the values for 'usaf' and 'wban' from every record.
# This can be done using the keyBy method. The key is the plain string concatenation of both
# values, and every element becomes a (key, station) pair.
station_index = stations.keyBy(lambda data: data['usaf'] + data['wban'])

# Create a key for every weather measurement element using the values for 'usaf' and 'wban' from every record.
# This can be done using the keyBy method. The key is built in exactly the same way as for the
# stations, so that both data sets can be matched on it.
weather_index = weather.keyBy(lambda data: data['usaf'] + data['wban'])

# Now join weather and stations together using the keyed data. This can be done using the join method.
# Because weather_index is the left side, every element has the layout (key, (weather, station)).
joined_weather = weather_index.join(station_index)

# Print some elements from joined_weather.
for d in joined_weather.take(5):
    print(d)

('72427303804', ({'wban': '03804', 'windSpeedQuality': False, 'usaf': '724273', 'airTemperatureQuality': False, 'date': '20140101', 'windSpeed': 1.5, 'airTemperature': -2.2, 'time': '0053'}, {'state': 'WV', 'wban': '03804', 'name': 'MID-OHIO VALLEY REGIONAL AIRPORT', 'usaf': '724273', 'date_end': '20151122', 'latitude': 39.2, 'longitude': -81.27, 'date_begin': '19900208', 'country': 'US', 'icao': 'KPKB', 'elevation': 253.3}))
('72427303804', ({'wban': '03804', 'windSpeedQuality': False, 'usaf': '724273', 'airTemperatureQuality': False, 'date': '20140101', 'windSpeed': 1.5, 'airTemperature': -2.2, 'time': '0153'}, {'state': 'WV', 'wban': '03804', 'name': 'MID-OHIO VALLEY REGIONAL AIRPORT', 'usaf': '724273', 'date_end': '20151122', 'latitude': 39.2, 'longitude': -81.27, 'date_begin': '19900208', 'country': 'US', 'icao': 'KPKB', 'elevation': 253.3}))
('72427303804', ({'wban': '03804', 'windSpeedQuality': False, 'usaf': '724273', 'airTemperatureQuality': False, 'date': '20140101', 'windSpe

## Caching Data

The join was really expensive. Before continuing with the next steps, you might want to cache the data and give it a nice name (for example "joined weather data").

In [5]:
# Cache the data for next operations, so the expensive join does not have to be repeated.
# The RDD is also given a readable name; that name shows up in the cell output.
joined_weather.setName("joined weather data").cache()

joined weather data PythonRDD[14] at RDD at PythonRDD.scala:43

## Create appropriate Keys
We want to analyze the data grouped by country and year. So we need to create appropriate keys.

This will be done using a helper method extract_country_year_weather, which should return a tuple

    ((country, year), weather)

for every record in joined_weather.

Pay attention to the layout of the elements in joined_weather, as can be seen from the output above.

In [6]:
def extract_country_year_weather(data):
    """
    Re-key one joined record by country and year.

    :param data: A nested tuple of the form (key, (weather, station)), where weather and
        station are Python dictionaries
    :returns: A tuple ((country, year), weather). The country is taken from the station and
        the year consists of the first four characters of the measurement's 'date' entry
    """
    # The second tuple element holds the joined pair; the join key itself is not needed
    joined_pair = data[1]
    station_info = joined_pair[1]
    measurement = joined_pair[0]
    return ((station_info['country'], measurement['date'][:4]), measurement)


# Re-key every joined record, so that each element has the layout ((country, year), weather)
weather_per_country_and_year = joined_weather.map(extract_country_year_weather)

## Perform Aggregation
We want to extract minimum and maximum of wind speed and of temperature per year and country (i.e. using the joined data above). We also want to consider cases where data is not valid (i.e. windSpeedQuality is False or airTemperatureQuality is False).

We will implement custom aggregation functions that work on dictionaries

In [7]:
def nullsafe_min(a, b):
    """
    Return the smaller of two values, where None means "no value".

    :param a: First value or None
    :param b: Second value or None
    :returns: The other argument if one of them is None (None if both are None),
        otherwise the minimum of both
    """
    # Refer to the builtin explicitly, in case the name min has been rebound in the notebook
    import builtins

    if a is not None and b is not None:
        return builtins.min(a, b)
    return b if a is None else a


def nullsafe_max(a, b):
    """
    Return the larger of two values, where None means "no value".

    :param a: First value or None
    :param b: Second value or None
    :returns: The other argument if one of them is None (None if both are None),
        otherwise the maximum of both
    """
    # Refer to the builtin explicitly, in case the name max has been rebound in the notebook
    import builtins

    if a is not None and b is not None:
        return builtins.max(a, b)
    return b if a is None else a


# Start value of the aggregation: all four min/max entries are None, i.e. "no value seen yet"
zero_wmm = dict.fromkeys(
    ('minTemperature', 'maxTemperature', 'minWindSpeed', 'maxWindSpeed')
)


def reduce_wmm(wmm, data):
    """
    Merge a single weather measurement into existing min/max information. Neither argument
    is modified, a new dictionary is returned instead.

    :param wmm: A Python dictionary representing min/max information
    :param data: A Python dictionary representing weather measurement information
    :returns: A new Python dictionary representing min/max information
    """
    temperature_valid = data['airTemperatureQuality']
    # Start from the current extremes; they are kept as they are for invalid measurements
    min_temperature = wmm['minTemperature']
    max_temperature = wmm['maxTemperature']
    if temperature_valid:
        temperature = data['airTemperature']
        min_temperature = nullsafe_min(min_temperature, temperature)
        max_temperature = nullsafe_max(max_temperature, temperature)

    wind_valid = data['windSpeedQuality']
    min_wind_speed = wmm['minWindSpeed']
    max_wind_speed = wmm['maxWindSpeed']
    if wind_valid:
        wind_speed = data['windSpeed']
        min_wind_speed = nullsafe_min(min_wind_speed, wind_speed)
        max_wind_speed = nullsafe_max(max_wind_speed, wind_speed)

    return {
        'minTemperature': min_temperature,
        'maxTemperature': max_temperature,
        'minWindSpeed': min_wind_speed,
        'maxWindSpeed': max_wind_speed,
    }


def combine_wmm(left, right):
    """
    Combine two min/max dictionaries into a new one. Neither argument is modified.

    :param left: First Python dictionary representing min/max information
    :param right: Second Python dictionary representing min/max information
    :returns: A new Python dictionary representing combined min/max information
    """
    # Every entry is merged with the None-tolerant helper that matches its meaning
    merge_functions = (
        ('minTemperature', nullsafe_min),
        ('maxTemperature', nullsafe_max),
        ('minWindSpeed', nullsafe_min),
        ('maxWindSpeed', nullsafe_max),
    )
    return {key: merge(left[key], right[key]) for key, merge in merge_functions}

In [8]:
# Aggregate min/max information per year and country.
#   zero_wmm    is the start value for every key (all entries None)
#   reduce_wmm  merges a single weather measurement into the min/max dictionary
#   combine_wmm merges two min/max dictionaries
weather_minmax = weather_per_country_and_year.aggregateByKey(
    zero_wmm, reduce_wmm, combine_wmm
)

# Print the first five ((country, year), min/max dictionary) results
for m in weather_minmax.take(5):
    print(m)

(('FI', '2014'), {'maxWindSpeed': 18.0, 'minTemperature': -28.6, 'maxTemperature': 30.3, 'minWindSpeed': 0.0})
(('BE', '2014'), {'maxWindSpeed': 16.0, 'minTemperature': -7.0, 'maxTemperature': 33.1, 'minWindSpeed': 0.0})
(('DA', '2014'), {'maxWindSpeed': 17.0, 'minTemperature': -9.0, 'minWindSpeed': 0.0, 'maxTemperature': 30.2})
(('US', '2014'), {'maxWindSpeed': 31.0, 'minTemperature': -37.2, 'maxTemperature': 41.2, 'minWindSpeed': 0.0})
(('IC', '2014'), {'maxWindSpeed': 29.3, 'minTemperature': -7.0, 'minWindSpeed': 0.0, 'maxTemperature': 18.0})


# Format Output

We want to create CSV data, so we need to reformat the Python dicts into nicely formatted strings.

In [9]:
def format_result(row):
    """
    Format one aggregation result as a CSV line.

    :param row: A tuple (key, value), where key is (country, year) and value is a Python
        dictionary containing min/max information
    :returns: The UTF-8 encoded bytes of the line
        'country,year,minTemperature,maxTemperature,minWindSpeed,maxWindSpeed'
    """
    key, stats = row
    country, year = key[0], key[1]
    # Missing values (None) are written as 0.0
    field_names = ('minTemperature', 'maxTemperature', 'minWindSpeed', 'maxWindSpeed')
    numbers = [stats[name] or 0.0 for name in field_names]
    csv_line = "%s,%s,%f,%f,%f,%f" % tuple([country, year] + numbers)
    # The line is returned as bytes, not as str
    return csv_line.encode('utf-8')


# Format every aggregated record as a CSV line and collect all lines into `result`
result = weather_minmax.map(format_result).collect()

# format_result returns bytes, so every line is printed in the form b'...'
for l in result:
    print(l)

b'FI,2014,-28.600000,30.300000,0.000000,18.000000'
b'BE,2014,-7.000000,33.100000,0.000000,16.000000'
b'DA,2014,-9.000000,30.200000,0.000000,17.000000'
b'US,2014,-37.200000,41.200000,0.000000,31.000000'
b'IC,2014,-7.000000,18.000000,0.000000,29.300000'
b'AU,2014,-11.000000,34.000000,0.000000,16.500000'
b'CH,2014,11.000000,34.000000,0.000000,15.000000'
b'SF,2014,0.900000,37.400000,0.000000,13.400000'
b'AS,2014,0.900000,45.600000,0.000000,14.400000'
b'UK,2014,-6.000000,30.400000,0.000000,20.600000'
b'PO,2014,-1.000000,32.000000,0.000000,15.400000'
b'LU,2014,-10.000000,32.100000,0.000000,13.400000'
b'NL,2014,-9.000000,35.000000,0.000000,27.300000'
b'PL,2014,-15.000000,32.000000,0.000000,14.900000'
b'GM,2014,-9.000000,31.000000,0.000000,13.400000'
b'SC,2014,20.000000,32.000000,0.000000,30.400000'
b'SW,2014,-34.500000,28.900000,1.000000,16.000000'
b'AM,2014,-19.000000,39.000000,0.000000,16.000000'
b'NO,2014,-35.700000,32.000000,0.000000,35.500000'
b'FR,2014,-9.000000,36.100000,0.000000,16.50

# Bonus: Process all Years

In [None]:
def load_year(year, base_dir='data/weather'):
    """
    Load the weather measurements of a single year and parse them with extract_weather.

    :param year: The year to load as an integer. It is used as the name of the
        sub-directory below base_dir
    :param base_dir: Directory that contains one sub-directory per year. Defaults to
        'data/weather', which is the location this function has always used
    :returns: The RDD obtained from sc.textFile(...).map(extract_weather)
    """
    # NOTE(review): the single-year cell of this notebook reads from
    # 's3://dimajix-training/data/weather/2014'. Pass base_dir='s3://dimajix-training/data/weather'
    # if all years should come from the same place - confirm where the data actually lives.
    # A trailing slash is removed so that the path does not contain an empty segment.
    dirname = '%s/%d' % (base_dir.rstrip('/'), year)
    return sc.textFile(dirname).map(extract_weather)


# Load one RDD per year for 2004 up to and including 2014 and merge them into a single RDD
years_data = [load_year(year) for year in range(2004, 2015)]
all_weather_data = sc.union(years_data)

# Key every measurement by 'usaf' + 'wban', exactly as in the single-year analysis.
# From here on the names of the single-year analysis are re-bound to the multi-year data.
weather_index = all_weather_data.keyBy(lambda data: data['usaf'] + data['wban'])

# Now join weather and stations together using the keyed data. This can be done using the join method
joined_weather = weather_index.join(station_index)
# Re-key by (country, year) and aggregate min/max information per key
weather_per_country_and_year = joined_weather.map(extract_country_year_weather)
weather_minmax = weather_per_country_and_year.aggregateByKey(
    zero_wmm, reduce_wmm, combine_wmm
)
# Format every aggregated record as a CSV line and collect all lines into `result`
result = weather_minmax.map(format_result).collect()

# format_result returns bytes, so every line is printed in the form b'...'
for l in result:
    print(l)