In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Read the raw weather observations CSV into a DataFrame.
# NOTE(review): the saved cell output echoes this statement, which is how Python
# shows the source line of a warning - possibly a mixed-dtype warning; confirm.
weather_csv_path = "../Project_data/Weather Data/74509023244.csv"
raw_data = pd.read_csv(weather_csv_path)

  raw_data = pd.read_csv("../Project_data/Weather Data/74509023244.csv")


In [3]:
# Index the raw observations by their parsed DATE timestamp.
# The DATE column itself stays in the frame.
observation_times = pd.to_datetime(raw_data['DATE'])
raw_data.index = observation_times

In [4]:
# Only use Source 7 (ASOS/AWOS observation merged with USAF SURFACE HOURLY observation).
#
# Compare numerically after coercion: if pandas parsed SOURCE as text, or as a mix
# of ints and strings (e.g. because other rows carry non-numeric source codes), a
# plain `== 7` would silently drop every row stored as the string "7".
# Values that are not numeric become NaN and therefore never match.
source_code = pd.to_numeric(raw_data['SOURCE'], errors='coerce')
raw_data = raw_data[source_code == 7]

In [5]:
# Start the output frame with no columns, carrying over raw_data's timestamp index.
# The copy keeps later column assignments from touching raw_data.
data = raw_data.loc[:, []].copy()

# Extract Data
Pull out the relevant fields from the NOAA data.
Field layouts follow the Federal Climate Complex Data Documentation for Integrated Surface Data (ISD).

https://www.ibm.com/docs/en/environmental-intel-suite?topic=efa-2-day-hourly-forecast-48-hour

### Available Fields from api.weather.com forecasts
```
"cloudCover":            Cloud Cover (0-100)  
"pressureMeanSeaLevel":  Pressure (hPa)  
"temperature":           Temperature (C)  
"temperatureDewPoint":   Dew Point (C)  
"visibility":            Visibility (km)  
"windDirection":         Wind Direction (degrees)  
"windSpeed":             Wind Speed (km/h)  
```

In [6]:
# GA1 - SKY-COVER-LAYER
# Field order: <coverage_code>,<coverage_quality>,<base_height>,<height_quality>,<cloud_type>,<cloud_type_quality>
# Only the first field (the layer's coverage code) is read here.
# NOTE(review): the following cell assigns data['cloudCover'] again from GD1, so the
# values computed here are replaced before export.

# Coverage code -> cloud cover on a 0-100 scale. None marks codes without a usable value.
CLOUD_COVER_MAP = {
    # None, SKC or CLR
    "00": 0,
    # One okta - 1/10 or less but not zero
    "01": 10,
    # Two oktas - 2/10 - 3/10, or FEW
    "02": 25,
    # Three oktas - 4/10
    "03": 40,
    # Four oktas - 5/10, or SCT
    "04": 50,
    # Five oktas - 6/10
    "05": 60,
    # Six oktas - 7/10 - 8/10
    "06": 75,
    # Seven oktas - 9/10 or more but not 10/10, or BKN
    "07": 90,
    # Eight oktas - 10/10, or OVC
    "08": 100,
    # Sky obscured, or cloud amount cannot be estimated (to be filled in later)
    "09": None,
}

ga1_fields = raw_data['GA1'].str.split(",", expand=True)
ga1_coverage_code = ga1_fields[0]
data['cloudCover'] = ga1_coverage_code.map(CLOUD_COVER_MAP)
    

In [7]:
# GD1 - SKY-COVER-SUMMATION-STATE
# Field order: <coverage_code>,<coverage_code2>,<coverage_quality>,<height>,<height_quality>,<characteristic_code>
# The coverage code denotes the portion of the total celestial dome covered by all
# layers of clouds and other obscuring phenomena at or below a given height.

# Coverage code -> cloud cover on a 0-100 scale. None marks codes without a usable value.
CLOUD_COVER_MAP = {
    # Clear - No coverage
    "0": 0,
    # FEW - 2/8 or less coverage (not including zero)
    "1": 12,
    # SCATTERED - 3/8-4/8 coverage
    "2": 44,
    # BROKEN - 5/8-7/8 coverage
    "3": 75,
    # OVERCAST - 8/8 coverage
    "4": 100,
    # OBSCURED
    "5": None,
    # PARTIALLY OBSCURED
    "6": None,
    # MISSING
    "9": None,
}

gd1_coverage_code = raw_data['GD1'].str.split(",", expand=True)[0]

# The levels are discrete, so carry the last known value forward instead of
# interpolating between them. Leading gaps (before the first valid code) stay empty.
data['cloudCover'] = gd1_coverage_code.map(CLOUD_COVER_MAP).ffill()

In [8]:
# MA1 - ATMOSPHERIC-PRESSURE-OBSERVATION
# Field order: <altimeter_setting>,<altimeter_quality>,<station_pressure>,<station_pressure_quality>
# The third field is the station pressure: the atmospheric pressure at the observation point.
#   MIN: 04500  MAX: 10900  UNITS: Hectopascals
#   SCALING FACTOR: 10
#   99999 = Missing.
# NOTE(review): this is station pressure stored under the name pressureMeanSeaLevel;
# ISD appears to report sea-level pressure separately (SLP) - confirm which is intended.

ma1_fields = raw_data['MA1'].str.split(",", expand=True)
station_pressure_hpa = ma1_fields[2].astype(float).div(10)

# 9999.9 is the missing marker after scaling; blank it out, then interpolate
# linearly across the gaps (forward direction only, so leading gaps stay empty).
data['pressureMeanSeaLevel'] = (
    station_pressure_hpa
    .replace(9999.9, np.nan)
    .interpolate(method='linear', limit_direction='forward')
)

In [9]:
# TMP - AIR-TEMPERATURE-OBSERVATION air temperature
# Field order: <temperature>,<temperature_quality>
# The temperature of the air.
#   MIN: -0932  MAX: +0618  UNITS: Degrees Celsius
#   SCALING FACTOR: 10
#   +9999 = Missing.

tmp_fields = raw_data['TMP'].str.split(",", expand=True)
air_temperature_c = tmp_fields[0].astype(float).div(10)

# 999.9 is the missing marker after scaling; blank it out, then interpolate
# linearly across the gaps (forward direction only, so leading gaps stay empty).
data['temperature'] = (
    air_temperature_c
    .replace(999.9, np.nan)
    .interpolate(method='linear', limit_direction='forward')
)

In [10]:
# DEW - AIR-TEMPERATURE-OBSERVATION dew point temperature
# Field order: <dew_point>,<dew_point_quality>
# The temperature to which a given parcel of air must be cooled at constant pressure
# and water vapor content in order for saturation to occur.
#   MIN: -0982  MAX: +0368  UNITS: Degrees Celsius
#   SCALING FACTOR: 10
#   +9999 = Missing

dew_fields = raw_data['DEW'].str.split(",", expand=True)
dew_point_c = dew_fields[0].astype(float).div(10)

# 999.9 is the missing marker after scaling; blank it out, then interpolate
# linearly across the gaps (forward direction only, so leading gaps stay empty).
data['temperatureDewPoint'] = (
    dew_point_c
    .replace(999.9, np.nan)
    .interpolate(method='linear', limit_direction='forward')
)

In [11]:
# VIS - VISIBILITY-OBSERVATION distance dimension
# Field order: <distance>,<distance_quality>,<variability>,<variability_quality>
# The horizontal distance at which an object can be seen and identified.
#   MIN: 000000  MAX: 160000  UNITS: Meters
#   Missing = 999999
#   NOTE: Values greater than 160000 are entered as 160000

vis_fields = raw_data['VIS'].str.split(",", expand=True)
visibility_km = vis_fields[0].astype(float).div(1000)  # Convert to km

# 999.999 is the missing marker after the metre -> km conversion; blank it out, then
# interpolate linearly across the gaps (forward direction only, so leading gaps stay empty).
data['visibility'] = (
    visibility_km
    .replace(999.999, np.nan)
    .interpolate(method='linear', limit_direction='forward')
)

In [12]:
# WND - WIND-OBSERVATION: <direction_angle>,<direction_quality>,<type>,<speed>,<speed_quality>
wnd_fields = raw_data['WND'].str.split(",", expand=True)


def _fill_wind_direction_gaps(direction_deg):
    """Fill missing compass directions along the shortest arc between observations.

    Plain linear interpolation treats degrees as an ordinary number line, so a gap
    between 350 and 10 would be filled with values near 180 (south) instead of
    values near north. Here the observed directions are first "unwrapped" so that
    consecutive observations never jump across the 0/360 boundary, the gaps are
    interpolated on that continuous scale, and the filled values are wrapped back
    onto the compass.

    Parameters
    ----------
    direction_deg : pd.Series
        Directions in degrees, with NaN marking missing entries.

    Returns
    -------
    pd.Series
        Same index and name as the input. Observed values are returned unchanged;
        filled values lie in (0, 360]. Leading gaps take the first observed value
        and trailing gaps take the last one. If nothing was observed, the result
        is all NaN.
    """
    values = direction_deg.to_numpy(dtype=float)
    observed = ~np.isnan(values)
    if not observed.any():
        # Nothing to anchor the interpolation on; leave everything missing.
        return direction_deg.astype(float)

    # Re-express each observation as the previous one plus the signed shortest
    # step in [-180, 180), so consecutive values never jump across 0/360.
    observed_values = values[observed]
    shortest_steps = (np.diff(observed_values) + 180.0) % 360.0 - 180.0
    unwrapped = values.copy()
    unwrapped[observed] = observed_values[0] + np.concatenate(([0.0], np.cumsum(shortest_steps)))

    # Positional (default-index) Series on purpose: the timestamp index may still
    # contain duplicates at this stage, and linear interpolation ignores the index.
    filled = (
        pd.Series(unwrapped)
        .interpolate(method='linear', limit_direction='forward')
        .bfill()  # If the first values are empty, fill them in
        .to_numpy()
    )

    # Wrap back onto the compass, keeping the documented 001-360 range (360, not 0, for north).
    wrapped = filled % 360.0
    wrapped[wrapped == 0.0] = 360.0

    # Only the gaps take the filled values; real observations are passed through untouched.
    return pd.Series(
        np.where(observed, values, wrapped),
        index=direction_deg.index,
        name=direction_deg.name,
    )


# WIND-OBSERVATION direction angle
# The angle, measured in a clockwise direction, between true north and the direction from which the wind is blowing.
# MIN: 001 MAX: 360 UNITS: Angular Degrees
# SCALING FACTOR: 1
# 999 = Missing. If type code = V, then 999 indicates variable wind direction.
wind_direction_deg = wnd_fields[0].astype(float).replace(999.0, np.nan)
data['windDirection'] = _fill_wind_direction_gaps(wind_direction_deg)

# WIND-OBSERVATION speed rate
# The rate of horizontal travel of air past a fixed point.
# MIN: 0000 MAX: 0900 UNITS: meters per second
# SCALING FACTOR: 10
# 9999 = Missing
#
# Drop the missing marker while the value is still the raw integer code, rather than
# matching a derived float (3599.64) by equality after the unit conversion.
wind_speed_code = wnd_fields[3].astype(float).replace(9999, np.nan)
wind_speed_kmh = wind_speed_code / 10 * 3600 / 1000  # Convert to km/h

# Interpolate any missing values (forward direction only, so leading gaps stay empty).
data['windSpeed'] = wind_speed_kmh.interpolate(method='linear', limit_direction='forward')

## Normalize Timestamps
Resample the data to a normalized once-per-hour, on-the-hour timestamp.

In [13]:
# Keep only the first row for each repeated timestamp.
repeated_timestamp = data.index.duplicated(keep='first')
data = data[~repeated_timestamp]

# Put the observations on an hourly grid anchored at 2018-01-01 00:00 and
# forward-fill each slot.
hourly_grid = data.resample(rule='1h', origin=datetime(2018, 1, 1, 0, 0))
resampled_data = hourly_grid.ffill()

In [14]:
# Export Data: write the hourly frame (timestamp index included) to CSV.
hourly_csv_path = "../Project_data/Normalized Data/noaa_sunnyvale_weather_hourly_2018.csv"
resampled_data.to_csv(hourly_csv_path)