In [37]:
import numpy as np
import pandas as pd

In [39]:
# import weather dataset and select relevant columns
weather = pd.read_csv('data-raw/weather.csv', index_col=False)
# Select columns we feel might have impact on data, also columns that have less than 50% 
weather = weather[['DATE', 'HourlyAltimeterSetting', 'HourlyDewPointTemperature', 'HourlyDryBulbTemperature', 
                   'HourlySeaLevelPressure', 'HourlyStationPressure', 'HourlyPressureChange', 'HourlyPressureTendency', 
                   'HourlyVisibility', 'HourlyWindSpeed', 'HourlySkyConditions', 'HourlyPrecipitation', 
                   'HourlyWetBulbTemperature', 'HourlyWindDirection', 'HourlyRelativeHumidity', 'REPORT_TYPE']]


# handle non-numeric values: change those values to NaN and forward fill later
weather.HourlyStationPressure = pd.to_numeric(weather.HourlyStationPressure, errors='coerce')
weather.HourlyVisibility = pd.to_numeric(weather.HourlyVisibility, errors='coerce')
weather.HourlyPrecipitation = pd.to_numeric(weather.HourlyPrecipitation, errors='coerce')
weather.HourlyWindDirection = pd.to_numeric(weather.HourlyWindDirection, errors='coerce')
weather.HourlyAltimeterSetting = pd.to_numeric(weather.HourlyAltimeterSetting, errors='coerce')
weather.HourlySeaLevelPressure = pd.to_numeric(weather.HourlySeaLevelPressure, errors='coerce')

# convert na values to 0 (no pressure change)
weather.HourlyPressureChange = np.nan_to_num(weather.HourlyPressureChange)
weather.HourlyPressureTendency = np.nan_to_num(weather.HourlyPressureTendency)

# transform sky conditions to two dummy columns: cloud_1 == 1 for overcast sky, cloud_0 == 1 for clear sky
weather['cloud_1'] = 0
weather['cloud_0'] = 1
weather.HourlySkyConditions = weather.HourlySkyConditions.fillna(method='ffill')
for i in range(0, len(weather)):
    v = weather.HourlySkyConditions[i]
    if ':08 ' in v or ':09 ' in v or ':10 ' in v:
        weather.cloud_1[i] = 1
        weather.cloud_0[i] = 0


# transform wind direction to four dummie columns: wind_north, wind_east, wind_south, wind_west
weather['wind_north'] = 0
weather['wind_east'] = 0
weather['wind_south'] = 0
weather['wind_west'] = 0
weather.HourlyWindDirection = weather.HourlyWindDirection.fillna(method='ffill')
for i in range(0, len(weather)):
    d = weather.HourlyWindDirection[i]
    if d > 315 or d <= 45:
        weather.wind_north[i] = 1
    elif d > 225 and d <= 315:
        weather.wind_west[i] = 1
    elif d >135 and d <= 225:
        weather.wind_south[i] = 1
    else:
        weather.wind_east[i] = 1

# Get rid of multiple weather readings per hour by using one reporting type
weather = weather[weather.REPORT_TYPE == 'FM-15']

# forward fill the other columns, assume weather doesn't change in an hour
weather = weather.fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [43]:
# Create new variable in format yyyymmddhh to allow join with flight data
weather['join_time'] = weather['DATE'].astype('str')
weather['join_time'] = weather['join_time'].str.replace('T', '')
weather['join_time'] = weather['join_time'].str.replace('-', '')
weather['join_time'] = weather.join_time.str[:-6]

Unnamed: 0,DATE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlySeaLevelPressure,HourlyStationPressure,HourlyPressureChange,HourlyPressureTendency,HourlyVisibility,HourlyWindSpeed,...,HourlyWindDirection,HourlyRelativeHumidity,REPORT_TYPE,cloud_1,cloud_0,wind_north,wind_east,wind_south,wind_west,join_time
0,2017-12-01T00:53:00,30.14,40.0,45.0,30.16,29.67,-0.0,3.0,10.0,13.0,...,200.0,83.0,FM-15,0,1,0,0,1,0,2017120100
1,2017-12-01T01:53:00,30.13,39.0,44.0,30.16,29.66,0.0,0.0,10.0,10.0,...,180.0,83.0,FM-15,0,1,0,0,1,0,2017120101
2,2017-12-01T02:53:00,30.12,39.0,43.0,30.15,29.65,0.0,0.0,10.0,8.0,...,150.0,86.0,FM-15,0,1,0,0,1,0,2017120102
3,2017-12-01T03:53:00,30.11,38.0,42.0,30.13,29.64,0.03,8.0,10.0,9.0,...,130.0,85.0,FM-15,0,1,0,1,0,0,2017120103
5,2017-12-01T04:53:00,30.09,37.0,41.0,30.11,29.62,0.0,0.0,10.0,8.0,...,140.0,86.0,FM-15,0,1,0,0,1,0,2017120104


In [44]:
# Save to .csv to use to combine with weather data
weather.to_csv('data/weather_cleaned.csv', index=False)