In [1]:
!rm clean_hawaii_measurements.csv clean_hawaii_stations.csv

In [2]:
# Dependencies
import pandas as pd
import numpy as np

In [3]:
# Load the raw measurement and station tables that the rest of the notebook cleans.
measures, stations = (
    pd.read_csv(csv_name)
    for csv_name in ('hawaii_measurements.csv', 'hawaii_stations.csv')
)

In [4]:
# Preview the first five measurement rows.
measures.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
# Range check: largest 'prcp' value in the measurements.
measures.prcp.max()

11.529999999999999

In [6]:
# Range check: smallest 'prcp' value in the measurements.
measures.prcp.min()

0.0

In [7]:
# Range check: largest 'tobs' value in the measurements.
measures.tobs.max()

87

In [8]:
# Range check: smallest 'tobs' value in the measurements.
measures.tobs.min()

53

In [9]:
# Preview the first five station rows.
stations.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [10]:
# Range check: northernmost station latitude.
stations.latitude.max()

21.5213

In [11]:
# Range check: southernmost station latitude.
stations.latitude.min()

21.271599999999999

In [12]:
# Range check: largest station longitude.
stations.longitude.max()

-157.71138999999999

In [13]:
# Range check: smallest station longitude.
stations.longitude.min()

-158.0111

In [14]:
# Range check: highest station elevation.
stations.elevation.max()

306.60000000000002

In [15]:
# Range check: lowest station elevation.
stations.elevation.min()

0.90000000000000002

In [16]:
# Median elevation, for comparison against the min/max.
stations.elevation.median()

14.6

In [17]:
# A couple of stations look like outliers by elevation; list those above 40.
outlier_stations_by_elevation = stations[stations['elevation'].gt(40)]
outlier_stations_by_elevation
# These elevations were checked against Google Maps using the coordinates and are valid.

Unnamed: 0,station,name,latitude,longitude,elevation
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [18]:
# The remaining stations: elevation of 40 or below.
majority_stations_by_elevation = stations[stations['elevation'].le(40)]
majority_stations_by_elevation

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9


In [19]:
def find_columns_with_missing_values(df):
    """Return the labels of the columns in ``df`` that hold at least one null.

    The labels are returned as a list in column order; the list is empty
    when no value is missing.
    """
    # One boolean per column: True when that column contains any null.
    null_flags = df.isnull().any()
    return [label for label, has_null in null_flags.items() if has_null]

In [20]:
# List the stations columns, if any, that contain missing values.
find_columns_with_missing_values(df=stations)
# Result: an empty list, so stations has no missing values.

[]

In [21]:
# Stations has no missing data, so write it out unchanged as the "clean" csv.
stations.to_csv(path_or_buf='clean_hawaii_stations.csv', index=False)

In [22]:
# List the measures columns, if any, that contain missing values.
find_columns_with_missing_values(df=measures)
# Result: only the 'prcp' column has missing values.

['prcp']

In [23]:
# Inspect the rows whose 'prcp' is missing to judge whether they look like
# duplicates or data entered in error.
rows_with_missing_prcp = measures.loc[measures['prcp'].isnull()]
rows_with_missing_prcp.head(n=5)
# The sampled rows look valid apart from 'prcp' being NaN.

Unnamed: 0,station,date,prcp,tobs
4,USC00519397,2010-01-06,,73
26,USC00519397,2010-01-30,,70
29,USC00519397,2010-02-03,,67
43,USC00519397,2010-02-19,,63
61,USC00519397,2010-03-11,,73


In [24]:
# Count the missing-'prcp' rows per station to see whether they are concentrated
# in one station.
groupbystations = (
    measures.loc[measures['prcp'].isnull()]
    .groupby('station')
    .size()
)
groupbystations
# Author's reading: missing prcp values are spread across stations much like the
# data as a whole (compare with the per-station totals in the next cell).
# NOTE(review): the displayed counts are uneven - USC00517948 has 689 missing rows
# out of 1372 in total, while USC00519281 has none - so confirm this conclusion.

station
USC00511918     47
USC00513117     13
USC00514830    265
USC00516128    128
USC00517948    689
USC00518838    169
USC00519397     39
USC00519523     97
dtype: int64

In [25]:
# Per-station row counts for the full measurements table, for comparison.
groupbystations = measures.groupby('station').size()
groupbystations

station
USC00511918    1979
USC00513117    2709
USC00514830    2202
USC00516128    2612
USC00517948    1372
USC00518838     511
USC00519281    2772
USC00519397    2724
USC00519523    2669
dtype: int64

In [26]:
# Count the missing-'prcp' rows per year to see whether they are concentrated in
# one year.
# NOTE(review): m is a second name for the measures frame, not a copy, so the
# 'year' helper column added here also appears on measures - confirm that is intended.
m = measures
m['year'] = m['date'].str.slice(stop=4)
groupbyyear = m.loc[m['prcp'].isnull()].groupby('year').size()
groupbyyear
# Missing prcp values are spread over several years, similar to the per-year
# totals for all data (see next cell).

year
2010    103
2011    168
2012    170
2013    196
2014    195
2015    245
2016    240
2017    130
dtype: int64

In [27]:
# Per-year row counts for the full measurements table, for comparison.
m['year'] = m['date'].str.slice(stop=4)
groupbyyear = m.groupby('year').size()
groupbyyear

year
2010    2784
2011    2733
2012    2640
2013    2670
2014    2597
2015    2420
2016    2309
2017    1397
dtype: int64

In [28]:
# Count the missing-'prcp' rows per (year, month) to see whether they are
# concentrated in one month.
m['month'] = m['date'].str.slice(5, 7)
groupbyyearmonth = m.loc[m['prcp'].isnull()].groupby(['year', 'month']).size()
groupbyyearmonth
# Missing prcp values are spread over 'year' and 'month', similar to the
# 'year'/'month' totals for all data (see next cell).

year  month
2010  01        5
      02        9
      03       11
      04        9
      05       12
      06       10
      07        9
      08        7
      09        7
      10        6
      11       13
      12        5
2011  01        7
      02       23
      03       17
      04        8
      05       18
      06       15
      07       19
      08       16
      09       10
      10       14
      11       13
      12        8
2012  01        8
      02       17
      03       13
      04       20
      05       13
      06       17
               ..
2015  03       20
      04       14
      05        8
      06       15
      07       25
      08       29
      09       21
      10       28
      11       25
      12       22
2016  01       14
      02       25
      03       23
      04       16
      05       23
      06       21
      07       24
      08       21
      09       17
      10       19
      11       15
      12       22
2017  01       21
      02       1

In [29]:
# Per-(year, month) row counts for the full measurements table, for comparison.
groupbyyearmonth = m.groupby(['year', 'month']).size()
groupbyyearmonth

year  month
2010  01       201
      02       178
      03       227
      04       226
      05       246
      06       242
      07       235
      08       259
      09       248
      10       252
      11       235
      12       235
2011  01       245
      02       221
      03       231
      04       238
      05       246
      06       227
      07       223
      08       223
      09       208
      10       227
      11       221
      12       223
2012  01       241
      02       226
      03       222
      04       226
      05       214
      06       211
              ... 
2015  03       205
      04       192
      05       209
      06       203
      07       184
      08       205
      09       205
      10       217
      11       197
      12       184
2016  01       184
      02       187
      03       196
      04       192
      05       194
      06       194
      07       187
      08       201
      09       190
      10       198
      11       186


In [30]:
# Evaluate the percentage of rows with missing 'prcp' relative to all rows.
# Count of rows with missing prcp data (1447 in the recorded run).
rows_with_missing_prcp = measures[measures['prcp'].isnull()].shape[0]
# Rows having prcp values (18103 in the recorded run). dropna is restricted to
# 'prcp' so that a null in any other column cannot shrink this count.
rows_with_prcp = measures.dropna(subset=['prcp'])
# Percentage of measures rows with missing prcp data (about 7.4% in the recorded run).
rows_with_missing_prcp/(rows_with_prcp.shape[0] + rows_with_missing_prcp)*100

7.40153452685422

In [31]:
# Since the % of rows with missing prcp values is fairly substantial and judged to be
# spread throughout the data, the missing prcp values are replaced with the overall
# prcp mean instead of removing those rows or filling with 0. This keeps the tobs
# average and leaves the overall prcp average unchanged.
# The fill is restricted to the 'prcp' column so that a null in any other column is
# never overwritten with a precipitation value.
# NOTE(review): helper columns added to measures earlier (e.g. 'year' and 'month',
# through the alias m) are written to the csv as well - confirm that is intended.
measures = measures.fillna({'prcp': measures['prcp'].mean()})
measures.to_csv('clean_hawaii_measurements.csv', index=False)